author     Jocelyn Turcotte <jocelyn.turcotte@digia.com>  2014-08-08 14:30:41 +0200
committer  Jocelyn Turcotte <jocelyn.turcotte@digia.com>  2014-08-12 13:49:54 +0200
commit     ab0a50979b9eb4dfa3320eff7e187e41efedf7a9 (patch)
tree       498dfb8a97ff3361a9f7486863a52bb4e26bb898 /chromium/third_party/libvpx/source/libvpx/vp9/common
parent     4ce69f7403811819800e7c5ae1318b2647e778d1 (diff)
Update Chromium to beta version 37.0.2062.68
Change-Id: I188e3b5aff1bec75566014291b654eb19f5bc8ca
Reviewed-by: Andras Becsi <andras.becsi@digia.com>
Diffstat (limited to 'chromium/third_party/libvpx/source/libvpx/vp9/common')
110 files changed, 16771 insertions, 7396 deletions
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm index b1fd21bb61f..b1fd21bb61f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm index a13c0d04b83..a13c0d04b83 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm new file mode 100644 index 00000000000..d290d07531c --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm @@ -0,0 +1,144 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + EXPORT |vp9_idct32x32_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ;TODO(hkuang): put the following macros in a seperate + ;file so other idct function could also use them. 
+ MACRO + LD_16x8 $src, $stride + vld1.8 {q8}, [$src], $stride + vld1.8 {q9}, [$src], $stride + vld1.8 {q10}, [$src], $stride + vld1.8 {q11}, [$src], $stride + vld1.8 {q12}, [$src], $stride + vld1.8 {q13}, [$src], $stride + vld1.8 {q14}, [$src], $stride + vld1.8 {q15}, [$src], $stride + MEND + + MACRO + ADD_DIFF_16x8 $diff + vqadd.u8 q8, q8, $diff + vqadd.u8 q9, q9, $diff + vqadd.u8 q10, q10, $diff + vqadd.u8 q11, q11, $diff + vqadd.u8 q12, q12, $diff + vqadd.u8 q13, q13, $diff + vqadd.u8 q14, q14, $diff + vqadd.u8 q15, q15, $diff + MEND + + MACRO + SUB_DIFF_16x8 $diff + vqsub.u8 q8, q8, $diff + vqsub.u8 q9, q9, $diff + vqsub.u8 q10, q10, $diff + vqsub.u8 q11, q11, $diff + vqsub.u8 q12, q12, $diff + vqsub.u8 q13, q13, $diff + vqsub.u8 q14, q14, $diff + vqsub.u8 q15, q15, $diff + MEND + + MACRO + ST_16x8 $dst, $stride + vst1.8 {q8}, [$dst], $stride + vst1.8 {q9}, [$dst], $stride + vst1.8 {q10},[$dst], $stride + vst1.8 {q11},[$dst], $stride + vst1.8 {q12},[$dst], $stride + vst1.8 {q13},[$dst], $stride + vst1.8 {q14},[$dst], $stride + vst1.8 {q15},[$dst], $stride + MEND + +;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride + +|vp9_idct32x32_1_add_neon| PROC + push {lr} + pld [r1] + add r3, r1, #16 ; r3 dest + 16 for second loop + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 6) + add r0, r0, #32 ; + (1 <<((6) - 1)) + asrs r0, r0, #6 ; >> 6 + bge diff_positive_32_32 + +diff_negative_32_32 + neg r0, r0 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +diff_negative_32_32_loop + sub r0, #1 + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r12, r2 + + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r12, r2 + cmp r0, #2 + moveq r1, r3 + moveq r12, r3 + cmp r0, #0 + bne diff_negative_32_32_loop + pop {pc} + +diff_positive_32_32 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +diff_positive_32_32_loop + sub r0, #1 + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r12, r2 + + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r12, r2 + cmp r0, #2 + moveq r1, r3 + moveq r12, r3 + cmp r0, #0 + bne diff_positive_32_32_loop + pop {pc} + + ENDP ; |vp9_idct32x32_1_add_neon| + END diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm index f00d0277f92..72e933eee96 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm @@ -72,7 +72,7 @@ cospi_31_64 EQU 804 ; reg1 = output[first_offset] ; reg2 = output[second_offset] ; for proper address calculation, the last offset used when manipulating - ; output, wethere reading or storing) must be passed in. use 0 for first + ; output, whether reading or storing) must be passed in. use 0 for first ; use. 
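For reference, the DC-only transform added above in vp9_idct32x32_1_add_neon.asm reduces to the following scalar C sketch, mirroring the formulas in the assembly comments (cospi_16_64 = 11585 and DCT_CONST_BITS = 14 as stated in the code; the function name and clipping logic are illustrative, not from the diff):

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Scalar sketch of vp9_idct32x32_1_add_neon: a DC-only inverse transform
 * where every output pixel receives the same rounded offset a1. */
static void idct32x32_1_add_sketch(const int16_t *input, uint8_t *dest,
                                   int dest_stride) {
  const int cospi_16_64 = 11585;
  int out = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out * cospi_16_64, DCT_CONST_BITS);
  const int a1 = ROUND_POWER_OF_TWO(out, 6);
  for (int r = 0; r < 32; ++r, dest += dest_stride) {
    for (int c = 0; c < 32; ++c) {
      const int v = dest[c] + a1;  /* saturating vqadd.u8/vqsub.u8 path */
      dest[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
  }
}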
MACRO LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 @@ -88,7 +88,7 @@ cospi_31_64 EQU 804 ; output[first_offset] = reg1 ; output[second_offset] = reg2 ; for proper address calculation, the last offset used when manipulating - ; output, wethere reading or storing) must be passed in. use 0 for first + ; output, whether reading or storing) must be passed in. use 0 for first ; use. MACRO STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 @@ -242,7 +242,7 @@ cospi_31_64 EQU 804 ; TODO(cd): have special case to re-use constants when they are similar for ; consecutive butterflies ; TODO(cd): have special case when both constants are the same, do the - ; additions/substractions before the multiplies. + ; additions/subtractions before the multiplies. ; generate the constants ; generate scalar constants mov r8, #$first_constant & 0xFF00 @@ -260,7 +260,7 @@ cospi_31_64 EQU 804 vmull.s16 q11, $regB, d31 vmull.s16 q12, $regC, d31 ; (used) five for intermediate (q8-q12), one for constants (q15) - ; do some addition/substractions (to get back two register) + ; do some addition/subtractions (to get back two register) vsub.s32 q8, q8, q10 vsub.s32 q9, q9, q11 ; do more multiplications (ordered for maximum latency hiding) @@ -268,7 +268,7 @@ cospi_31_64 EQU 804 vmull.s16 q11, $regA, d30 vmull.s16 q15, $regB, d30 ; (used) six for intermediate (q8-q12, q15) - ; do more addition/substractions + ; do more addition/subtractions vadd.s32 q11, q12, q11 vadd.s32 q10, q10, q15 ; (used) four for intermediate (q8-q11) @@ -1145,7 +1145,7 @@ idct32_bands_end_1st_pass ; pass loop processing add r5, r5, #1 - B idct32_pass_loop + b idct32_pass_loop idct32_bands_end_2nd_pass STORE_COMBINE_CENTER_RESULTS diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm index 0d4a721c4d3..0d4a721c4d3 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm index 00283fc8d78..00283fc8d78 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm index 421d202d403..421d202d403 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm index 54764008bdf..ab5bb69202a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm @@ -9,7 +9,7 @@ ; EXPORT |vp9_idct8x8_64_add_neon| - EXPORT |vp9_idct8x8_10_add_neon| + 
EXPORT |vp9_idct8x8_12_add_neon| ARM REQUIRE8 PRESERVE8 @@ -310,13 +310,13 @@ bx lr ENDP ; |vp9_idct8x8_64_add_neon| -;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_idct8x8_10_add_neon| PROC +|vp9_idct8x8_12_add_neon| PROC push {r4-r9} vpush {d8-d15} vld1.s16 {q8,q9}, [r0]! @@ -514,6 +514,6 @@ vpop {d8-d15} pop {r4-r9} bx lr - ENDP ; |vp9_idct8x8_10_add_neon| + ENDP ; |vp9_idct8x8_12_add_neon| END diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm index 2f326e24c9e..2f326e24c9e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm index 93d3af3011c..b41f5661b80 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm @@ -576,6 +576,7 @@ vld1.s16 {q14,q15}, [r0]! push {r0-r10} + vpush {d8-d15} ; transpose the input data TRANSPOSE8X8 @@ -636,6 +637,7 @@ iadst_iadst IADST8X8_1D end_vp9_iht8x8_64_add_neon + vpop {d8-d15} pop {r0-r10} ; ROUND_POWER_OF_TWO(temp_out[j], 5) diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm new file mode 100644 index 00000000000..5b8ec20287d --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm @@ -0,0 +1,199 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + EXPORT |vp9_lpf_horizontal_4_dual_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|vp9_lpf_horizontal_4_dual_neon| PROC + push {lr} + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + add r1, r1, r1 ; double pitch + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, r1, lsl #1 ; s[-4 * p] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + add r3, r2, r1, lsr #1 ; s[-3 * p] + + vld1.u8 {q3}, [r2@64], r1 ; p3 + vld1.u8 {q4}, [r3@64], r1 ; p2 + vld1.u8 {q5}, [r2@64], r1 ; p1 + vld1.u8 {q6}, [r3@64], r1 ; p0 + vld1.u8 {q7}, [r2@64], r1 ; q0 + vld1.u8 {q8}, [r3@64], r1 ; q1 + vld1.u8 {q9}, [r2@64] ; q2 + vld1.u8 {q10}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl vp9_loop_filter_neon_16 + + vst1.u8 {q5}, [r2@64], r1 ; store op1 + vst1.u8 {q6}, [r3@64], r1 ; store op0 + vst1.u8 {q7}, [r2@64], r1 ; store oq0 + vst1.u8 {q8}, [r3@64], r1 ; store oq1 + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |vp9_lpf_horizontal_4_dual_neon| + +; void vp9_loop_filter_neon_16(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. This function uses +; registers d8-d15, so the calling function must save those registers. 
+; +; r0-r3, r12 PRESERVE +; q0 blimit +; q1 limit +; q2 thresh +; q3 p3 +; q4 p2 +; q5 p1 +; q6 p0 +; q7 q0 +; q8 q1 +; q9 q2 +; q10 q3 +; +; Outputs: +; q5 op1 +; q6 op0 +; q7 oq0 +; q8 oq1 +|vp9_loop_filter_neon_16| PROC + + ; filter_mask + vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) + vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) + vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) + vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) + vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) + vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) + vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) + + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) + + vmov.u8 q10, #0x80 + + vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) + + vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 + + veor q7, q7, q10 ; qs0 + + vcge.u8 q15, q1, q15 ; abs(m11) > limit + + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 + + vmov.u16 q4, #3 + + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vcge.u8 q9, q0, q9 ; a > blimit + + vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) + vorr q14, q13, q14 ; hev + + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; filter &= hev + vand q15, q15, q9 ; mask + + vmov.u8 q4, #3 + + vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) + vaddw.s8 q11, q11, d3 + + vmov.u8 q9, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d2, q2 + vqmovn.s16 d3, q11 + vand q1, q1, q15 ; filter &= mask + + vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) + vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) + vshr.s8 q2, q2, #3 ; filter2 >>= 3 + vshr.s8 q1, q1, #3 ; filter1 >>= 3 + + + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) + vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 + + veor q7, q0, q10 ; *oq0 = u^0x80 + + vbic q1, q1, q14 ; filter &= ~hev + + vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) + vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) + + veor q6, q11, q10 ; *op0 = u^0x80 + veor q5, q13, q10 ; *op1 = u^0x80 + veor q8, q12, q10 ; *oq1 = u^0x80 + + bx lr + ENDP ; |vp9_loop_filter_neon_16| + + END diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c new file mode 100644 index 00000000000..0820db2477b --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" + +void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_8(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_8(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh); + vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm index 8b4fe5dccf6..4430322171d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm @@ -8,10 +8,10 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_loop_filter_horizontal_edge_neon| - EXPORT |vp9_loop_filter_vertical_edge_neon| - EXPORT |vp9_mbloop_filter_horizontal_edge_neon| - EXPORT |vp9_mbloop_filter_vertical_edge_neon| + EXPORT |vp9_lpf_horizontal_4_neon| + EXPORT |vp9_lpf_vertical_4_neon| + EXPORT |vp9_lpf_horizontal_8_neon| + EXPORT |vp9_lpf_vertical_8_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 @@ -21,12 +21,12 @@ ; TODO(fgalligan): See about removing the count code as this function is only ; called with a count of 1. ; -; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; void vp9_lpf_horizontal_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; ; r0 uint8_t *s, ; r1 int p, /* pitch */ @@ -34,7 +34,7 @@ ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, ; sp+4 int count -|vp9_loop_filter_horizontal_edge_neon| PROC +|vp9_lpf_horizontal_4_neon| PROC push {lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit @@ -77,19 +77,19 @@ count_lf_h_loop end_vp9_lf_h_edge pop {pc} - ENDP ; |vp9_loop_filter_horizontal_edge_neon| + ENDP ; |vp9_lpf_horizontal_4_neon| ; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter ; works on 16 iterations at a time. ; TODO(fgalligan): See about removing the count code as this function is only ; called with a count of 1. 
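The per-pixel math that vp9_loop_filter_neon_16 above applies to sixteen columns at once can be sketched in scalar C as follows (mask and hev are taken to be 0 or -1, as the vector compares produce; helper names are illustrative):

#include <stdint.h>

/* Saturate an int to the int8_t range, like vqadd.s8/vqsub.s8. */
static int8_t signed_char_clamp(int v) {
  return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v);
}

/* Scalar sketch of the 4-tap filter in vp9_loop_filter_neon_16 for one
 * pixel column; mask and hev are 0 or -1 (all bits set). */
static void filter4_sketch(int8_t mask, int8_t hev,
                           uint8_t *p1, uint8_t *p0,
                           uint8_t *q0, uint8_t *q1) {
  /* Bias to the signed range, as the veor with 0x80 does. */
  const int8_t ps1 = (int8_t)(*p1 ^ 0x80), ps0 = (int8_t)(*p0 ^ 0x80);
  const int8_t qs0 = (int8_t)(*q0 ^ 0x80), qs1 = (int8_t)(*q1 ^ 0x80);

  int8_t filter = signed_char_clamp(ps1 - qs1) & hev;          /* filter &= hev  */
  filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; /* filter &= mask */

  const int8_t filter1 = signed_char_clamp(filter + 4) >> 3;
  const int8_t filter2 = signed_char_clamp(filter + 3) >> 3;

  *q0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);    /* oq0 */
  *p0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);    /* op0 */

  /* Outer taps use a rounded half of filter1 and are suppressed on
   * high-edge-variance pixels (vrshr.s8 #1, then bic with hev). */
  filter = (int8_t)(((filter1 + 1) >> 1) & ~hev);
  *q1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);     /* oq1 */
  *p1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);     /* op1 */
}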
; -; void vp9_loop_filter_vertical_edge_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; void vp9_lpf_vertical_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; ; r0 uint8_t *s, ; r1 int p, /* pitch */ @@ -97,7 +97,7 @@ end_vp9_lf_h_edge ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, ; sp+4 int count -|vp9_loop_filter_vertical_edge_neon| PROC +|vp9_lpf_vertical_4_neon| PROC push {lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit @@ -158,7 +158,7 @@ count_lf_v_loop end_vp9_lf_v_edge pop {pc} - ENDP ; |vp9_loop_filter_vertical_edge_neon| + ENDP ; |vp9_lpf_vertical_4_neon| ; void vp9_loop_filter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the @@ -276,18 +276,18 @@ end_vp9_lf_v_edge bx lr ENDP ; |vp9_loop_filter_neon| -; void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, ; sp+4 int count -|vp9_mbloop_filter_horizontal_edge_neon| PROC +|vp9_lpf_horizontal_8_neon| PROC push {r4-r5, lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit @@ -333,14 +333,14 @@ count_mblf_h_loop end_vp9_mblf_h_edge pop {r4-r5, pc} - ENDP ; |vp9_mbloop_filter_horizontal_edge_neon| + ENDP ; |vp9_lpf_horizontal_8_neon| -; void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s, -; int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; void vp9_lpf_vertical_8_neon(uint8_t *s, +; int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; ; r0 uint8_t *s, ; r1 int pitch, @@ -348,7 +348,7 @@ end_vp9_mblf_h_edge ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, ; sp+4 int count -|vp9_mbloop_filter_vertical_edge_neon| PROC +|vp9_lpf_vertical_8_neon| PROC push {r4-r5, lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit @@ -420,7 +420,7 @@ count_mblf_v_loop end_vp9_mblf_v_edge pop {r4-r5, pc} - ENDP ; |vp9_mbloop_filter_vertical_edge_neon| + ENDP ; |vp9_lpf_vertical_8_neon| ; void vp9_mbloop_filter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm index 2e8001b918b..5fe2bba4644 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm @@ -8,23 +8,23 @@ ; be found in the AUTHORS file in the root of the source tree. 
; - EXPORT |vp9_mb_lpf_horizontal_edge_w_neon| - EXPORT |vp9_mb_lpf_vertical_edge_w_neon| + EXPORT |vp9_lpf_horizontal_16_neon| + EXPORT |vp9_lpf_vertical_16_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 -; void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh -; int count) +; void vp9_lpf_horizontal_16_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh +; int count) ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -|vp9_mb_lpf_horizontal_edge_w_neon| PROC +|vp9_lpf_horizontal_16_neon| PROC push {r4-r8, lr} vpush {d8-d15} ldr r4, [sp, #88] ; load thresh @@ -115,18 +115,18 @@ h_next vpop {d8-d15} pop {r4-r8, pc} - ENDP ; |vp9_mb_lpf_horizontal_edge_w_neon| + ENDP ; |vp9_lpf_horizontal_16_neon| -; void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) +; void vp9_lpf_vertical_16_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -|vp9_mb_lpf_vertical_edge_w_neon| PROC +|vp9_lpf_vertical_16_neon| PROC push {r4-r8, lr} vpush {d8-d15} ldr r4, [sp, #88] ; load thresh @@ -279,7 +279,7 @@ v_end vpop {d8-d15} pop {r4-r8, pc} - ENDP ; |vp9_mb_lpf_vertical_edge_w_neon| + ENDP ; |vp9_lpf_vertical_16_neon| ; void vp9_wide_mbfilter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the @@ -439,6 +439,9 @@ v_end tst r7, #1 bxne lr + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #2 ; Only do mbfilter branch + ; mbfilter flat && mask branch ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's ; and using vibt on the q's? diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm new file mode 100644 index 00000000000..dc9856fa887 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm @@ -0,0 +1,634 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
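The new vp9_reconintra_neon.asm below adds vertical, horizontal, and TrueMotion (tm) intra predictors. Per the comments in the tm routines, each output pixel is left[r] + (above[c] - above[-1]), clipped to 8 bits; a scalar sketch of that rule (the block-size parameter and names are illustrative):

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* Scalar sketch of the TrueMotion predictor the NEON code vectorizes;
 * bs is 4, 8, 16 or 32 to match the four tm routines below. */
static void tm_predictor_sketch(uint8_t *dst, ptrdiff_t y_stride, int bs,
                                const uint8_t *above, const uint8_t *left) {
  const int ytop_left = above[-1];
  for (int r = 0; r < bs; ++r, dst += y_stride)
    for (int c = 0; c < bs; ++c)
      dst[c] = clip_pixel(left[r] + above[c] - ytop_left);
}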
+; + + EXPORT |vp9_v_predictor_4x4_neon| + EXPORT |vp9_v_predictor_8x8_neon| + EXPORT |vp9_v_predictor_16x16_neon| + EXPORT |vp9_v_predictor_32x32_neon| + EXPORT |vp9_h_predictor_4x4_neon| + EXPORT |vp9_h_predictor_8x8_neon| + EXPORT |vp9_h_predictor_16x16_neon| + EXPORT |vp9_h_predictor_32x32_neon| + EXPORT |vp9_tm_predictor_4x4_neon| + EXPORT |vp9_tm_predictor_8x8_neon| + EXPORT |vp9_tm_predictor_16x16_neon| + EXPORT |vp9_tm_predictor_32x32_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_4x4_neon| PROC + vld1.32 {d0[0]}, [r2] + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |vp9_v_predictor_4x4_neon| + +;void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_8x8_neon| PROC + vld1.8 {d0}, [r2] + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + bx lr + ENDP ; |vp9_v_predictor_8x8_neon| + +;void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_16x16_neon| PROC + vld1.8 {q0}, [r2] + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |vp9_v_predictor_16x16_neon| + +;void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_32x32_neon| PROC + vld1.8 {q0, q1}, [r2] + mov r2, #2 +loop_v + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + subs r2, r2, #1 + bgt loop_v + bx lr + ENDP ; |vp9_v_predictor_32x32_neon| + +;void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_4x4_neon| PROC + vld1.32 {d1[0]}, [r3] + vdup.8 d0, d1[0] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[1] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[2] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[3] + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |vp9_h_predictor_4x4_neon| + +;void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t 
y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_8x8_neon| PROC + vld1.64 {d1}, [r3] + vdup.8 d0, d1[0] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[1] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[2] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[3] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[4] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[5] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[6] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[7] + vst1.64 {d0}, [r0], r1 + bx lr + ENDP ; |vp9_h_predictor_8x8_neon| + +;void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_16x16_neon| PROC + vld1.8 {q1}, [r3] + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |vp9_h_predictor_16x16_neon| + +;void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_32x32_neon| PROC + sub r1, r1, #16 + mov r2, #2 +loop_h + vld1.8 {q1}, [r3]! + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0]! 
+ vst1.8 {q0}, [r0], r1 + subs r2, r2, #1 + bgt loop_h + bx lr + ENDP ; |vp9_h_predictor_32x32_neon| + +;void vp9_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_tm_predictor_4x4_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + ldrb r12, [r12] + vdup.u8 d0, r12 + + ; Load above 4 pixels + vld1.32 {d2[0]}, [r2] + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + ldrb r12, [r3], #1 + ldrb r2, [r3], #1 + vdup.u16 q1, r12 + vdup.u16 q2, r2 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + + ; 3rd row and 4th row + ldrb r12, [r3], #1 + ldrb r2, [r3], #1 + vdup.u16 q1, r12 + vdup.u16 q2, r2 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + bx lr + ENDP ; |vp9_tm_predictor_4x4_neon| + +;void vp9_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_tm_predictor_8x8_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + ldrb r12, [r12] + vdup.u8 d0, r12 + + ; preload 8 left + vld1.8 {d30}, [r3] + + ; Load above 8 pixels + vld1.64 {d2}, [r2] + + vmovl.u8 q10, d30 + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + vdup.16 q0, d20[0] + vdup.16 q1, d20[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 3rd row and 4th row + vdup.16 q8, d20[2] + vdup.16 q9, d20[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + ; 5th row and 6th row + vdup.16 q0, d21[0] + vdup.16 q1, d21[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 7th row and 8th row + vdup.16 q8, d21[2] + vdup.16 q9, d21[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + bx lr + ENDP ; |vp9_tm_predictor_8x8_neon| + +;void vp9_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_tm_predictor_16x16_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + ldrb r12, [r12] + vdup.u8 q0, r12 + + ; Load above 8 pixels + vld1.8 {q1}, [r2] + + ; preload 8 left into r12 + vld1.8 {d18}, [r3]! + + ; Compute above - ytop_left + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d1 + + vmovl.u8 q10, d18 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 2 times to process 16 rows. + mov r2, #2 + +loop_16x16_neon + ; Process two rows. 
+ vdup.16 q0, d20[0] + vdup.16 q8, d20[1] + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d20[2] ; proload next 2 rows data + vdup.16 q8, d20[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + ; Process two rows. + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[0] ; proload next 2 rows data + vdup.16 q8, d21[1] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[2] ; proload next 2 rows data + vdup.16 q8, d21[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vld1.8 {d18}, [r3]! ; preload 8 left into r12 + vmovl.u8 q10, d18 + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + subs r2, r2, #1 + bgt loop_16x16_neon + + bx lr + ENDP ; |vp9_tm_predictor_16x16_neon| + +;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_tm_predictor_32x32_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + ldrb r12, [r12] + vdup.u8 q0, r12 + + ; Load above 32 pixels + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] + + ; preload 8 left pixels + vld1.8 {d26}, [r3]! + + ; Compute above - ytop_left + vsubl.u8 q8, d2, d0 + vsubl.u8 q9, d3, d1 + vsubl.u8 q10, d4, d0 + vsubl.u8 q11, d5, d1 + + vmovl.u8 q3, d26 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 4 times to process 32 rows. + mov r2, #4 + +loop_32x32_neon + ; Process two rows. + vdup.16 q0, d6[0] + vdup.16 q2, d6[1] + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q1, d6[2] + vdup.16 q2, d6[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q1, q8 + vadd.s16 q13, q1, q9 + vadd.s16 q14, q1, q10 + vadd.s16 q15, q1, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[0] + vdup.16 q2, d7[1] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. 
+ vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[2] + vdup.16 q2, d7[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! ; preload 8 left pixels + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vmovl.u8 q3, d0 + vst1.64 {d24-d27}, [r0], r1 + + subs r2, r2, #1 + bgt loop_32x32_neon + + bx lr + ENDP ; |vp9_tm_predictor_32x32_neon| + + END diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/generic/vp9_systemdependent.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/generic/vp9_systemdependent.c deleted file mode 100644 index 536febb6522..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/generic/vp9_systemdependent.c +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "./vpx_config.h" -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_onyxc_int.h" - -void vp9_machine_specific_config(VP9_COMMON *cm) { - (void)cm; - vp9_rtcd(); -} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h index 644264f656a..6ebea9f2f43 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_COMMON_DSPR2_H_ -#define VP9_COMMON_VP9_COMMON_DSPR2_H_ +#ifndef VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_ +#define VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_ #include <assert.h> @@ -17,6 +17,10 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" +#ifdef __cplusplus +extern "C" { +#endif + #if HAVE_DSPR2 #define CROP_WIDTH 512 extern uint8_t *vp9_ff_cropTbl; @@ -81,8 +85,8 @@ static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) { ); } -void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -114,4 +118,8 @@ void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, int w, int h); #endif // #if HAVE_DSPR2 -#endif // VP9_COMMON_VP9_COMMON_DSPR2_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c new file mode 100644 index 00000000000..b0dc496aebf --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
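The vp9_common_dspr2.h hunks just above rename the include guard to match the header's path and wrap its declarations in extern "C" so C++ translation units link against the C definitions; schematically, the header now has this shape (prototypes abbreviated for illustration):

#ifndef VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_
#define VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_

#include "vpx/vpx_integer.h"

#ifdef __cplusplus
extern "C" {  /* C linkage for C++ includers */
#endif

void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                   int dest_stride);
/* ... remaining dspr2 prototypes ... */

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_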
+ */ +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +void vp9_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + + __asm__ __volatile__ ( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + "lb %[tmp9], 8(%[left]) \n\t" + "lb %[tmp10], 9(%[left]) \n\t" + "lb %[tmp11], 10(%[left]) \n\t" + "lb %[tmp12], 11(%[left]) \n\t" + "lb %[tmp13], 12(%[left]) \n\t" + "lb %[tmp14], 13(%[left]) \n\t" + "lb %[tmp15], 14(%[left]) \n\t" + "lb %[tmp16], 15(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + "replv.qb %[tmp9], %[tmp9] \n\t" + "replv.qb %[tmp10], %[tmp10] \n\t" + "replv.qb %[tmp11], %[tmp11] \n\t" + "replv.qb %[tmp12], %[tmp12] \n\t" + "replv.qb %[tmp13], %[tmp13] \n\t" + "replv.qb %[tmp14], %[tmp14] \n\t" + "replv.qb %[tmp15], %[tmp15] \n\t" + "replv.qb %[tmp16], %[tmp16] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "sw %[tmp1], 8(%[dst]) \n\t" + "sw %[tmp1], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "sw %[tmp2], 8(%[dst]) \n\t" + "sw %[tmp2], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "sw %[tmp3], 8(%[dst]) \n\t" + "sw %[tmp3], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "sw %[tmp4], 8(%[dst]) \n\t" + "sw %[tmp4], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "sw %[tmp5], 8(%[dst]) \n\t" + "sw %[tmp5], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "sw %[tmp6], 8(%[dst]) \n\t" + "sw %[tmp6], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "sw %[tmp7], 8(%[dst]) \n\t" + "sw %[tmp7], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + "sw %[tmp8], 8(%[dst]) \n\t" + "sw %[tmp8], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp9], (%[dst]) \n\t" + "sw %[tmp9], 4(%[dst]) \n\t" + "sw %[tmp9], 8(%[dst]) \n\t" + "sw %[tmp9], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp10], (%[dst]) \n\t" + "sw %[tmp10], 4(%[dst]) \n\t" + "sw %[tmp10], 8(%[dst]) \n\t" + "sw %[tmp10], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp11], (%[dst]) \n\t" + "sw %[tmp11], 4(%[dst]) \n\t" + "sw %[tmp11], 8(%[dst]) \n\t" + "sw %[tmp11], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp12], (%[dst]) \n\t" + "sw %[tmp12], 4(%[dst]) \n\t" + "sw %[tmp12], 8(%[dst]) \n\t" + "sw %[tmp12], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp13], (%[dst]) \n\t" + "sw 
%[tmp13], 4(%[dst]) \n\t" + "sw %[tmp13], 8(%[dst]) \n\t" + "sw %[tmp13], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp14], (%[dst]) \n\t" + "sw %[tmp14], 4(%[dst]) \n\t" + "sw %[tmp14], 8(%[dst]) \n\t" + "sw %[tmp14], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp15], (%[dst]) \n\t" + "sw %[tmp15], 4(%[dst]) \n\t" + "sw %[tmp15], 8(%[dst]) \n\t" + "sw %[tmp15], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp16], (%[dst]) \n\t" + "sw %[tmp16], 4(%[dst]) \n\t" + "sw %[tmp16], 8(%[dst]) \n\t" + "sw %[tmp16], 12(%[dst]) \n\t" + + : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), + [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4), + [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7), + [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8), + [tmp9] "=&r" (tmp9), [tmp10] "=&r" (tmp10), + [tmp11] "=&r" (tmp11), [tmp12] "=&r" (tmp12), + [tmp13] "=&r" (tmp13), [tmp14] "=&r" (tmp14), + [tmp15] "=&r" (tmp15), [tmp16] "=&r" (tmp16) + : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride) + ); +} + +void vp9_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, left2; + + __asm__ __volatile__ ( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" + "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "lw %[above1], 8(%[above]) \n\t" + "lw %[above2], 12(%[above]) \n\t" + "lw %[left1], 8(%[left]) \n\t" + "lw %[left2], 12(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" + "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addiu %[average], %[average], 16 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 5 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], 
(%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + : [left1] "=&r" (left1), [above1] "=&r" (above1), + [left_l1] "=&r" (left_l1), [above_l1] "=&r" (above_l1), + [left_r1] "=&r" (left_r1), [above_r1] "=&r" (above_r1), + [above2] "=&r" (above2), [left2] "=&r" (left2), + [average] "=&r" (average), [tmp] "=&r" (tmp), + [expected_dc] "=&r" (expected_dc) + : [above] "r" (above), [left] "r" (left), + [dst] "r" (dst), [stride] "r" (stride) + ); +} +#endif // #if HAVE_DSPR2 diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c new file mode 100644 index 00000000000..a53c62381ce --- /dev/null +++ 
b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +void vp9_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4; + + __asm__ __volatile__ ( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "sw %[tmp1], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + + : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), + [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4) + : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride) + ); +} + +void vp9_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; + + __asm__ __volatile__ ( + "lw %[above_c], (%[above]) \n\t" + "lw %[left_c], (%[left]) \n\t" + + "preceu.ph.qbl %[above_l], %[above_c] \n\t" + "preceu.ph.qbr %[above_r], %[above_c] \n\t" + "preceu.ph.qbl %[left_l], %[left_c] \n\t" + "preceu.ph.qbr %[left_r], %[left_c] \n\t" + + "addu.ph %[average], %[above_r], %[above_l] \n\t" + "addu.ph %[average], %[average], %[left_l] \n\t" + "addu.ph %[average], %[average], %[left_r] \n\t" + "addiu %[average], %[average], 4 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 3 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + + : [above_c] "=&r" (above_c), [above_l] "=&r" (above_l), + [above_r] "=&r" (above_r), [left_c] "=&r" (left_c), + [left_l] "=&r" (left_l), [left_r] "=&r" (left_r), + [average] "=&r" (average), [tmp] "=&r" (tmp), + [expected_dc] "=&r" (expected_dc) + : [above] "r" (above), [left] "r" (left), + [dst] "r" (dst), [stride] "r" (stride) + ); +} + +void vp9_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t abovel, abover; + int32_t left0, left1, left2, left3; + int32_t res0, res1; + int32_t resl; + int32_t resr; + int32_t top_left; + uint8_t *cm = vp9_ff_cropTbl; + + __asm__ __volatile__ ( + "ulw %[resl], (%[above]) \n\t" + + "lbu %[left0], (%[left]) \n\t" + "lbu %[left1], 1(%[left]) \n\t" + "lbu %[left2], 2(%[left]) \n\t" + "lbu %[left3], 3(%[left]) \n\t" + + "lbu %[top_left], -1(%[above]) \n\t" + + "preceu.ph.qbl %[abovel], %[resl] 
\n\t" + "preceu.ph.qbr %[abover], %[resl] \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "replv.ph %[left1], %[left1] \n\t" + "replv.ph %[left2], %[left2] \n\t" + "replv.ph %[left3], %[left3] \n\t" + + "replv.ph %[top_left], %[top_left] \n\t" + + "addu.ph %[resl], %[abovel], %[left0] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left0] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left1] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left1] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sb %[res1], 1(%[dst]) \n\t" + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left2] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left2] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + + "sb %[res1], 1(%[dst]) \n\t" + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left3] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left3] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + : [abovel] "=&r" (abovel), [abover] "=&r" (abover), + [left0] "=&r" (left0), [left1] "=&r" (left1), [left2] "=&r" (left2), + [res0] "=&r" (res0), [res1] "=&r" (res1), [left3] "=&r" (left3), + [resl] "=&r" (resl), [resr] "=&r" (resr), [top_left] "=&r" (top_left) + : [above] "r" (above), [left] "r" (left), + [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm) + ); +} +#endif // #if HAVE_DSPR2 diff --git 
a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c new file mode 100644 index 00000000000..40d93ae3506 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c @@ -0,0 +1,610 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +void vp9_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + + __asm__ __volatile__ ( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + + : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), + [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4), + [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7), + [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8) + : [left] "r" (left), [dst] "r" (dst), + [stride] "r" (stride) + ); +} + +void vp9_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; + + __asm__ __volatile__ ( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "preceu.ph.qbl %[above_l2], %[above2] \n\t" + "preceu.ph.qbr %[above_r2], %[above2] \n\t" + "preceu.ph.qbl %[left_l2], %[left2] \n\t" + "preceu.ph.qbr %[left_r2], %[left2] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph 
%[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addu.ph %[average], %[average], %[above_l2] \n\t" + "addu.ph %[average], %[average], %[above_r2] \n\t" + "addu.ph %[average], %[average], %[left_l2] \n\t" + "addu.ph %[average], %[average], %[left_r2] \n\t" + + "addiu %[average], %[average], 8 \n\t" + + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 4 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + : [above1] "=&r" (above1), [above_l1] "=&r" (above_l1), + [above_r1] "=&r" (above_r1), [left1] "=&r" (left1), + [left_l1] "=&r" (left_l1), [left_r1] "=&r" (left_r1), + [above2] "=&r" (above2), [above_l2] "=&r" (above_l2), + [above_r2] "=&r" (above_r2), [left2] "=&r" (left2), + [left_l2] "=&r" (left_l2), [left_r2] "=&r" (left_r2), + [average] "=&r" (average), [tmp] "=&r" (tmp), + [expected_dc] "=&r" (expected_dc) + : [above] "r" (above), [left] "r" (left), [dst] "r" (dst), + [stride] "r" (stride) + ); +} + +void vp9_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t abovel, abover; + int32_t abovel_1, abover_1; + int32_t left0; + int32_t res0, res1, res2, res3; + int32_t reshw; + int32_t top_left; + uint8_t *cm = vp9_ff_cropTbl; + + __asm__ __volatile__ ( + "ulw %[reshw], (%[above]) \n\t" + "ulw %[top_left], 4(%[above]) \n\t" + + "lbu %[left0], (%[left]) \n\t" + + "preceu.ph.qbl %[abovel], %[reshw] \n\t" + "preceu.ph.qbr %[abover], %[reshw] \n\t" + "preceu.ph.qbl %[abovel_1], %[top_left] \n\t" + "preceu.ph.qbr %[abover_1], %[top_left] \n\t" + + "lbu %[top_left], -1(%[above]) \n\t" + "replv.ph %[left0], %[left0] \n\t" + + "replv.ph %[top_left], %[top_left] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + 
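+      /* repeat for the other halfword pair unpacked from above[4..7] in the same output row */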
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 1(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 2(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 3(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add 
%[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 4(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 5(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux 
%[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 6(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 7(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + 
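+      /* clip the four 16-bit sums to 8-bit pixels via the crop table (cm) lookups */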
+ "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + : [abovel] "=&r" (abovel), [abover] "=&r" (abover), + [abovel_1] "=&r" (abovel_1), [abover_1] "=&r" (abover_1), + [left0] "=&r" (left0), [res2] "=&r" (res2), [res3] "=&r" (res3), + [res0] "=&r" (res0), [res1] "=&r" (res1), + [reshw] "=&r" (reshw), [top_left] "=&r" (top_left) + : [above] "r" (above), [left] "r" (left), + [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm) + ); +} +#endif // #if HAVE_DSPR2 diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c index 1b2f5506a9a..19c582fd109 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct16_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_10, step1_11, step1_12, step1_13; @@ -404,8 +404,8 @@ static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, } } -static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_8, step1_9, step1_10, step1_11; @@ -905,13 +905,13 @@ void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct16_1d_rows_dspr2(input, out, 16); + idct16_rows_dspr2(input, out, 16); // Then transform columns and add to dest - idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, dest_stride); } -static void iadst16_1d(const int16_t *input, int16_t *output) { +static void iadst16(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; int x0 = input[15]; @@ -1099,16 +1099,16 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - idct16_1d_rows_dspr2(input, outptr, 16); - idct16_1d_cols_add_blk_dspr2(out, dest, pitch); + idct16_rows_dspr2(input, outptr, 16); + idct16_cols_add_blk_dspr2(out, dest, pitch); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - idct16_1d_rows_dspr2(input, outptr, 16); + idct16_rows_dspr2(input, outptr, 16); outptr = out; for (i = 0; i < 16; ++i) { - iadst16_1d(outptr, temp_out); + iadst16(outptr, temp_out); for (j = 0; j < 16; ++j) dest[j * pitch + i] = @@ -1125,7 +1125,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, /* prefetch row */ vp9_prefetch_load((const uint8_t *)(input + 16)); - iadst16_1d(input, outptr); + iadst16(input, outptr); input += 16; outptr += 16; } @@ -1134,7 +1134,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, for (j = 0; j < 16; ++j) temp_in[j * 16 + i] = out[i * 16 + j]; - idct16_1d_cols_add_blk_dspr2(temp_in, dest, 
pitch); + idct16_cols_add_blk_dspr2(temp_in, dest, pitch); } break; case ADST_ADST: // ADST in both directions @@ -1145,7 +1145,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, /* prefetch row */ vp9_prefetch_load((const uint8_t *)(input + 16)); - iadst16_1d(input, outptr); + iadst16(input, outptr); input += 16; outptr += 16; } @@ -1153,7 +1153,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - iadst16_1d(temp_in, temp_out); + iadst16(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * pitch + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) @@ -1183,7 +1183,7 @@ void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. - idct16_1d_rows_dspr2(input, outptr, 4); + idct16_rows_dspr2(input, outptr, 4); outptr += 4; for (i = 0; i < 6; ++i) { @@ -1213,7 +1213,7 @@ void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, } // Then transform columns - idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, dest_stride); } void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c index 5e92db3d289..132d88ce5f7 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c @@ -18,8 +18,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c index d3aee73cbc2..74a90b02caa 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c @@ -19,7 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) { +static void idct32_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; @@ -42,7 +43,7 @@ static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) { const int const_2_power_13 = 8192; const int32_t *input_int; - for (i = 32; i--; ) { + for (i = no_rows; i--; ) { input_int = (const int32_t *)input; if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | @@ -881,10 +882,72 @@ void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - 
idct32_1d_rows_dspr2(input, outptr); + idct32_rows_dspr2(input, outptr, 32); // Columns - vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride); + vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // Rows + idct32_rows_dspr2(input, outptr, 8); + + outptr += 8; + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + + for (i = 0; i < 31; ++i) { + outptr += 32; + + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + } + + // Columns + vp9_idct32_cols_add_blk_dspr2(out, dest, stride); } void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c index 5b7aa5e71ea..1990348b83a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -19,7 +19,7 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { +static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) { int16_t step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; @@ -104,7 +104,7 @@ static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { } } -static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, +static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { int16_t step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; @@ -240,10 +240,10 @@ void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - vp9_idct4_1d_rows_dspr2(input, outptr); + vp9_idct4_rows_dspr2(input, outptr); // Columns - vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); } void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, @@ -319,7 +319,7 @@ void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, } } -static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) { +static void iadst4_dspr2(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0, x1, x2, x3; @@ -379,16 +379,16 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, switch 
(tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - vp9_idct4_1d_rows_dspr2(input, outptr); - vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vp9_idct4_rows_dspr2(input, outptr); + vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - vp9_idct4_1d_rows_dspr2(input, outptr); + vp9_idct4_rows_dspr2(input, outptr); outptr = out; for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(outptr, temp_out); + iadst4_dspr2(outptr, temp_out); for (j = 0; j < 4; ++j) dest[j * dest_stride + i] = @@ -400,7 +400,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, break; case DCT_ADST: // DCT in vertical, ADST in horizontal for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(input, outptr); + iadst4_dspr2(input, outptr); input += 4; outptr += 4; } @@ -410,11 +410,11 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 4 + j] = out[j * 4 + i]; } } - vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + vp9_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(input, outptr); + iadst4_dspr2(input, outptr); input += 4; outptr += 4; } @@ -422,7 +422,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - iadst4_1d_dspr2(temp_in, temp_out); + iadst4_dspr2(temp_in, temp_out); for (j = 0; j < 4; ++j) dest[j * dest_stride + i] = diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index 93a08401d43..fc44ffa311d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct8_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; const int const_2_power_13 = 8192; int Temp0, Temp1, Temp2, Temp3, Temp4; @@ -200,8 +200,8 @@ static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, } } -static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int Temp0, Temp1, Temp2, Temp3; int i; @@ -462,13 +462,13 @@ void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct8_1d_rows_dspr2(input, outptr, 8); + idct8_rows_dspr2(input, outptr, 8); // Then transform columns and add to dest - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); } -static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) { +static void iadst8_dspr2(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0, x1, x2, x3, x4, x5, x6, x7; @@ -563,14 +563,14 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - idct8_1d_rows_dspr2(input, outptr, 
8); - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_rows_dspr2(input, outptr, 8); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - idct8_1d_rows_dspr2(input, outptr, 8); + idct8_rows_dspr2(input, outptr, 8); for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(&out[i * 8], temp_out); + iadst8_dspr2(&out[i * 8], temp_out); for (j = 0; j < 8; ++j) dest[j * dest_stride + i] = @@ -580,7 +580,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, break; case DCT_ADST: // DCT in vertical, ADST in horizontal for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(input, outptr); + iadst8_dspr2(input, outptr); input += 8; outptr += 8; } @@ -590,11 +590,11 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 8 + j] = out[j * 8 + i]; } } - idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(input, outptr); + iadst8_dspr2(input, outptr); input += 8; outptr += 8; } @@ -603,7 +603,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - iadst8_1d_dspr2(temp_in, temp_out); + iadst8_dspr2(temp_in, temp_out); for (j = 0; j < 8; ++j) dest[j * dest_stride + i] = @@ -617,7 +617,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, } } -void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, +void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride) { DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; @@ -631,7 +631,7 @@ void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct8_1d_rows_dspr2(input, outptr, 4); + idct8_rows_dspr2(input, outptr, 4); outptr += 4; @@ -659,7 +659,7 @@ void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, // Then transform columns and add to dest - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); } void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c new file mode 100644 index 00000000000..3df7f4c9fe5 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h" + +#if HAVE_DSPR2 +void vp9_lpf_horizontal_4_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + uint8_t i; + uint32_t mask; + uint32_t hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__ ( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), + [limit_vec] "=r" (limit_vec) + : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) + ); + + /* prefetch data for store */ + vp9_prefetch_store(s); + + /* the loop filter is designed to work on chars so that we can make maximum + use of 8-bit SIMD instructions. */ + for (i = 0; i < 2; i++) { + sm1 = s - (pitch << 2); + s0 = sm1 + pitch; + s1 = s0 + pitch; + s2 = s - pitch; + s3 = s; + s4 = s + pitch; + s5 = s4 + pitch; + s6 = s5 + pitch; + + __asm__ __volatile__ ( + "lw %[p1], (%[s1]) \n\t" + "lw %[p2], (%[s2]) \n\t" + "lw %[p3], (%[s3]) \n\t" + "lw %[p4], (%[s4]) \n\t" + + : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4) + : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) + ); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0), + mask will be zero and filtering is not needed */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + __asm__ __volatile__ ( + "lw %[pm1], (%[sm1]) \n\t" + "lw %[p0], (%[s0]) \n\t" + "lw %[p5], (%[s5]) \n\t" + "lw %[p6], (%[s6]) \n\t" + + : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5), + [p6] "=&r" (p6) + : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6) + ); + + vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, + pm1, p0, p3, p4, p5, p6, + thresh_vec, &hev, &mask); + + /* if mask == 0, filtering is not needed */ + if (mask) { + /* filtering */ + vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); + + __asm__ __volatile__ ( + "sw %[p1], (%[s1]) \n\t" + "sw %[p2], (%[s2]) \n\t" + "sw %[p3], (%[s3]) \n\t" + "sw %[p4], (%[s4]) \n\t" + + : + : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4), + [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) + ); + } + } + + s = s + 4; + } +} + +void vp9_lpf_vertical_4_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + uint8_t i; + uint32_t mask, hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__ ( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), + 
[limit_vec] "=r" (limit_vec) + : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) + ); + + /* prefetch data for store */ + vp9_prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, + p0, p3, p4, p5, p6, thresh_vec, + &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood + * don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__ ( + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + + : + : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), + [s4] "r" (s4) + ); + + __asm__ __volatile__ ( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) + : + ); + + __asm__ __volatile__ ( + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + + : [p1] "+r" (p1) + : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3) + ); + + __asm__ __volatile__ ( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 
\n\t" + + : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) + : + ); + + __asm__ __volatile__ ( + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + + : + : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), + [s2] "r" (s2) + ); + + __asm__ __volatile__ ( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) + : + ); + + __asm__ __volatile__ ( + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + + : + : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), + [s1] "r" (s1) + ); + } + } + } +} + +void vp9_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, + 1); +} + +void vp9_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + vp9_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); + vp9_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh); +} +#endif // #if HAVE_DSPR2 diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h new file mode 100644 index 00000000000..008cf8cacd9 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h @@ -0,0 +1,763 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ +#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_onyxc_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +/* inputs & outputs are quad-byte vectors */ +static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev, + uint32_t *ps1, uint32_t *ps0, + uint32_t *qs0, uint32_t *qs1) { + int32_t vp9_filter_l, vp9_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (*ps0) ^ N128; + vps1 = (*ps1) ^ N128; + vqs0 = (*qs0) ^ N128; + vqs1 = (*qs1) ^ N128; + + /* use halfword pairs instead of quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__ ( + /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[vp9_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[vp9_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* vp9_filter &= hev; */ + "and %[vp9_filter_l], %[vp9_filter_l], %[hev_l] \n\t" + "and %[vp9_filter_r], %[vp9_filter_r], %[hev_r] \n\t" + + /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t" + + /* vp9_filter &= mask; */ + "and %[vp9_filter_l], %[vp9_filter_l], %[mask_l] \n\t" + "and %[vp9_filter_r], %[vp9_filter_r], %[mask_r] \n\t" + + : [vp9_filter_l] "=&r" (vp9_filter_l), + [vp9_filter_r] "=&r" (vp9_filter_r), + [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r), + [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r) + : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l), + [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r), + [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r), + [mask_l] "r" (mask_l), [mask_r] "r" (mask_r), + [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), + [HWM] "r" (HWM) + ); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__ ( + /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */ + "addq_s.ph %[Filter1_l], %[vp9_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[vp9_filter_r], %[t2] \n\t" + + /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */ + "addq_s.ph %[Filter2_l], %[vp9_filter_l], %[t1] 
\n\t" + "addq_s.ph %[Filter2_r], %[vp9_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r), + [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r), + [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), + [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) + : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM), + [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r) + ); + + __asm__ __volatile__ ( + /* (vp9_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* vp9_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r), + [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), + [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r) + : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r) + ); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__ ( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r), + [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r) + : + ); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *ps0 = vps0 ^ N128; + *ps1 = vps1 ^ N128; + *qs0 = vqs0 ^ N128; + *qs1 = vqs1 ^ N128; +} + +static INLINE void vp9_filter1_dspr2(uint32_t mask, uint32_t hev, + uint32_t ps1, uint32_t ps0, + uint32_t qs0, uint32_t qs1, + uint32_t *p1_f0, uint32_t *p0_f0, + uint32_t *q0_f0, uint32_t *q1_f0) { + int32_t vp9_filter_l, vp9_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (ps0) ^ N128; + vps1 = (ps1) ^ N128; + vqs0 = (qs0) ^ N128; + vqs1 = (qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = 
vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__ ( + /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[vp9_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[vp9_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* vp9_filter &= hev; */ + "and %[vp9_filter_l], %[vp9_filter_l], %[hev_l] \n\t" + "and %[vp9_filter_r], %[vp9_filter_r], %[hev_r] \n\t" + + /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t" + + /* vp9_filter &= mask; */ + "and %[vp9_filter_l], %[vp9_filter_l], %[mask_l] \n\t" + "and %[vp9_filter_r], %[vp9_filter_r], %[mask_r] \n\t" + + : [vp9_filter_l] "=&r" (vp9_filter_l), + [vp9_filter_r] "=&r" (vp9_filter_r), + [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r), + [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r) + : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l), + [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r), + [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r), + [mask_l] "r" (mask_l), [mask_r] "r" (mask_r), + [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM) + ); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__ ( + /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */ + "addq_s.ph %[Filter1_l], %[vp9_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[vp9_filter_r], %[t2] \n\t" + + /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */ + "addq_s.ph %[Filter2_l], %[vp9_filter_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], %[vp9_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r), + [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r), + [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), + [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) + : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM), + [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r) + ); + + __asm__ __volatile__ ( + /* (vp9_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* vp9_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and 
%[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r), + [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), + [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r) + : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r) + ); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__ ( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r), + [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r) + : + ); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *p0_f0 = vps0 ^ N128; + *p1_f0 = vps1 ^ N128; + *q0_f0 = vqs0 ^ N128; + *q1_f0 = vqs1 ^ N128; +} + +static INLINE void vp9_mbfilter_dspr2(uint32_t *op3, uint32_t *op2, + uint32_t *op1, uint32_t *op0, + uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__ ( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + 
"addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r" (add_p210_q012), + [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2), + [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0), + [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1), + [res_oq2] "=&r" (res_oq2) + : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1), + [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3), + [u32Four] "r" (u32Four) + ); + + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; +} + +static INLINE void vp9_mbfilter1_dspr2(uint32_t p3, uint32_t p2, + uint32_t p1, uint32_t p0, + uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, + uint32_t *op2_f1, + uint32_t *op1_f1, uint32_t *op0_f1, + uint32_t *oq0_f1, uint32_t *oq1_f1, + uint32_t *oq2_f1) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__ ( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], 
%[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp), + [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1), + [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0), + [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2) + : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1), + [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3), + [u32Four] "r" (u32Four) + ); + + *op2_f1 = res_op2; + *op1_f1 = res_op1; + *op0_f1 = res_op0; + *oq0_f1 = res_oq0; + *oq1_f1 = res_oq1; + *oq2_f1 = res_oq2; +} + +static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, + uint32_t *op5, uint32_t *op4, + uint32_t *op3, uint32_t *op2, + uint32_t *op1, uint32_t *op0, + uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3, + uint32_t *oq4, uint32_t *oq5, + uint32_t *oq6, uint32_t *oq7) { + const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; + uint32_t tmp; + uint32_t add_p6toq6; + uint32_t u32Eight = 0x00080008; + + __asm__ __volatile__ ( + /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 + which is used most of the time */ + "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" + + : [add_p6toq6] "=&r" (add_p6toq6) + : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), + [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), + [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3), + [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), + [u32Eight] "r" (u32Eight) + ); + + __asm__ __volatile__ ( + /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + + p3 + p2 + p1 + p0 + q0, 4) */ + "shll.ph %[tmp], %[p7], 3 \n\t" + "subu.ph %[res_op6], %[tmp], %[p7] \n\t" + "addu.ph %[res_op6], %[res_op6], %[p6] \n\t" + "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q1] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q2] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q3] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q4] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q5] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q6] \n\t" + "shrl.ph %[res_op6], %[res_op6], 4 \n\t" + + /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + + p2 + p1 + p0 + q0 + q1, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op5], %[tmp], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p5] \n\t" + "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t" + "subu.ph 
%[res_op5], %[res_op5], %[q2] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q3] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q4] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q5] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q6] \n\t" + "shrl.ph %[res_op5], %[res_op5], 4 \n\t" + + /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + + p1 + p0 + q0 + q1 + q2, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op4], %[tmp], %[p7] \n\t" + "addu.ph %[res_op4], %[res_op4], %[p4] \n\t" + "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q3] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q4] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q5] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q6] \n\t" + "shrl.ph %[res_op4], %[res_op4], 4 \n\t" + + /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + + p1 + p0 + q0 + q1 + q2 + q3, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op3], %[tmp], %[p3] \n\t" + "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q4] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q5] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q6] \n\t" + "shrl.ph %[res_op3], %[res_op3], 4 \n\t" + + /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + + p0 + q0 + q1 + q2 + q3 + q4, 4) */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p7] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q5] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q6] \n\t" + "shrl.ph %[res_op2], %[res_op2], 4 \n\t" + + /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + + p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op1], %[tmp], %[p1] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q6] \n\t" + "shrl.ph %[res_op1], %[res_op1], 4 \n\t" + + /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */ + "addu.ph %[res_op0], %[p7], %[p0] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" + "shrl.ph %[res_op0], %[res_op0], 4 \n\t" + + : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5), + [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3), + [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1), + [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp) + : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), + [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), + [q2] "r" (q2), [q1] "r" (q1), + [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), + [add_p6toq6] "r" (add_p6toq6) + ); + + *op6 = res_op6; + *op5 = res_op5; + *op4 = res_op4; + *op3 = res_op3; + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + + __asm__ __volatile__ ( + /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ + "addu.ph %[res_oq0], %[q7], %[q0] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t" + + /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq1], %[tmp], %[q1] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t" + + /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + + q3 + q4 + q5 
+ q6 + q7 * 3, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq2], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t" + + /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 + + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq3], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t" + "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t" + + /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 + + q4 * 2 + q5 + q6 + q7 * 5, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq4], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t" + "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t" + + /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 + + q5 * 2 + q6 + q7 * 6, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq5], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t" + "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t" + + /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + + q4 + q5 + q6 * 2 + q7 * 7, 4) */ + "shll.ph %[tmp], %[q7], 3 \n\t" + "subu.ph %[res_oq6], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" + "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" + + : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5), + [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3), + [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1), + [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp) + : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), + [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0), + [p1] "r" (p1), [p2] "r" (p2), + [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6), + [add_p6toq6] "r" (add_p6toq6) + ); + + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; + *oq3 = res_oq3; + *oq4 = res_oq4; + *oq5 = res_oq5; + *oq6 = res_oq6; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h new file mode 100644 index 00000000000..ca01a6a1030 --- /dev/null +++ 
b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ +#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_onyxc_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +#define STORE_F0() { \ + __asm__ __volatile__ ( \ + "sb %[q1_f0], 1(%[s4]) \n\t" \ + "sb %[q0_f0], 0(%[s4]) \n\t" \ + "sb %[p0_f0], -1(%[s4]) \n\t" \ + "sb %[p1_f0], -2(%[s4]) \n\t" \ + \ + : \ + : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ + [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ + [s4] "r" (s4) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ + [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q1_f0], 1(%[s3]) \n\t" \ + "sb %[q0_f0], 0(%[s3]) \n\t" \ + "sb %[p0_f0], -1(%[s3]) \n\t" \ + "sb %[p1_f0], -2(%[s3]) \n\t" \ + \ + : [p1_f0] "+r" (p1_f0) \ + : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ + [s3] "r" (s3), [p0_f0] "r" (p0_f0) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ + [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q1_f0], 1(%[s2]) \n\t" \ + "sb %[q0_f0], 0(%[s2]) \n\t" \ + "sb %[p0_f0], -1(%[s2]) \n\t" \ + "sb %[p1_f0], -2(%[s2]) \n\t" \ + \ + : \ + : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ + [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ + [s2] "r" (s2) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ + [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q1_f0], 1(%[s1]) \n\t" \ + "sb %[q0_f0], 0(%[s1]) \n\t" \ + "sb %[p0_f0], -1(%[s1]) \n\t" \ + "sb %[p1_f0], -2(%[s1]) \n\t" \ + \ + : \ + : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ + [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ + [s1] "r" (s1) \ + ); \ +} + +#define STORE_F1() { \ + __asm__ __volatile__ ( \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + \ + : \ + : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \ + [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ + [s4] "r" (s4) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], 
%[p2_r], 16 \n\t" \ + \ + : [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), [q0_r] "+r" (q0_r), \ + [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + \ + : \ + : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \ + [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ + [s3] "r" (s3) \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + \ + : \ + : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \ + [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ + [s2] "r" (s2) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + \ + : [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), [q0_l] "+r" (q0_l), \ + [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + \ + : \ + : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \ + [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ + [s1] "r" (s1) \ + ); \ +} + +#define STORE_F2() { \ + __asm__ __volatile__ ( \ + "sb %[q6_r], 6(%[s4]) \n\t" \ + "sb %[q5_r], 5(%[s4]) \n\t" \ + "sb %[q4_r], 4(%[s4]) \n\t" \ + "sb %[q3_r], 3(%[s4]) \n\t" \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + "sb %[p3_r], -4(%[s4]) \n\t" \ + "sb %[p4_r], -5(%[s4]) \n\t" \ + "sb %[p5_r], -6(%[s4]) \n\t" \ + "sb %[p6_r], -7(%[s4]) \n\t" \ + \ + : \ + : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \ + [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \ + [q0_r] "r" (q0_r), \ + [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ + [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \ + [p6_r] "r" (p6_r), \ + [s4] "r" (s4) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q6_r], %[q6_r], 16 \n\t" \ + "srl %[q5_r], %[q5_r], 16 \n\t" \ + "srl %[q4_r], %[q4_r], 16 \n\t" \ + "srl %[q3_r], %[q3_r], 16 \n\t" \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + "srl %[p3_r], %[p3_r], 16 \n\t" \ + "srl %[p4_r], %[p4_r], 16 \n\t" \ + "srl %[p5_r], %[p5_r], 16 \n\t" \ + "srl %[p6_r], %[p6_r], 16 \n\t" \ + \ + : [q6_r] "+r" (q6_r), [q5_r] "+r" (q5_r), [q4_r] "+r" (q4_r), \ + [q3_r] "+r" (q3_r), [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), \ + [q0_r] "+r" (q0_r), \ + [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r), \ + [p3_r] "+r" (p3_r), [p4_r] "+r" (p4_r), [p5_r] "+r" (p5_r), \ + [p6_r] "+r" (p6_r) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q6_r], 6(%[s3]) \n\t" \ + "sb %[q5_r], 5(%[s3]) \n\t" \ + "sb %[q4_r], 4(%[s3]) \n\t" \ + "sb %[q3_r], 
3(%[s3]) \n\t" \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + "sb %[p3_r], -4(%[s3]) \n\t" \ + "sb %[p4_r], -5(%[s3]) \n\t" \ + "sb %[p5_r], -6(%[s3]) \n\t" \ + "sb %[p6_r], -7(%[s3]) \n\t" \ + \ + : \ + : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \ + [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \ + [q0_r] "r" (q0_r), \ + [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ + [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \ + [p6_r] "r" (p6_r), \ + [s3] "r" (s3) \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q6_l], 6(%[s2]) \n\t" \ + "sb %[q5_l], 5(%[s2]) \n\t" \ + "sb %[q4_l], 4(%[s2]) \n\t" \ + "sb %[q3_l], 3(%[s2]) \n\t" \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + "sb %[p3_l], -4(%[s2]) \n\t" \ + "sb %[p4_l], -5(%[s2]) \n\t" \ + "sb %[p5_l], -6(%[s2]) \n\t" \ + "sb %[p6_l], -7(%[s2]) \n\t" \ + \ + : \ + : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \ + [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \ + [q0_l] "r" (q0_l), \ + [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ + [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \ + [p6_l] "r" (p6_l), \ + [s2] "r" (s2) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q6_l], %[q6_l], 16 \n\t" \ + "srl %[q5_l], %[q5_l], 16 \n\t" \ + "srl %[q4_l], %[q4_l], 16 \n\t" \ + "srl %[q3_l], %[q3_l], 16 \n\t" \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + "srl %[p3_l], %[p3_l], 16 \n\t" \ + "srl %[p4_l], %[p4_l], 16 \n\t" \ + "srl %[p5_l], %[p5_l], 16 \n\t" \ + "srl %[p6_l], %[p6_l], 16 \n\t" \ + \ + : [q6_l] "+r" (q6_l), [q5_l] "+r" (q5_l), [q4_l] "+r" (q4_l), \ + [q3_l] "+r" (q3_l), [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), \ + [q0_l] "+r" (q0_l), \ + [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l), \ + [p3_l] "+r" (p3_l), [p4_l] "+r" (p4_l), [p5_l] "+r" (p5_l), \ + [p6_l] "+r" (p6_l) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q6_l], 6(%[s1]) \n\t" \ + "sb %[q5_l], 5(%[s1]) \n\t" \ + "sb %[q4_l], 4(%[s1]) \n\t" \ + "sb %[q3_l], 3(%[s1]) \n\t" \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + "sb %[p3_l], -4(%[s1]) \n\t" \ + "sb %[p4_l], -5(%[s1]) \n\t" \ + "sb %[p5_l], -6(%[s1]) \n\t" \ + "sb %[p6_l], -7(%[s1]) \n\t" \ + \ + : \ + : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \ + [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \ + [q0_l] "r" (q0_l), \ + [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ + [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \ + [p6_l] "r" (p6_l), \ + [s1] "r" (s1) \ + ); \ +} + +#define PACK_LEFT_0TO3() { \ + __asm__ __volatile__ ( \ + "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ + "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ + "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ + "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ + "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ + "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ + "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ + "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ + \ + : [p3_l] 
"=&r" (p3_l), [p2_l] "=&r" (p2_l), \ + [p1_l] "=&r" (p1_l), [p0_l] "=&r" (p0_l), \ + [q0_l] "=&r" (q0_l), [q1_l] "=&r" (q1_l), \ + [q2_l] "=&r" (q2_l), [q3_l] "=&r" (q3_l) \ + : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \ + [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \ + ); \ +} + +#define PACK_LEFT_4TO7() { \ + __asm__ __volatile__ ( \ + "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ + "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ + "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ + "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ + "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ + "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ + "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ + "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ + \ + : [p7_l] "=&r" (p7_l), [p6_l] "=&r" (p6_l), \ + [p5_l] "=&r" (p5_l), [p4_l] "=&r" (p4_l), \ + [q4_l] "=&r" (q4_l), [q5_l] "=&r" (q5_l), \ + [q6_l] "=&r" (q6_l), [q7_l] "=&r" (q7_l) \ + : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \ + [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \ + ); \ +} + +#define PACK_RIGHT_0TO3() { \ + __asm__ __volatile__ ( \ + "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ + "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ + "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ + "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ + "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ + "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ + "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ + "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ + \ + : [p3_r] "=&r" (p3_r), [p2_r] "=&r" (p2_r), \ + [p1_r] "=&r" (p1_r), [p0_r] "=&r" (p0_r), \ + [q0_r] "=&r" (q0_r), [q1_r] "=&r" (q1_r), \ + [q2_r] "=&r" (q2_r), [q3_r] "=&r" (q3_r) \ + : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \ + [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \ + ); \ +} + +#define PACK_RIGHT_4TO7() { \ + __asm__ __volatile__ ( \ + "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ + "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ + "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ + "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ + "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ + "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ + "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ + "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ + \ + : [p7_r] "=&r" (p7_r), [p6_r] "=&r" (p6_r), \ + [p5_r] "=&r" (p5_r), [p4_r] "=&r" (p4_r), \ + [q4_r] "=&r" (q4_r), [q5_r] "=&r" (q5_r), \ + [q6_r] "=&r" (q6_r), [q7_r] "=&r" (q7_r) \ + : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \ + [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \ + ); \ +} + +#define COMBINE_LEFT_RIGHT_0TO2() { \ + __asm__ __volatile__ ( \ + "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ + "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ + "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ + "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ + "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ + "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ + \ + : [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), \ + [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2) \ + : [p2_l] "r" (p2_l), [p2_r] "r" (p2_r), \ + [p1_l] "r" (p1_l), [p1_r] "r" (p1_r), \ + [p0_l] "r" (p0_l), [p0_r] "r" (p0_r), \ + [q0_l] "r" (q0_l), [q0_r] "r" (q0_r), \ + [q1_l] "r" (q1_l), [q1_r] "r" (q1_r), \ + [q2_l] "r" (q2_l), [q2_r] "r" (q2_r) \ + ); \ +} + +#define COMBINE_LEFT_RIGHT_3TO6() { \ + __asm__ __volatile__ ( \ + "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ + "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ + "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ + "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ + "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ + "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ + "precr.qb.ph %[q5], %[q5_l], %[q5_r] 
\n\t" \ + "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ + \ + : [p6] "=&r" (p6),[p5] "=&r" (p5), \ + [p4] "=&r" (p4),[p3] "=&r" (p3), \ + [q3] "=&r" (q3),[q4] "=&r" (q4), \ + [q5] "=&r" (q5),[q6] "=&r" (q6) \ + : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), \ + [p4_l] "r" (p4_l), [p3_l] "r" (p3_l), \ + [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), \ + [p4_r] "r" (p4_r), [p3_r] "r" (p3_r), \ + [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), \ + [q5_l] "r" (q5_l), [q6_l] "r" (q6_l), \ + [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), \ + [q5_r] "r" (q5_r), [q6_r] "r" (q6_r) \ + ); \ +} + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h new file mode 100644 index 00000000000..5b0d9cc9b94 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h @@ -0,0 +1,373 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ +#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_onyxc_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +/* processing 4 pixels at the same time + * compute hev and mask in the same function */ +static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, + uint32_t p1, uint32_t p0, + uint32_t p3, uint32_t p2, + uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, + uint32_t thresh, uint32_t *hev, + uint32_t *mask) { + uint32_t c, r, r3, r_k; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t hev1; + + __asm__ __volatile__ ( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], 
%[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r" (c), [r_k] "=&r" (r_k), + [r] "=&r" (r), [r3] "=&r" (r3) + : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2), + [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0), + [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh) + ); + + __asm__ __volatile__ ( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1), + [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3) + : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3), + [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit) + ); + + *hev = hev1; + *mask = s2; +} + +static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit, + uint32_t flimit, + uint32_t thresh, + uint32_t p1, uint32_t p0, + uint32_t p3, uint32_t p2, + uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, + uint32_t *hev, + uint32_t *mask, + uint32_t *flat) { + uint32_t c, r, r3, r_k, r_flat; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t hev1; + uint32_t flat1; + + __asm__ __volatile__ ( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + * flat |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + * flat |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] 
\n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + /* look at stall here */ + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" (r3), + [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1) + : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2), + [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0), + [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh), + [flat_thresh] "r" (flat_thresh), [ones] "r" (ones) + ); + + __asm__ __volatile__ ( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1), + [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3) + : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3), + [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit) + ); + + *hev = hev1; + *mask = s2; + *flat = flat1; +} + +static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3, + uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, + uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t q4, + uint32_t *flat2) { + uint32_t c, r, r_k, r_flat; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t flat1, flat3; + + __asm__ __volatile__ ( + /* flat |= (abs(p4 - p0) > thresh) */ + "subu_s.qb %[c], %[p4], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* flat |= (abs(q4 - q0) > thresh) */ + "subu_s.qb %[c], %[q4], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" 
+ "wrdsp %[r] \n\t" + "pick.qb %[flat3], $0, %[ones] \n\t" + + /* flat |= (abs(p1 - p0) > thresh) */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* flat |= (abs(q1 - q0) > thresh) */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ + "and %[flat1], %[flat3], %[flat1] \n\t" + + : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), + [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3) + : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), + [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1), + [q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4), + [flat_thresh] "r" (flat_thresh), [ones] "r" (ones) + ); + + *flat2 = flat1; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c new file mode 100644 index 00000000000..7cd0b632bf7 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c @@ -0,0 +1,652 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h" + +#if HAVE_DSPR2 +void vp9_lpf_horizontal_8_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + uint32_t mask; + uint32_t hev, flat; + uint8_t i; + uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p3, p2, p1, p0, q0, q1, q2, q3; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__ ( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), + [limit_vec] "=r" (limit_vec) + : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) + ); + + /* prefetch data for store */ + vp9_prefetch_store(s); + + for (i = 0; i < 2; i++) { + sp3 = s - (pitch << 2); + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + + __asm__ __volatile__ ( + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + + : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), + [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0) + : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0) + ); + + vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, + p1, p0, p3, p2, q0, q1, q2, q3, + &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__ ( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__ ( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), + [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if ((flat != 0) && 
(mask != 0)) { + /* filtering */ + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), + [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), + [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask 
& 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), + [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + } + + s = s + 4; + } +} + +void vp9_lpf_vertical_8_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + uint8_t i; + uint32_t mask, hev, flat; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p3, p2, p1, p0, q3, q2, q1, q0; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__ ( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), + [limit_vec] "=r" (limit_vec) + : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) + ); + + vp9_prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__ ( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + + : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), + [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2), [q3] "=&r" (q3) + : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) + ); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 
p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, + p1, p0, p3, p2, q0, q1, q2, q3, + &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat != 0) && (mask != 0)) { + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [s4] "r" (s4) + ); + } else if (mask & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb 
%[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s4] "r" (s4) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), + [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [s3] "r" (s3) + ); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s3] "r" (s3) + ); + } + + __asm__ __volatile__ ( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), + [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [s2] "r" (s2) + ); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s2] "r" (s2) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), + [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [s1] 
"r" (s1) + ); + } else if (mask & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), + [q1_f0] "r" (q1_f0), [s1] "r" (s1) + ); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c new file mode 100644 index 00000000000..6c946742e90 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c @@ -0,0 +1,795 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h" + +#if HAVE_DSPR2 +void vp9_lpf_horizontal_16_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + uint32_t mask; + uint32_t hev, flat, flat2; + uint8_t i; + uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; + uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__ ( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), + [limit_vec] "=r" (limit_vec) + : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) + ); + + /* prefetch data for store */ + vp9_prefetch_store(s); + + for (i = 0; i < (2 * count); i++) { + sp7 = s - (pitch << 3); + sp6 = sp7 + pitch; + sp5 = sp6 + pitch; + sp4 = sp5 + pitch; + sp3 = sp4 + pitch; + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + sq4 = sq3 + pitch; + sq5 = sq4 + pitch; + sq6 = sq5 + pitch; + sq7 = sq6 + pitch; + + __asm__ __volatile__ ( + "lw %[p7], (%[sp7]) \n\t" + "lw %[p6], (%[sp6]) \n\t" + "lw %[p5], (%[sp5]) \n\t" + "lw %[p4], (%[sp4]) \n\t" + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], 
(%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + + : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), + [p7] "=&r" (p7), [p6] "=&r" (p6), [p5] "=&r" (p5), [p4] "=&r" (p4) + : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7) + ); + + __asm__ __volatile__ ( + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + "lw %[q4], (%[sq4]) \n\t" + "lw %[q5], (%[sq5]) \n\t" + "lw %[q6], (%[sq6]) \n\t" + "lw %[q7], (%[sq7]) \n\t" + + : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0), + [q7] "=&r" (q7), [q6] "=&r" (q6), [q5] "=&r" (q5), [q4] "=&r" (q4) + : [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0), + [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7) + ); + + vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, + p1, p0, p3, p2, q0, q1, q2, q3, + &hev, &mask, &flat); + + vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__ ( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, + &p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l, + &q4_l, &q5_l, &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, + &p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r, + &q4_r, &q5_r, &q6_r, &q7_r); + + COMBINE_LEFT_RIGHT_0TO2() + COMBINE_LEFT_RIGHT_3TO6() + + __asm__ __volatile__ ( + "sw %[p6], (%[sp6]) \n\t" + "sw %[p5], (%[sp5]) \n\t" + "sw %[p4], (%[sp4]) \n\t" + "sw %[p3], (%[sp3]) \n\t" + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + + : + : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), + [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), + [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) + ); + + __asm__ __volatile__ ( + "sw %[q6], (%[sq6]) \n\t" + "sw %[q5], (%[sq5]) \n\t" + "sw %[q4], (%[sq4]) \n\t" + "sw %[q3], (%[sq3]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + + : + : [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), [q3] "r" (q3), + [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0), + [sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3), + [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0) + ); + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__ ( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + 
"sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), + [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0+f1 */ + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), + [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r" 
(p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), + [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 + f2 */ + /* f0 function */ + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* f1 function */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, + q0_l, q1_l, q2_l, q3_l, + &p2_l_f1, &p1_l_f1, &p0_l_f1, + &q0_l_f1, &q1_l_f1, &q2_l_f1); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, + q0_r, q1_r, q2_r, q3_r, + &p2_r_f1, &p1_r_f1, &p0_r_f1, + &q0_r_f1, &q1_r_f1, &q2_r_f1); + + /* f2 function */ + PACK_LEFT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, + &p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l, + &q4_l, &q5_l, &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, + &p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r, + &q4_r, &q5_r, &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p6_r], (%[sp6]) \n\t" + "sb %[p5_r], (%[sp5]) \n\t" + "sb %[p4_r], (%[sp4]) \n\t" + "sb %[p3_r], (%[sp3]) \n\t" + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + + : + : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), + [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), + [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), + [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), + [p0_r] "r" (p0_r), [sp0] "r" (sp0) + ); + + __asm__ __volatile__ 
( + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + "sb %[q3_r], (%[sq3]) \n\t" + "sb %[q4_r], (%[sq4]) \n\t" + "sb %[q5_r], (%[sq5]) \n\t" + "sb %[q6_r], (%[sq6]) \n\t" + + : + : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), + [q6_r] "r" (q6_r), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2), + [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5), + [sq6] "r" (sq6) + ); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p2_r_f1], (%[sp2]) \n\t" + "sb %[p1_r_f1], (%[sp1]) \n\t" + "sb %[p0_r_f1], (%[sp0]) \n\t" + "sb %[q0_r_f1], (%[sq0]) \n\t" + "sb %[q1_r_f1], (%[sq1]) \n\t" + "sb %[q2_r_f1], (%[sq2]) \n\t" + + : + : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), + [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), + [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), + [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), + [q3_r] "+r" (q3_r), [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r), + [p6_r] "+r" (p6_r), [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r), + [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), + [q6_r] "+r" (q6_r), [p0_r] "+r" (p0_r) + : + ); + + __asm__ __volatile__ ( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1), + [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1), + [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p6_r], +1(%[sp6]) \n\t" + "sb %[p5_r], +1(%[sp5]) \n\t" + "sb %[p4_r], +1(%[sp4]) \n\t" + "sb %[p3_r], +1(%[sp3]) \n\t" + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + + : + : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), + [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), + [p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5), + [sp4] "r" (sp4), [sp3] "r" (sp3), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) + ); + + __asm__ __volatile__ ( + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + 
"sb %[q3_r], +1(%[sq3]) \n\t" + "sb %[q4_r], +1(%[sq4]) \n\t" + "sb %[q5_r], +1(%[sq5]) \n\t" + "sb %[q6_r], +1(%[sq6]) \n\t" + + : + : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), + [q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1), + [sq2] "r" (sq2), [sq3] "r" (sq3), + [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6) + ); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p2_r_f1], +1(%[sp2]) \n\t" + "sb %[p1_r_f1], +1(%[sp1]) \n\t" + "sb %[p0_r_f1], +1(%[sp0]) \n\t" + "sb %[q0_r_f1], +1(%[sq0]) \n\t" + "sb %[q1_r_f1], +1(%[sq1]) \n\t" + "sb %[q2_r_f1], +1(%[sq2]) \n\t" + + : + : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), + [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), + [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), + [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p6_l], +2(%[sp6]) \n\t" + "sb %[p5_l], +2(%[sp5]) \n\t" + "sb %[p4_l], +2(%[sp4]) \n\t" + "sb %[p3_l], +2(%[sp3]) \n\t" + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + + : + : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), + [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), + [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5), + [sp4] "r" (sp4), [sp3] "r" (sp3), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) + ); + + __asm__ __volatile__ ( + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + "sb %[q3_l], +2(%[sq3]) \n\t" + "sb %[q4_l], +2(%[sq4]) \n\t" + "sb %[q5_l], +2(%[sq5]) \n\t" + "sb %[q6_l], +2(%[sq6]) \n\t" + + : + : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), + [q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1), + [sq2] "r" (sq2), [sq3] "r" (sq3), + [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6) + ); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p2_l_f1], +2(%[sp2]) \n\t" + "sb %[p1_l_f1], +2(%[sp1]) \n\t" + "sb %[p0_l_f1], +2(%[sp0]) \n\t" + "sb %[q0_l_f1], +2(%[sq0]) \n\t" + "sb %[q1_l_f1], +2(%[sq1]) \n\t" + "sb %[q2_l_f1], +2(%[sq2]) \n\t" + + : + : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), + [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), + [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), + [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), + 
[sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), + [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l), + [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l), + [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l), + [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l) + : + ); + + __asm__ __volatile__ ( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1), + [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1), + [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p6_l], +3(%[sp6]) \n\t" + "sb %[p5_l], +3(%[sp5]) \n\t" + "sb %[p4_l], +3(%[sp4]) \n\t" + "sb %[p3_l], +3(%[sp3]) \n\t" + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + + : + : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), + [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), + [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5), + [sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2), + [sp1] "r" (sp1), [sp0] "r" (sp0) + ); + + __asm__ __volatile__ ( + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + "sb %[q3_l], +3(%[sq3]) \n\t" + "sb %[q4_l], +3(%[sq4]) \n\t" + "sb %[q5_l], +3(%[sq5]) \n\t" + "sb %[q6_l], +3(%[sq6]) \n\t" + + : + : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), + [q2_l] "r" (q2_l), [q3_l] "r" (q3_l), + [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2), + [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5), + [q6_l] "r" (q6_l), [sq6] "r" (sq6) + ); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p2_l_f1], +3(%[sp2]) \n\t" + "sb %[p1_l_f1], +3(%[sp1]) \n\t" + "sb %[p0_l_f1], +3(%[sp0]) \n\t" + "sb %[q0_l_f1], +3(%[sq0]) \n\t" + "sb %[q1_l_f1], +3(%[sq1]) \n\t" + "sb %[q2_l_f1], +3(%[sq2]) \n\t" + + : + : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), + [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), + [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + } + + s = s + 4; + } 
+} +#endif // #if HAVE_DSPR2 diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c new file mode 100644 index 00000000000..851fc6c2d7e --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c @@ -0,0 +1,840 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h" + +#if HAVE_DSPR2 +void vp9_lpf_vertical_16_dspr2(uint8_t *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat, flat2; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__ ( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), + [limit_vec] "=r" (limit_vec) + : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) + ); + + vp9_prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__ ( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[p4], -8(%[s1]) \n\t" + "lw %[p5], -8(%[s2]) \n\t" + "lw %[p6], -8(%[s3]) \n\t" + "lw %[p7], -8(%[s4]) \n\t" + + : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), + [p0] "=&r" (p0), [p7] "=&r" (p7), [p6] "=&r" (p6), + [p5] "=&r" (p5), [p4] "=&r" (p4) + : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) + ); + + __asm__ __volatile__ ( + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + "lw %[q7], +4(%[s1]) \n\t" + "lw %[q6], +4(%[s2]) \n\t" + "lw %[q5], +4(%[s3]) \n\t" + "lw %[q4], +4(%[s4]) \n\t" + + : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), + [q0] "=&r" (q0), [q7] "=&r" (q7), [q6] "=&r" (q6), + [q5] "=&r" (q5), [q4] "=&r" (q4) + : [s1] "r" 
(s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) + ); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + /* transpose p7, p6, p5, p4 + original (when loaded from memory) + register -8 -7 -6 -5 + p4 p4_0 p4_1 p4_2 p4_3 + p5 p5_0 p5_1 p5_2 p5_3 + p6 p6_0 p6_1 p6_2 p6_3 + p7 p7_0 p7_1 p7_2 p7_3 + + after transpose + register + p4 p7_3 p6_3 p5_3 p4_3 + p5 p7_2 p6_2 p5_2 p4_2 + p6 p7_1 p6_1 p5_1 p4_1 + p7 p7_0 p6_0 p5_0 p4_0 + */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p4], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" + "precr.qb.ph %[prim4], %[p6], %[p7] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p4], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p6], %[p7], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p7], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [p4] "+r" (p4), [p5] "+r" (p5), [p6] "+r" (p6), [p7] "+r" (p7), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + /* transpose q4, q5, q6, q7 + original (when loaded from memory) + register +5 +6 +7 +8 + q7 q7_0 q7_1 q7_2 q7_3 + q6 q6_0 q6_1 q6_2 q6_3 + q5 q5_0 q5_1 q5_2 q5_3 + q4 q4_0 q4_1 q4_2 
q4_3 + + after transpose + register + q7 q4_3 q5_3 q6_3 q7_3 + q6 q4_2 q5_2 q6_2 q7_2 + q5 q4_1 q5_1 q6_1 q7_1 + q4 q4_0 q5_0 q6_0 q7_0 + */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" + "precr.qb.ph %[prim2], %[q7], %[q6] \n\t" + "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" + "precr.qb.ph %[prim4], %[q5], %[q4] \n\t" + + "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q7], %[q6], %[sec3] \n\t" + "precrq.ph.w %[q5], %[q4], %[sec4] \n\t" + "append %[q6], %[sec3], 16 \n\t" + "append %[q4], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [q7] "+r" (q7), [q6] "+r" (q6), [q5] "+r" (q5), [q4] "+r" (q4), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, + p1, p0, p3, p2, q0, q1, q2, q3, + &hev, &mask, &flat); + + vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, + &p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l, + &q4_l, &q5_l, &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, + &p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r, + &q4_r, &q5_r, &q6_r, &q7_r); + + STORE_F2() + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 */ + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [s4] "r" (s4) + ); + } else if (mask & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s4] "r" (s4) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl
%[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), + [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [s3] "r" (s3) + ); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s3] "r" (s3) + ); + } + + __asm__ __volatile__ ( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [s2] "r" (s2) + ); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s2] "r" (s2) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), + [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [s1] "r" (s1) + ); + } else if (mask & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s1] "r" (s1) + ); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0+f1+f2 */ + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + PACK_LEFT_0TO3() + vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, + q0_l, q1_l, q2_l, q3_l, + &p2_l_f1, &p1_l_f1, &p0_l_f1, + &q0_l_f1, 
&q1_l_f1, &q2_l_f1); + + PACK_RIGHT_0TO3() + vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, + q0_r, q1_r, q2_r, q3_r, + &p2_r_f1, &p1_r_f1, &p0_r_f1, + &q0_r_f1, &q1_r_f1, &q2_r_f1); + + PACK_LEFT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, + &p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l, + &q4_l, &q5_l, &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, + &p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r, + &q4_r, &q5_r, &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p6_r], -7(%[s4]) \n\t" + "sb %[p5_r], -6(%[s4]) \n\t" + "sb %[p4_r], -5(%[s4]) \n\t" + "sb %[p3_r], -4(%[s4]) \n\t" + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + + : + : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), + [p4_r] "r" (p4_r), [p3_r] "r" (p3_r), + [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), + [p0_r] "r" (p0_r), [s4] "r" (s4) + ); + + __asm__ __volatile__ ( + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + "sb %[q3_r], +3(%[s4]) \n\t" + "sb %[q4_r], +4(%[s4]) \n\t" + "sb %[q5_r], +5(%[s4]) \n\t" + "sb %[q6_r], +6(%[s4]) \n\t" + + : + : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), + [q2_r] "r" (q2_r), [q3_r] "r" (q3_r), + [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), + [q6_r] "r" (q6_r), [s4] "r" (s4) + ); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p2_r_f1], -3(%[s4]) \n\t" + "sb %[p1_r_f1], -2(%[s4]) \n\t" + "sb %[p0_r_f1], -1(%[s4]) \n\t" + "sb %[q0_r_f1], (%[s4]) \n\t" + "sb %[q1_r_f1], +1(%[s4]) \n\t" + "sb %[q2_r_f1], +2(%[s4]) \n\t" + + : + : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), + [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), + [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), + [s4] "r" (s4) + ); + } else if (mask & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s4] "r" (s4) + ); + } + + __asm__ __volatile__ ( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), + [q2_r] "+r" (q2_r), [q3_r] "+r" (q3_r), + [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r), + [q6_r] "+r" (q6_r), [p6_r] "+r" (p6_r), + [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r), + [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), + [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r) + : + ); + + __asm__ __volatile__ ( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1), + [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1), + [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" 
(p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p6_r], -7(%[s3]) \n\t" + "sb %[p5_r], -6(%[s3]) \n\t" + "sb %[p4_r], -5(%[s3]) \n\t" + "sb %[p3_r], -4(%[s3]) \n\t" + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + + : + : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), + [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), + [p0_r] "r" (p0_r), [s3] "r" (s3) + ); + + __asm__ __volatile__ ( + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + "sb %[q3_r], +3(%[s3]) \n\t" + "sb %[q4_r], +4(%[s3]) \n\t" + "sb %[q5_r], +5(%[s3]) \n\t" + "sb %[q6_r], +6(%[s3]) \n\t" + + : + : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), + [q2_r] "r" (q2_r), [q3_r] "r" (q3_r), + [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), + [q6_r] "r" (q6_r), [s3] "r" (s3) + ); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p2_r_f1], -3(%[s3]) \n\t" + "sb %[p1_r_f1], -2(%[s3]) \n\t" + "sb %[p0_r_f1], -1(%[s3]) \n\t" + "sb %[q0_r_f1], (%[s3]) \n\t" + "sb %[q1_r_f1], +1(%[s3]) \n\t" + "sb %[q2_r_f1], +2(%[s3]) \n\t" + + : + : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), + [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), + [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), + [s3] "r" (s3) + ); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s3] "r" (s3) + ); + } + + __asm__ __volatile__ ( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p6_l], -7(%[s2]) \n\t" + "sb %[p5_l], -6(%[s2]) \n\t" + "sb %[p4_l], -5(%[s2]) \n\t" + "sb %[p3_l], -4(%[s2]) \n\t" + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + + : + : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), + [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), + [p0_l] "r" (p0_l), [s2] "r" (s2) + ); + + __asm__ __volatile__ ( + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + "sb %[q3_l], +3(%[s2]) \n\t" + "sb %[q4_l], +4(%[s2]) \n\t" + "sb %[q5_l], +5(%[s2]) \n\t" + "sb %[q6_l], +6(%[s2]) \n\t" + + : + : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), + [q6_l] "r" (q6_l), [s2] "r" (s2) + ); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p2_l_f1], -3(%[s2]) \n\t" + "sb %[p1_l_f1], -2(%[s2]) \n\t" + "sb %[p0_l_f1], -1(%[s2]) \n\t" + "sb %[q0_l_f1], (%[s2]) \n\t" + "sb %[q1_l_f1], +1(%[s2]) \n\t" + "sb %[q2_l_f1], +2(%[s2]) \n\t" + + : + : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), + [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), + [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), + [s2] "r" (s2) + ); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + 
[s2] "r" (s2) + ); + } + + __asm__ __volatile__ ( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), + [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l), + [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l), + [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l), + [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l) + : + ); + + __asm__ __volatile__ ( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1), + [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1), + [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p6_l], -7(%[s1]) \n\t" + "sb %[p5_l], -6(%[s1]) \n\t" + "sb %[p4_l], -5(%[s1]) \n\t" + "sb %[p3_l], -4(%[s1]) \n\t" + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + + : + : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), + [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), + [p0_l] "r" (p0_l), + [s1] "r" (s1) + ); + + __asm__ __volatile__ ( + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], 1(%[s1]) \n\t" + "sb %[q2_l], 2(%[s1]) \n\t" + "sb %[q3_l], 3(%[s1]) \n\t" + "sb %[q4_l], 4(%[s1]) \n\t" + "sb %[q5_l], 5(%[s1]) \n\t" + "sb %[q6_l], 6(%[s1]) \n\t" + + : + : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), + [q6_l] "r" (q6_l), + [s1] "r" (s1) + ); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p2_l_f1], -3(%[s1]) \n\t" + "sb %[p1_l_f1], -2(%[s1]) \n\t" + "sb %[p0_l_f1], -1(%[s1]) \n\t" + "sb %[q0_l_f1], (%[s1]) \n\t" + "sb %[q1_l_f1], +1(%[s1]) \n\t" + "sb %[q2_l_f1], +2(%[s1]) \n\t" + + : + : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), + [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), + [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), + [s1] "r" (s1) + ); + } else if (mask & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s1] "r" (s1) + ); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c index 0d65651f087..f44ada1b9c1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c @@ -8,136 +8,208 @@ * be found in the AUTHORS file 
in the root of the source tree. */ - #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" -#include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_systemdependent.h" -void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) { - const int stride = cm->mode_info_stride; +static void clear_mi_border(const VP9_COMMON *cm, MODE_INFO *mi) { int i; - // Clear down top border row - vpx_memset(mi, 0, sizeof(MODE_INFO) * stride); + // Top border row + vpx_memset(mi, 0, sizeof(*mi) * cm->mi_stride); - // Clear left border column - for (i = 1; i < cm->mi_rows + 1; i++) - vpx_memset(&mi[i * stride], 0, sizeof(MODE_INFO)); + // Left border column + for (i = 1; i < cm->mi_rows + 1; ++i) + vpx_memset(&mi[i * cm->mi_stride], 0, sizeof(*mi)); } -void vp9_free_frame_buffers(VP9_COMMON *cm) { - int i; +static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { + cm->mi_cols = aligned_width >> MI_SIZE_LOG2; + cm->mi_rows = aligned_height >> MI_SIZE_LOG2; + cm->mi_stride = cm->mi_cols + MI_BLOCK_SIZE; - for (i = 0; i < NUM_YV12_BUFFERS; i++) - vp9_free_frame_buffer(&cm->yv12_fb[i]); + cm->mb_cols = (cm->mi_cols + 1) >> 1; + cm->mb_rows = (cm->mi_rows + 1) >> 1; + cm->MBs = cm->mb_rows * cm->mb_cols; +} - vp9_free_frame_buffer(&cm->post_proc_buffer); +static void setup_mi(VP9_COMMON *cm) { + cm->mi = cm->mip + cm->mi_stride + 1; + cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; + + vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); + + vpx_memset(cm->mi_grid_base, 0, cm->mi_stride * (cm->mi_rows + 1) * + sizeof(*cm->mi_grid_base)); + + clear_mi_border(cm, cm->prev_mip); +} + +static int alloc_mi(VP9_COMMON *cm, int mi_size) { + cm->mip = (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip)); + if (cm->mip == NULL) + return 1; + + cm->prev_mip = (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->prev_mip)); + if (cm->prev_mip == NULL) + return 1; + + cm->mi_grid_base = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); + if (cm->mi_grid_base == NULL) + return 1; + + cm->prev_mi_grid_base = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); + if (cm->prev_mi_grid_base == NULL) + return 1; + + return 0; +} +static void free_mi(VP9_COMMON *cm) { vpx_free(cm->mip); vpx_free(cm->prev_mip); - vpx_free(cm->last_frame_seg_map); vpx_free(cm->mi_grid_base); vpx_free(cm->prev_mi_grid_base); cm->mip = NULL; cm->prev_mip = NULL; - cm->last_frame_seg_map = NULL; cm->mi_grid_base = NULL; cm->prev_mi_grid_base = NULL; } -static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { - cm->mi_cols = aligned_width >> MI_SIZE_LOG2; - cm->mi_rows = aligned_height >> MI_SIZE_LOG2; - cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE; +void vp9_free_frame_buffers(VP9_COMMON *cm) { + int i; - cm->mb_cols = (cm->mi_cols + 1) >> 1; - cm->mb_rows = (cm->mi_rows + 1) >> 1; - cm->MBs = cm->mb_rows * cm->mb_cols; + for (i = 0; i < FRAME_BUFFERS; ++i) { + vp9_free_frame_buffer(&cm->frame_bufs[i].buf); + + if (cm->frame_bufs[i].ref_count > 0 && + cm->frame_bufs[i].raw_frame_buffer.data != NULL) { + cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer); + cm->frame_bufs[i].ref_count = 0; + } + } + + vp9_free_frame_buffer(&cm->post_proc_buffer); + + 
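+ /* free_mi() below releases the four mode-info arrays (mip, prev_mip and both grids); the frees that follow it cover the per-frame context buffers (last_frame_seg_map, above_context, above_seg_context) that the alloc/resize paths create alongside them. */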
free_mi(cm); + + vpx_free(cm->last_frame_seg_map); + cm->last_frame_seg_map = NULL; + + vpx_free(cm->above_context); + cm->above_context = NULL; + + vpx_free(cm->above_seg_context); + cm->above_seg_context = NULL; } -static void setup_mi(VP9_COMMON *cm) { - cm->mi = cm->mip + cm->mode_info_stride + 1; - cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; - cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; - cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; +int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) { + const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); + const int ss_x = cm->subsampling_x; + const int ss_y = cm->subsampling_y; + + if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, + VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0) + goto fail; + + set_mb_mi(cm, aligned_width, aligned_height); + + free_mi(cm); + if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE))) + goto fail; + + setup_mi(cm); - vpx_memset(cm->mip, 0, - cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + // Create the segmentation map structure and set to 0. + vpx_free(cm->last_frame_seg_map); + cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1); + if (!cm->last_frame_seg_map) + goto fail; - vpx_memset(cm->mi_grid_base, 0, - cm->mode_info_stride * (cm->mi_rows + 1) * - sizeof(*cm->mi_grid_base)); + vpx_free(cm->above_context); + cm->above_context = + (ENTROPY_CONTEXT *)vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) * + MAX_MB_PLANE, + sizeof(*cm->above_context)); + if (!cm->above_context) + goto fail; - vp9_update_mode_info_border(cm, cm->mip); - vp9_update_mode_info_border(cm, cm->prev_mip); + vpx_free(cm->above_seg_context); + cm->above_seg_context = + (PARTITION_CONTEXT *)vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols), + sizeof(*cm->above_seg_context)); + if (!cm->above_seg_context) + goto fail; + + return 0; + + fail: + vp9_free_frame_buffers(cm); + return 1; } int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { - int i; - const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); const int ss_x = cm->subsampling_x; const int ss_y = cm->subsampling_y; - int mi_size; + int i; vp9_free_frame_buffers(cm); - for (i = 0; i < NUM_YV12_BUFFERS; i++) { - cm->fb_idx_ref_cnt[i] = 0; - if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) + for (i = 0; i < FRAME_BUFFERS; i++) { + cm->frame_bufs[i].ref_count = 0; + if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height, + ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0) goto fail; } - cm->new_fb_idx = NUM_YV12_BUFFERS - 1; - cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1; - - for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++) - cm->active_ref_idx[i] = i; + cm->new_fb_idx = FRAME_BUFFERS - 1; + cm->frame_bufs[cm->new_fb_idx].ref_count = 1; - for (i = 0; i < NUM_REF_FRAMES; i++) { + for (i = 0; i < REF_FRAMES; i++) { cm->ref_frame_map[i] = i; - cm->fb_idx_ref_cnt[i] = 1; + cm->frame_bufs[i].ref_count = 1; } if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) + VP9_ENC_BORDER_IN_PIXELS) < 0) goto fail; set_mb_mi(cm, aligned_width, aligned_height); - // Allocation - mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE); - - cm->mip = vpx_calloc(mi_size, 
sizeof(MODE_INFO)); - if (!cm->mip) + if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE))) goto fail; - cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!cm->prev_mip) - goto fail; + setup_mi(cm); - cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); - if (!cm->mi_grid_base) + // Create the segmentation map structure and set to 0. + cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1); + if (!cm->last_frame_seg_map) goto fail; - cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); - if (!cm->prev_mi_grid_base) + cm->above_context = + (ENTROPY_CONTEXT *)vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) * + MAX_MB_PLANE, + sizeof(*cm->above_context)); + if (!cm->above_context) goto fail; - setup_mi(cm); - - // Create the segmentation map structure and set to 0. - cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1); - if (!cm->last_frame_seg_map) + cm->above_seg_context = + (PARTITION_CONTEXT *)vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols), + sizeof(*cm->above_seg_context)); + if (!cm->above_seg_context) goto fail; return 0; @@ -147,22 +219,9 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { return 1; } -void vp9_create_common(VP9_COMMON *cm) { - vp9_machine_specific_config(cm); - - cm->tx_mode = ONLY_4X4; - cm->comp_pred_mode = HYBRID_PREDICTION; -} - void vp9_remove_common(VP9_COMMON *cm) { vp9_free_frame_buffers(cm); -} - -void vp9_initialize_common() { - vp9_init_neighbors(); - vp9_coef_tree_initialize(); - vp9_entropy_mode_init(); - vp9_entropy_mv_init(); + vp9_free_internal_frame_buffers(&cm->int_frame_buffers); } void vp9_update_frame_size(VP9_COMMON *cm) { @@ -176,3 +235,19 @@ void vp9_update_frame_size(VP9_COMMON *cm) { if (cm->last_frame_seg_map) vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); } + +void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { + // Current mip will be the prev_mip for the next frame. + MODE_INFO *temp = cm->prev_mip; + MODE_INFO **temp2 = cm->prev_mi_grid_base; + cm->prev_mip = cm->mip; + cm->mip = temp; + cm->prev_mi_grid_base = cm->mi_grid_base; + cm->mi_grid_base = temp2; + + // Update the upper left visible macroblock ptrs. 
+ cm->mi = cm->mip + cm->mi_stride + 1; + cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h index 5d5fae99306..06636a905b2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h @@ -12,19 +12,26 @@ #ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_ #define VP9_COMMON_VP9_ALLOCCOMMON_H_ -#include "vp9/common/vp9_onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif -void vp9_initialize_common(); +struct VP9Common; -void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi); +void vp9_remove_common(struct VP9Common *cm); -void vp9_create_common(VP9_COMMON *cm); -void vp9_remove_common(VP9_COMMON *cm); +int vp9_resize_frame_buffers(struct VP9Common *cm, int width, int height); -int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height); -void vp9_free_frame_buffers(VP9_COMMON *cm); +int vp9_alloc_frame_buffers(struct VP9Common *cm, int width, int height); +void vp9_free_frame_buffers(struct VP9Common *cm); -void vp9_update_frame_size(VP9_COMMON *cm); +void vp9_update_frame_size(struct VP9Common *cm); + +void vp9_swap_mi_and_prev_mi(struct VP9Common *cm); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_COMMON_VP9_ALLOCCOMMON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c new file mode 100644 index 00000000000..43d6c6ef560 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_blockd.h" + +PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b) { + if (b == 0 || b == 2) { + if (!left_mi || is_inter_block(&left_mi->mbmi)) + return DC_PRED; + + return get_y_mode(left_mi, b + 1); + } else { + assert(b == 1 || b == 3); + return cur_mi->bmi[b - 1].as_mode; + } +} + +PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b) { + if (b == 0 || b == 1) { + if (!above_mi || is_inter_block(&above_mi->mbmi)) + return DC_PRED; + + return get_y_mode(above_mi, b + 2); + } else { + assert(b == 2 || b == 3); + return cur_mi->bmi[b - 2].as_mode; + } +} + +void vp9_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, + foreach_transformed_block_visitor visit, void *arg) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO* mbmi = &xd->mi[0]->mbmi; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. + const TX_SIZE tx_size = plane ? 
get_uv_tx_size(mbmi) + : mbmi->tx_size; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int step = 1 << (tx_size << 1); + int i; + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + int r, c; + + int max_blocks_wide = num_4x4_w; + int max_blocks_high = num_4x4_h; + + // xd->mb_to_right_edge is in units of pixels * 8. This converts + // it to 4x4 block sizes. + if (xd->mb_to_right_edge < 0) + max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + + if (xd->mb_to_bottom_edge < 0) + max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + i = 0; + // Unlike the normal case - in here we have to keep track of the + // row and column of the blocks we use so that we know if we are in + // the unrestricted motion border. + for (r = 0; r < num_4x4_h; r += (1 << tx_size)) { + for (c = 0; c < num_4x4_w; c += (1 << tx_size)) { + if (r < max_blocks_high && c < max_blocks_wide) + visit(plane, i, plane_bsize, tx_size, arg); + i += step; + } + } + } else { + for (i = 0; i < num_4x4_w * num_4x4_h; i += step) + visit(plane, i, plane_bsize, tx_size, arg); + } +} + +void vp9_foreach_transformed_block(const MACROBLOCKD* const xd, + BLOCK_SIZE bsize, + foreach_transformed_block_visitor visit, + void *arg) { + int plane; + + for (plane = 0; plane < MAX_MB_PLANE; plane++) + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); +} + +void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, + int aoff, int loff) { + ENTROPY_CONTEXT *const a = pd->above_context + aoff; + ENTROPY_CONTEXT *const l = pd->left_context + loff; + const int tx_size_in_blocks = 1 << tx_size; + + // above + if (has_eob && xd->mb_to_right_edge < 0) { + int i; + const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] + + (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + int above_contexts = tx_size_in_blocks; + if (above_contexts + aoff > blocks_wide) + above_contexts = blocks_wide - aoff; + + for (i = 0; i < above_contexts; ++i) + a[i] = has_eob; + for (i = above_contexts; i < tx_size_in_blocks; ++i) + a[i] = 0; + } else { + vpx_memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } + + // left + if (has_eob && xd->mb_to_bottom_edge < 0) { + int i; + const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] + + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + int left_contexts = tx_size_in_blocks; + if (left_contexts + loff > blocks_high) + left_contexts = blocks_high - loff; + + for (i = 0; i < left_contexts; ++i) + l[i] = has_eob; + for (i = left_contexts; i < tx_size_in_blocks; ++i) + l[i] = 0; + } else { + vpx_memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } +} + +void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y; + xd->plane[i].subsampling_x = i ? ss_x : 0; + xd->plane[i].subsampling_y = i ? 
ss_y : 0; + } +#if CONFIG_ALPHA + // TODO(jkoleszar): Using the Y w/h for now + xd->plane[3].plane_type = PLANE_TYPE_Y; + xd->plane[3].subsampling_x = 0; + xd->plane[3].subsampling_y = 0; +#endif +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h index bac40c52754..8ca356dd60f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h @@ -24,10 +24,14 @@ #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_scale.h" #include "vp9/common/vp9_seg_common.h" -#include "vp9/common/vp9_treecoder.h" -#define BLOCK_SIZE_GROUPS 4 -#define MBSKIP_CONTEXTS 3 +#ifdef __cplusplus +extern "C" { +#endif + +#define BLOCK_SIZE_GROUPS 4 +#define SKIP_CONTEXTS 3 +#define INTER_MODE_CONTEXTS 7 /* Segment Feature Masks */ #define MAX_MV_REF_CANDIDATES 2 @@ -37,8 +41,9 @@ #define REF_CONTEXTS 5 typedef enum { - PLANE_TYPE_Y_WITH_DC, - PLANE_TYPE_UV, + PLANE_TYPE_Y = 0, + PLANE_TYPE_UV = 1, + PLANE_TYPES } PLANE_TYPE; typedef char ENTROPY_CONTEXT; @@ -72,13 +77,9 @@ typedef enum { ZEROMV, NEWMV, MB_MODE_COUNT -} MB_PREDICTION_MODE; +} PREDICTION_MODE; -static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) { - return mode <= TM_PRED; -} - -static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { +static INLINE int is_inter_mode(PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEWMV; } @@ -86,16 +87,14 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { #define INTER_MODES (1 + NEWMV - NEARESTMV) -static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) { - return (mode - NEARESTMV); -} +#define INTER_OFFSET(mode) ((mode) - NEARESTMV) /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. */ typedef struct { - MB_PREDICTION_MODE as_mode; + PREDICTION_MODE as_mode; int_mv as_mv[2]; // first, second inter predictor motion vectors } b_mode_info; @@ -119,30 +118,25 @@ static INLINE int mi_width_log2(BLOCK_SIZE sb_type) { return mi_width_log2_lookup[sb_type]; } -static INLINE int mi_height_log2(BLOCK_SIZE sb_type) { - return mi_height_log2_lookup[sb_type]; -} - // This structure now relates to 8x8 block regions. typedef struct { - MB_PREDICTION_MODE mode, uv_mode; - MV_REFERENCE_FRAME ref_frame[2]; + // Common for both INTER and INTRA blocks + BLOCK_SIZE sb_type; + PREDICTION_MODE mode; TX_SIZE tx_size; - int_mv mv[2]; // for each reference frame used - int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; - int_mv best_mv[2]; + uint8_t skip; + uint8_t segment_id; + uint8_t seg_id_predicted; // valid only when temporal_update is enabled - uint8_t mode_context[MAX_REF_FRAMES]; - - unsigned char skip_coeff; // 0=need to decode coeffs, 1=no coefficients - unsigned char segment_id; // Segment id for this block. 
- - // Flags used for prediction status of various bit-stream signals - unsigned char seg_id_predicted; + // Only for INTRA blocks + PREDICTION_MODE uv_mode; - INTERPOLATION_TYPE interp_filter; - - BLOCK_SIZE sb_type; + // Only for INTER blocks + MV_REFERENCE_FRAME ref_frame[2]; + int_mv mv[2]; + int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; + uint8_t mode_context[MAX_REF_FRAMES]; + INTERP_FILTER interp_filter; } MB_MODE_INFO; typedef struct { @@ -150,6 +144,11 @@ typedef struct { b_mode_info bmi[4]; } MODE_INFO; +static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) { + return mi->mbmi.sb_type < BLOCK_8X8 ? mi->bmi[block].as_mode + : mi->mbmi.mode; +} + static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[0] > INTRA_FRAME; } @@ -158,6 +157,12 @@ static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[1] > INTRA_FRAME; } +PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b); + +PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b); + enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 @@ -175,33 +180,34 @@ struct buf_2d { }; struct macroblockd_plane { - DECLARE_ALIGNED(16, int16_t, qcoeff[64 * 64]); - DECLARE_ALIGNED(16, int16_t, dqcoeff[64 * 64]); - DECLARE_ALIGNED(16, uint16_t, eobs[256]); + int16_t *dqcoeff; PLANE_TYPE plane_type; int subsampling_x; int subsampling_y; struct buf_2d dst; struct buf_2d pre[2]; - int16_t *dequant; + const int16_t *dequant; ENTROPY_CONTEXT *above_context; ENTROPY_CONTEXT *left_context; }; #define BLOCK_OFFSET(x, i) ((x) + (i) * 16) +typedef struct RefBuffer { + // TODO(dkovalev): idx is not really required and should be removed, now it + // is used in vp9_onyxd_if.c + int idx; + YV12_BUFFER_CONFIG *buf; + struct scale_factors sf; +} RefBuffer; + typedef struct macroblockd { struct macroblockd_plane plane[MAX_MB_PLANE]; - struct scale_factors scale_factor[2]; - - MODE_INFO *last_mi; - int mode_info_stride; + int mi_stride; // A NULL indicates that the 8x8 is not part of the image - MODE_INFO **mi_8x8; - MODE_INFO **prev_mi_8x8; - MODE_INFO *mi_stream; + MODE_INFO **mi; int up_available; int left_available; @@ -212,22 +218,23 @@ typedef struct macroblockd { int mb_to_top_edge; int mb_to_bottom_edge; + /* pointers to reference frames */ + RefBuffer *block_refs[2]; + + /* pointer to current frame */ + const YV12_BUFFER_CONFIG *cur_buf; + + /* mc buffer */ + DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); + int lossless; /* Inverse transform function pointers. 
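In this tree the decoder is expected to point itxm_add at vp9_iwht4x4_add for lossless blocks and at vp9_idct4x4_add otherwise (an observation about expected usage, not something this diff adds).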
*/ void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); - struct subpix_fn_table subpix; - int corrupted; - unsigned char sb_index; // index of 32x32 block inside the 64x64 block - unsigned char mb_index; // index of 16x16 block inside the 32x32 block - unsigned char b_index; // index of 8x8 block inside the 16x16 block - unsigned char ab_index; // index of 4x4 block inside the 8x8 block + DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]); - int q_index; - - /* Y,U,V,(A) */ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; @@ -235,184 +242,74 @@ typedef struct macroblockd { PARTITION_CONTEXT left_seg_context[8]; } MACROBLOCKD; - - -static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) { +static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, + PARTITION_TYPE partition) { const BLOCK_SIZE subsize = subsize_lookup[partition][bsize]; assert(subsize < BLOCK_SIZES); return subsize; } -extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT]; +extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES]; -static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, - const MACROBLOCKD *xd, int ib) { - const MODE_INFO *const mi = xd->mi_8x8[0]; - const MB_MODE_INFO *const mbmi = &mi->mbmi; +static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, + const MACROBLOCKD *xd) { + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - if (plane_type != PLANE_TYPE_Y_WITH_DC || - xd->lossless || - is_inter_block(mbmi)) + if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi)) return DCT_DCT; - - return mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ? - mi->bmi[ib].as_mode : mbmi->mode]; + return intra_mode_to_tx_type_lookup[mbmi->mode]; } -static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type, - const MACROBLOCKD *xd) { - return plane_type == PLANE_TYPE_Y_WITH_DC ? - mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT; -} +static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, + const MACROBLOCKD *xd, int ib) { + const MODE_INFO *const mi = xd->mi[0]; + + if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi)) + return DCT_DCT; -static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type, - const MACROBLOCKD *xd) { - return plane_type == PLANE_TYPE_Y_WITH_DC ? - mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT; + return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)]; } -static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) { - int i; +void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y); - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC; - xd->plane[i].subsampling_x = i ? ss_x : 0; - xd->plane[i].subsampling_y = i ? 
ss_y : 0; +static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize) { + if (bsize < BLOCK_8X8) { + return TX_4X4; + } else { + // TODO(dkovalev): Assuming YUV420 (ss_x == 1, ss_y == 1) + const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][1][1]; + return MIN(y_tx_size, max_txsize_lookup[plane_bsize]); } -#if CONFIG_ALPHA - // TODO(jkoleszar): Using the Y w/h for now - xd->plane[3].subsampling_x = 0; - xd->plane[3].subsampling_y = 0; -#endif } - static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) { - return MIN(mbmi->tx_size, max_uv_txsize_lookup[mbmi->sb_type]); + return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type); } -static BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, - const struct macroblockd_plane *pd) { +static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd) { BLOCK_SIZE bs = ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; assert(bs < BLOCK_SIZES); return bs; } -static INLINE int plane_block_width(BLOCK_SIZE bsize, - const struct macroblockd_plane* plane) { - return 4 << (b_width_log2(bsize) - plane->subsampling_x); -} - -static INLINE int plane_block_height(BLOCK_SIZE bsize, - const struct macroblockd_plane* plane) { - return 4 << (b_height_log2(bsize) - plane->subsampling_y); -} - typedef void (*foreach_transformed_block_visitor)(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); -static INLINE void foreach_transformed_block_in_plane( +void vp9_foreach_transformed_block_in_plane( const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, - foreach_transformed_block_visitor visit, void *arg) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const MB_MODE_INFO* mbmi = &xd->mi_8x8[0]->mbmi; - // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") - // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 - // transform size varies per plane, look it up in a common way. - const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) - : mbmi->tx_size; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const int step = 1 << (tx_size << 1); - int i; - - // If mb_to_right_edge is < 0 we are in a situation in which - // the current block size extends into the UMV and we won't - // visit the sub blocks that are wholly within the UMV. - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - int r, c; - - int max_blocks_wide = num_4x4_w; - int max_blocks_high = num_4x4_h; - - // xd->mb_to_right_edge is in units of pixels * 8. This converts - // it to 4x4 block sizes. - if (xd->mb_to_right_edge < 0) - max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); - - if (xd->mb_to_bottom_edge < 0) - max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - - i = 0; - // Unlike the normal case - in here we have to keep track of the - // row and column of the blocks we use so that we know if we are in - // the unrestricted motion border. 
- for (r = 0; r < num_4x4_h; r += (1 << tx_size)) { - for (c = 0; c < num_4x4_w; c += (1 << tx_size)) { - if (r < max_blocks_high && c < max_blocks_wide) - visit(plane, i, plane_bsize, tx_size, arg); - i += step; - } - } - } else { - for (i = 0; i < num_4x4_w * num_4x4_h; i += step) - visit(plane, i, plane_bsize, tx_size, arg); - } -} + foreach_transformed_block_visitor visit, void *arg); -static INLINE void foreach_transformed_block( - const MACROBLOCKD* const xd, BLOCK_SIZE bsize, - foreach_transformed_block_visitor visit, void *arg) { - int plane; - for (plane = 0; plane < MAX_MB_PLANE; plane++) - foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); -} - -static INLINE void foreach_transformed_block_uv( +void vp9_foreach_transformed_block( const MACROBLOCKD* const xd, BLOCK_SIZE bsize, - foreach_transformed_block_visitor visit, void *arg) { - int plane; - - for (plane = 1; plane < MAX_MB_PLANE; plane++) - foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); -} - -static int raster_block_offset(BLOCK_SIZE plane_bsize, - int raster_block, int stride) { - const int bw = b_width_log2(plane_bsize); - const int y = 4 * (raster_block >> bw); - const int x = 4 * (raster_block & ((1 << bw) - 1)); - return y * stride + x; -} -static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize, - int raster_block, int16_t *base) { - const int stride = 4 << b_width_log2(plane_bsize); - return base + raster_block_offset(plane_bsize, raster_block, stride); -} -static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize, - int raster_block, uint8_t *base, - int stride) { - return base + raster_block_offset(plane_bsize, raster_block, stride); -} - -static int txfrm_block_to_raster_block(BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, int block) { - const int bwl = b_width_log2(plane_bsize); - const int tx_cols_log2 = bwl - tx_size; - const int tx_cols = 1 << tx_cols_log2; - const int raster_mb = block >> (tx_size << 1); - const int x = (raster_mb & (tx_cols - 1)) << tx_size; - const int y = (raster_mb >> tx_cols_log2) << tx_size; - return x + (y << bwl); -} + foreach_transformed_block_visitor visit, void *arg); -static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, int block, - int *x, int *y) { +static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int block, + int *x, int *y) { const int bwl = b_width_log2(plane_bsize); const int tx_cols_log2 = bwl - tx_size; const int tx_cols = 1 << tx_cols_log2; @@ -421,104 +318,12 @@ static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, *y = (raster_mb >> tx_cols_log2) << tx_size; } -static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize, - int plane, int block, TX_SIZE tx_size) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - uint8_t *const buf = pd->dst.buf; - const int stride = pd->dst.stride; - - int x, y; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); - x = x * 4 - 1; - y = y * 4 - 1; - // Copy a pixel into the umv if we are in a situation where the block size - // extends into the UMV. - // TODO(JBB): Should be able to do the full extend in place so we don't have - // to do this multiple times. 
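// (Descriptive note on the removed branches below: the right-edge branch
// smears the last in-bounds pixel of the row across the UMV span with a
// single memset; the bottom-edge branch copies the last in-bounds pixel of
// the column down the remaining UMV rows.)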
- if (xd->mb_to_right_edge < 0) { - const int bw = 4 << b_width_log2(plane_bsize); - const int umv_border_start = bw + (xd->mb_to_right_edge >> - (3 + pd->subsampling_x)); - - if (x + bw > umv_border_start) - vpx_memset(&buf[y * stride + umv_border_start], - buf[y * stride + umv_border_start - 1], bw); - } +void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, + int aoff, int loff); - if (xd->mb_to_bottom_edge < 0) { - if (xd->left_available || x >= 0) { - const int bh = 4 << b_height_log2(plane_bsize); - const int umv_border_start = - bh + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)); - - if (y + bh > umv_border_start) { - const uint8_t c = buf[(umv_border_start - 1) * stride + x]; - uint8_t *d = &buf[umv_border_start * stride + x]; - int i; - for (i = 0; i < bh; ++i, d += stride) - *d = c; - } - } - } -} -static void set_contexts_on_border(MACROBLOCKD *xd, - struct macroblockd_plane *pd, - BLOCK_SIZE plane_bsize, - int tx_size_in_blocks, int has_eob, - int aoff, int loff, - ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { - int mi_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize]; - int mi_blocks_high = num_4x4_blocks_high_lookup[plane_bsize]; - int above_contexts = tx_size_in_blocks; - int left_contexts = tx_size_in_blocks; - int pt; - - // xd->mb_to_right_edge is in units of pixels * 8. This converts - // it to 4x4 block sizes. - if (xd->mb_to_right_edge < 0) - mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); - - if (xd->mb_to_bottom_edge < 0) - mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - - // this code attempts to avoid copying into contexts that are outside - // our border. Any blocks that do are set to 0... - if (above_contexts + aoff > mi_blocks_wide) - above_contexts = mi_blocks_wide - aoff; - - if (left_contexts + loff > mi_blocks_high) - left_contexts = mi_blocks_high - loff; - - for (pt = 0; pt < above_contexts; pt++) - A[pt] = has_eob; - for (pt = above_contexts; pt < tx_size_in_blocks; pt++) - A[pt] = 0; - for (pt = 0; pt < left_contexts; pt++) - L[pt] = has_eob; - for (pt = left_contexts; pt < tx_size_in_blocks; pt++) - L[pt] = 0; -} - -static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - int has_eob, int aoff, int loff) { - ENTROPY_CONTEXT *const A = pd->above_context + aoff; - ENTROPY_CONTEXT *const L = pd->left_context + loff; - const int tx_size_in_blocks = 1 << tx_size; - - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - set_contexts_on_border(xd, pd, plane_bsize, tx_size_in_blocks, has_eob, - aoff, loff, A, L); - } else { - vpx_memset(A, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - vpx_memset(L, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - } -} - -static int get_tx_eob(const struct segmentation *seg, int segment_id, - TX_SIZE tx_size) { - const int eob_max = 16 << (tx_size << 1); - return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 
0 : eob_max; -} +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_COMMON_VP9_BLOCKD_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h index 36d1cdf1463..04db7c0bb0f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h @@ -18,6 +18,11 @@ #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vpx/vpx_integer.h" +#include "vp9/common/vp9_systemdependent.h" + +#ifdef __cplusplus +extern "C" { +#endif #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) @@ -40,7 +45,7 @@ vpx_memcpy(dest, src, n * sizeof(*src)); \ } -#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest)) +#define vp9_zero(dest) vpx_memset(&(dest), 0, sizeof(dest)) #define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest)) static INLINE uint8_t clip_pixel(int val) { @@ -55,16 +60,8 @@ static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } -static int get_unsigned_bits(unsigned int num_values) { - int cat = 0; - if (num_values <= 1) - return 0; - num_values--; - while (num_values > 0) { - cat++; - num_values >>= 1; - } - return cat; +static INLINE int get_unsigned_bits(unsigned int num_values) { + return num_values > 0 ? get_msb(num_values) + 1 : 0; } #if CONFIG_DEBUG @@ -91,4 +88,8 @@ static int get_unsigned_bits(unsigned int num_values) { #define VP9_FRAME_MARKER 0x2 +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_COMMON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common_data.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common_data.c index f858900a4f4..a927823e04b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common_data.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common_data.c @@ -26,8 +26,6 @@ const int mi_width_log2_lookup[BLOCK_SIZES] = {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8}; -const int mi_height_log2_lookup[BLOCK_SIZES] = - {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3}; const int num_8x8_blocks_high_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8}; @@ -108,12 +106,6 @@ const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = { TX_16X16, TX_16X16, TX_16X16, TX_32X32, TX_32X32, TX_32X32, TX_32X32 }; -const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES] = { - TX_4X4, TX_4X4, TX_4X4, - TX_4X4, TX_4X4, TX_4X4, - TX_8X8, TX_8X8, TX_8X8, - TX_16X16, TX_16X16, TX_16X16, TX_32X32 -}; const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { TX_4X4, // ONLY_4X4 @@ -123,8 +115,6 @@ const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { TX_32X32, // TX_MODE_SELECT }; - - const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 @@ -143,4 +133,24 @@ const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { {{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}}, }; - +// Generates 4 bit field in which each bit set to 1 represents +// a blocksize partition 1111 means we split 64x64, 32x32, 16x16 +// and 8x8. 
1000 means we just split the 64x64 to 32x32 +const struct { + PARTITION_CONTEXT above; + PARTITION_CONTEXT left; +} partition_context_lookup[BLOCK_SIZES]= { + {15, 15}, // 4X4 - {0b1111, 0b1111} + {15, 14}, // 4X8 - {0b1111, 0b1110} + {14, 15}, // 8X4 - {0b1110, 0b1111} + {14, 14}, // 8X8 - {0b1110, 0b1110} + {14, 12}, // 8X16 - {0b1110, 0b1100} + {12, 14}, // 16X8 - {0b1100, 0b1110} + {12, 12}, // 16X16 - {0b1100, 0b1100} + {12, 8 }, // 16X32 - {0b1100, 0b1000} + {8, 12}, // 32X16 - {0b1000, 0b1100} + {8, 8 }, // 32X32 - {0b1000, 0b1000} + {8, 0 }, // 32X64 - {0b1000, 0b0000} + {0, 8 }, // 64X32 - {0b0000, 0b1000} + {0, 0 }, // 64X64 - {0b0000, 0b0000} +}; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common_data.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common_data.h index c1f6405364b..f4196274722 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common_data.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common_data.h @@ -13,10 +13,13 @@ #include "vp9/common/vp9_enums.h" +#ifdef __cplusplus +extern "C" { +#endif + extern const int b_width_log2_lookup[BLOCK_SIZES]; extern const int b_height_log2_lookup[BLOCK_SIZES]; extern const int mi_width_log2_lookup[BLOCK_SIZES]; -extern const int mi_height_log2_lookup[BLOCK_SIZES]; extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES]; extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES]; extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES]; @@ -26,8 +29,11 @@ extern const int num_pels_log2_lookup[BLOCK_SIZES]; extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES]; extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES]; extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES]; -extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES]; extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES]; extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2]; -#endif // VP9_COMMON_VP9_COMMON_DATA_H +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_COMMON_DATA_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c index a2d864c72e1..1a8c49d52c1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c @@ -18,40 +18,21 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x0, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int taps) { - int x, y, k; - - /* NOTE: This assumes that the filter table is 256-byte aligned. */ - /* TODO(agrange) Modify to make independent of table alignment. 
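(The table is SUBPEL_SHIFTS == 16 kernels of 8 int16_t taps, i.e. 16 * 8 * 2 = 256 bytes, so masking the low 8 bits of a kernel pointer recovers the table base; the replacement get_filter_base() introduced later in this diff relies on the same layout.)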
*/ - const int16_t *const filter_x_base = - (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); - - /* Adjust base pointer address for this source line */ - src -= taps / 2 - 1; - +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { - /* Initial phase offset */ - int x_q4 = (int)(filter_x0 - filter_x_base) / taps; - + int x_q4 = x0_q4; for (x = 0; x < w; ++x) { - /* Per-pixel src offset */ - const int src_x = x_q4 >> SUBPEL_BITS; - int sum = 0; - - /* Pointer to filter to use */ - const int16_t *const filter_x = filter_x_base + - (x_q4 & SUBPEL_MASK) * taps; - - for (k = 0; k < taps; ++k) - sum += src[src_x + k] * filter_x[k]; - + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_x[k] * x_filter[k]; dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - - /* Move to the next source pixel */ x_q4 += x_step_q4; } src += src_stride; @@ -59,41 +40,22 @@ static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x0, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int taps) { - int x, y, k; - - /* NOTE: This assumes that the filter table is 256-byte aligned. */ - /* TODO(agrange) Modify to make independent of table alignment. */ - const int16_t *const filter_x_base = - (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); - - /* Adjust base pointer address for this source line */ - src -= taps / 2 - 1; - +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { - /* Initial phase offset */ - int x_q4 = (int)(filter_x0 - filter_x_base) / taps; - + int x_q4 = x0_q4; for (x = 0; x < w; ++x) { - /* Per-pixel src offset */ - const int src_x = x_q4 >> SUBPEL_BITS; - int sum = 0; - - /* Pointer to filter to use */ - const int16_t *const filter_x = filter_x_base + - (x_q4 & SUBPEL_MASK) * taps; - - for (k = 0; k < taps; ++k) - sum += src[src_x + k] * filter_x[k]; - + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_x[k] * x_filter[k]; dst[x] = ROUND_POWER_OF_TWO(dst[x] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); - - /* Move to the next source pixel */ + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); x_q4 += x_step_q4; } src += src_stride; @@ -101,41 +63,22 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y0, int y_step_q4, - int w, int h, int taps) { - int x, y, k; - - /* NOTE: This assumes that the filter table is 256-byte aligned. */ - /* TODO(agrange) Modify to make independent of table alignment. 
*/ - const int16_t *const filter_y_base = - (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); - - /* Adjust base pointer address for this source column */ - src -= src_stride * (taps / 2 - 1); +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { - /* Initial phase offset */ - int y_q4 = (int)(filter_y0 - filter_y_base) / taps; - + int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - /* Per-pixel src offset */ - const int src_y = y_q4 >> SUBPEL_BITS; - int sum = 0; - - /* Pointer to filter to use */ - const int16_t *const filter_y = filter_y_base + - (y_q4 & SUBPEL_MASK) * taps; - - for (k = 0; k < taps; ++k) - sum += src[(src_y + k) * src_stride] * filter_y[k]; - - dst[y * dst_stride] = - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - - /* Move to the next source pixel */ + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); y_q4 += y_step_q4; } ++src; @@ -143,41 +86,23 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y0, int y_step_q4, - int w, int h, int taps) { - int x, y, k; - - /* NOTE: This assumes that the filter table is 256-byte aligned. */ - /* TODO(agrange) Modify to make independent of table alignment. 
*/ - const int16_t *const filter_y_base = - (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); - - /* Adjust base pointer address for this source column */ - src -= src_stride * (taps / 2 - 1); +static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { - /* Initial phase offset */ - int y_q4 = (int)(filter_y0 - filter_y_base) / taps; - + int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - /* Per-pixel src offset */ - const int src_y = y_q4 >> SUBPEL_BITS; - int sum = 0; - - /* Pointer to filter to use */ - const int16_t *const filter_y = filter_y_base + - (y_q4 & SUBPEL_MASK) * taps; - - for (k = 0; k < taps; ++k) - sum += src[(src_y + k) * src_stride] * filter_y[k]; - + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); - - /* Move to the next source pixel */ + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); y_q4 += y_step_q4; } ++src; @@ -185,33 +110,42 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int taps) { - /* Fixed size intermediate buffer places limits on parameters. - * Maximum intermediate_height is 324, for y_step_q4 == 80, - * h == 64, taps == 8. - * y_step_q4 of 80 allows for 1/10 scale for 5 layer svc - */ +static void convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + int x0_q4, int x_step_q4, + const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, + int w, int h) { + // Fixed size intermediate buffer places limits on parameters. + // Maximum intermediate_height is 324, for y_step_q4 == 80, + // h == 64, taps == 8. + // y_step_q4 of 80 allows for 1/10 scale for 5 layer svc uint8_t temp[64 * 324]; - int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps; + int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); - assert(taps <= 8); assert(y_step_q4 <= 80); assert(x_step_q4 <= 80); if (intermediate_height < h) intermediate_height = h; - convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, - intermediate_height, taps); - convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, taps); + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + x_filters, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + y_filters, y0_q4, y_step_q4, w, h); +} + +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. 
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); } void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -219,8 +153,14 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h); } void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -228,8 +168,14 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h); } void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, @@ -237,8 +183,14 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h); } void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, @@ -246,8 +198,14 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h); } void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, @@ -255,8 +213,15 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + convolve(src, src_stride, dst, dst_stride, + filters_x, x0_q4, x_step_q4, + filters_y, y0_q4, y_step_q4, w, h); } void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, @@ -269,9 +234,9 @@ void 
vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, assert(w <= 64); assert(h <= 64); - vp9_convolve8(src, src_stride, temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - vp9_convolve_avg(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vp9_convolve8_c(src, src_stride, temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, w, h); + vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); } void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, @@ -281,6 +246,9 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, int w, int h) { int r; + (void)filter_x; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; + for (r = h; r > 0; --r) { vpx_memcpy(dst, src, w); src += src_stride; @@ -295,6 +263,9 @@ void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, int w, int h) { int x, y; + (void)filter_x; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; + for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.h index 29d499063c6..6bf71fc7943 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.h @@ -13,10 +13,18 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_CONVOLVE_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c index 355ac1a4900..d2522bbdfc8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c @@ -22,12 +22,11 @@ static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) { * and uses the passed in member offset to print out the value of an integer * for each mbmi member value in the mi structure. 
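* For example, print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip))
* walks the visible mi grid and prints mbmi.skip for every 8x8 unit.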
*/ -static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor, +static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor, size_t member_offset) { - int mi_row; - int mi_col; + int mi_row, mi_col; int mi_index = 0; - MODE_INFO **mi_8x8 = cm->mi_grid_visible; + MODE_INFO **mi = cm->mi_grid_visible; int rows = cm->mi_rows; int cols = cm->mi_cols; char prefix = descriptor[0]; @@ -38,7 +37,7 @@ static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor, fprintf(file, "%c ", prefix); for (mi_col = 0; mi_col < cols; mi_col++) { fprintf(file, "%2d ", - *((int*) ((char *) (&mi_8x8[mi_index]->mbmi) + + *((int*) ((char *) (&mi[mi_index]->mbmi) + member_offset))); mi_index++; } @@ -47,18 +46,18 @@ static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor, } fprintf(file, "\n"); } -void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { +void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) { int mi_row; int mi_col; int mi_index = 0; FILE *mvs = fopen(file, "a"); - MODE_INFO **mi_8x8 = cm->mi_grid_visible; + MODE_INFO **mi = cm->mi_grid_visible; int rows = cm->mi_rows; int cols = cm->mi_cols; print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type)); print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); - print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff)); + print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip)); print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size)); print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); @@ -67,8 +66,8 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs, "V "); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row, - mi_8x8[mi_index]->mbmi.mv[0].as_mv.col); + fprintf(mvs, "%4d:%4d ", mi[mi_index]->mbmi.mv[0].as_mv.row, + mi[mi_index]->mbmi.mv[0].as_mv.col); mi_index++; } fprintf(mvs, "\n"); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_default_coef_probs.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_default_coef_probs.h deleted file mode 100644 index 3b512beb9d3..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_default_coef_probs.h +++ /dev/null @@ -1,699 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
-*/ -#ifndef VP9_COMMON_DEFAULT_COEF_PROBS_H_ -#define VP9_COMMON_DEFAULT_COEF_PROBS_H_ - -/*Generated file, included by vp9_entropy.c*/ -static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 195, 29, 183 }, - { 84, 49, 136 }, - { 8, 42, 71 } - }, { /* Coeff Band 1 */ - { 31, 107, 169 }, - { 35, 99, 159 }, - { 17, 82, 140 }, - { 8, 66, 114 }, - { 2, 44, 76 }, - { 1, 19, 32 } - }, { /* Coeff Band 2 */ - { 40, 132, 201 }, - { 29, 114, 187 }, - { 13, 91, 157 }, - { 7, 75, 127 }, - { 3, 58, 95 }, - { 1, 28, 47 } - }, { /* Coeff Band 3 */ - { 69, 142, 221 }, - { 42, 122, 201 }, - { 15, 91, 159 }, - { 6, 67, 121 }, - { 1, 42, 77 }, - { 1, 17, 31 } - }, { /* Coeff Band 4 */ - { 102, 148, 228 }, - { 67, 117, 204 }, - { 17, 82, 154 }, - { 6, 59, 114 }, - { 2, 39, 75 }, - { 1, 15, 29 } - }, { /* Coeff Band 5 */ - { 156, 57, 233 }, - { 119, 57, 212 }, - { 58, 48, 163 }, - { 29, 40, 124 }, - { 12, 30, 81 }, - { 3, 12, 31 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 191, 107, 226 }, - { 124, 117, 204 }, - { 25, 99, 155 } - }, { /* Coeff Band 1 */ - { 29, 148, 210 }, - { 37, 126, 194 }, - { 8, 93, 157 }, - { 2, 68, 118 }, - { 1, 39, 69 }, - { 1, 17, 33 } - }, { /* Coeff Band 2 */ - { 41, 151, 213 }, - { 27, 123, 193 }, - { 3, 82, 144 }, - { 1, 58, 105 }, - { 1, 32, 60 }, - { 1, 13, 26 } - }, { /* Coeff Band 3 */ - { 59, 159, 220 }, - { 23, 126, 198 }, - { 4, 88, 151 }, - { 1, 66, 114 }, - { 1, 38, 71 }, - { 1, 18, 34 } - }, { /* Coeff Band 4 */ - { 114, 136, 232 }, - { 51, 114, 207 }, - { 11, 83, 155 }, - { 3, 56, 105 }, - { 1, 33, 65 }, - { 1, 17, 34 } - }, { /* Coeff Band 5 */ - { 149, 65, 234 }, - { 121, 57, 215 }, - { 61, 49, 166 }, - { 28, 36, 114 }, - { 12, 25, 76 }, - { 3, 16, 42 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 214, 49, 220 }, - { 132, 63, 188 }, - { 42, 65, 137 } - }, { /* Coeff Band 1 */ - { 85, 137, 221 }, - { 104, 131, 216 }, - { 49, 111, 192 }, - { 21, 87, 155 }, - { 2, 49, 87 }, - { 1, 16, 28 } - }, { /* Coeff Band 2 */ - { 89, 163, 230 }, - { 90, 137, 220 }, - { 29, 100, 183 }, - { 10, 70, 135 }, - { 2, 42, 81 }, - { 1, 17, 33 } - }, { /* Coeff Band 3 */ - { 108, 167, 237 }, - { 55, 133, 222 }, - { 15, 97, 179 }, - { 4, 72, 135 }, - { 1, 45, 85 }, - { 1, 19, 38 } - }, { /* Coeff Band 4 */ - { 124, 146, 240 }, - { 66, 124, 224 }, - { 17, 88, 175 }, - { 4, 58, 122 }, - { 1, 36, 75 }, - { 1, 18, 37 } - }, { /* Coeff Band 5 */ - { 141, 79, 241 }, - { 126, 70, 227 }, - { 66, 58, 182 }, - { 30, 44, 136 }, - { 12, 34, 96 }, - { 2, 20, 47 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 229, 99, 249 }, - { 143, 111, 235 }, - { 46, 109, 192 } - }, { /* Coeff Band 1 */ - { 82, 158, 236 }, - { 94, 146, 224 }, - { 25, 117, 191 }, - { 9, 87, 149 }, - { 3, 56, 99 }, - { 1, 33, 57 } - }, { /* Coeff Band 2 */ - { 83, 167, 237 }, - { 68, 145, 222 }, - { 10, 103, 177 }, - { 2, 72, 131 }, - { 1, 41, 79 }, - { 1, 20, 39 } - }, { /* Coeff Band 3 */ - { 99, 167, 239 }, - { 47, 141, 224 }, - { 10, 104, 178 }, - { 2, 73, 133 }, - { 1, 44, 85 }, - { 1, 22, 47 } - }, { /* Coeff Band 4 */ - { 127, 145, 243 }, - { 71, 129, 228 }, - { 17, 93, 177 }, - { 3, 61, 124 }, - { 1, 41, 84 }, - { 1, 21, 52 } - }, { /* Coeff Band 5 */ - { 157, 78, 244 }, - { 140, 72, 231 }, - { 69, 58, 184 }, - { 31, 44, 137 }, - { 14, 38, 105 }, - { 8, 23, 61 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff 
Band 0 */ - { 125, 34, 187 }, - { 52, 41, 133 }, - { 6, 31, 56 } - }, { /* Coeff Band 1 */ - { 37, 109, 153 }, - { 51, 102, 147 }, - { 23, 87, 128 }, - { 8, 67, 101 }, - { 1, 41, 63 }, - { 1, 19, 29 } - }, { /* Coeff Band 2 */ - { 31, 154, 185 }, - { 17, 127, 175 }, - { 6, 96, 145 }, - { 2, 73, 114 }, - { 1, 51, 82 }, - { 1, 28, 45 } - }, { /* Coeff Band 3 */ - { 23, 163, 200 }, - { 10, 131, 185 }, - { 2, 93, 148 }, - { 1, 67, 111 }, - { 1, 41, 69 }, - { 1, 14, 24 } - }, { /* Coeff Band 4 */ - { 29, 176, 217 }, - { 12, 145, 201 }, - { 3, 101, 156 }, - { 1, 69, 111 }, - { 1, 39, 63 }, - { 1, 14, 23 } - }, { /* Coeff Band 5 */ - { 57, 192, 233 }, - { 25, 154, 215 }, - { 6, 109, 167 }, - { 3, 78, 118 }, - { 1, 48, 69 }, - { 1, 21, 29 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 202, 105, 245 }, - { 108, 106, 216 }, - { 18, 90, 144 } - }, { /* Coeff Band 1 */ - { 33, 172, 219 }, - { 64, 149, 206 }, - { 14, 117, 177 }, - { 5, 90, 141 }, - { 2, 61, 95 }, - { 1, 37, 57 } - }, { /* Coeff Band 2 */ - { 33, 179, 220 }, - { 11, 140, 198 }, - { 1, 89, 148 }, - { 1, 60, 104 }, - { 1, 33, 57 }, - { 1, 12, 21 } - }, { /* Coeff Band 3 */ - { 30, 181, 221 }, - { 8, 141, 198 }, - { 1, 87, 145 }, - { 1, 58, 100 }, - { 1, 31, 55 }, - { 1, 12, 20 } - }, { /* Coeff Band 4 */ - { 32, 186, 224 }, - { 7, 142, 198 }, - { 1, 86, 143 }, - { 1, 58, 100 }, - { 1, 31, 55 }, - { 1, 12, 22 } - }, { /* Coeff Band 5 */ - { 57, 192, 227 }, - { 20, 143, 204 }, - { 3, 96, 154 }, - { 1, 68, 112 }, - { 1, 42, 69 }, - { 1, 19, 32 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 212, 35, 215 }, - { 113, 47, 169 }, - { 29, 48, 105 } - }, { /* Coeff Band 1 */ - { 74, 129, 203 }, - { 106, 120, 203 }, - { 49, 107, 178 }, - { 19, 84, 144 }, - { 4, 50, 84 }, - { 1, 15, 25 } - }, { /* Coeff Band 2 */ - { 71, 172, 217 }, - { 44, 141, 209 }, - { 15, 102, 173 }, - { 6, 76, 133 }, - { 2, 51, 89 }, - { 1, 24, 42 } - }, { /* Coeff Band 3 */ - { 64, 185, 231 }, - { 31, 148, 216 }, - { 8, 103, 175 }, - { 3, 74, 131 }, - { 1, 46, 81 }, - { 1, 18, 30 } - }, { /* Coeff Band 4 */ - { 65, 196, 235 }, - { 25, 157, 221 }, - { 5, 105, 174 }, - { 1, 67, 120 }, - { 1, 38, 69 }, - { 1, 15, 30 } - }, { /* Coeff Band 5 */ - { 65, 204, 238 }, - { 30, 156, 224 }, - { 7, 107, 177 }, - { 2, 70, 124 }, - { 1, 42, 73 }, - { 1, 18, 34 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 225, 86, 251 }, - { 144, 104, 235 }, - { 42, 99, 181 } - }, { /* Coeff Band 1 */ - { 85, 175, 239 }, - { 112, 165, 229 }, - { 29, 136, 200 }, - { 12, 103, 162 }, - { 6, 77, 123 }, - { 2, 53, 84 } - }, { /* Coeff Band 2 */ - { 75, 183, 239 }, - { 30, 155, 221 }, - { 3, 106, 171 }, - { 1, 74, 128 }, - { 1, 44, 76 }, - { 1, 17, 28 } - }, { /* Coeff Band 3 */ - { 73, 185, 240 }, - { 27, 159, 222 }, - { 2, 107, 172 }, - { 1, 75, 127 }, - { 1, 42, 73 }, - { 1, 17, 29 } - }, { /* Coeff Band 4 */ - { 62, 190, 238 }, - { 21, 159, 222 }, - { 2, 107, 172 }, - { 1, 72, 122 }, - { 1, 40, 71 }, - { 1, 18, 32 } - }, { /* Coeff Band 5 */ - { 61, 199, 240 }, - { 27, 161, 226 }, - { 4, 113, 180 }, - { 1, 76, 129 }, - { 1, 46, 80 }, - { 1, 23, 41 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_16x16[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 7, 27, 153 }, - { 5, 30, 95 }, - { 1, 16, 30 } - }, { /* Coeff Band 1 */ - { 50, 75, 127 }, - { 57, 75, 124 }, - { 27, 67, 108 }, - { 10, 54, 86 }, - { 1, 33, 52 }, - { 1, 12, 18 } - }, { /* Coeff Band 2 */ - { 43, 125, 151 }, - { 26, 108, 148 }, - { 7, 83, 122 }, 
- { 2, 59, 89 }, - { 1, 38, 60 }, - { 1, 17, 27 } - }, { /* Coeff Band 3 */ - { 23, 144, 163 }, - { 13, 112, 154 }, - { 2, 75, 117 }, - { 1, 50, 81 }, - { 1, 31, 51 }, - { 1, 14, 23 } - }, { /* Coeff Band 4 */ - { 18, 162, 185 }, - { 6, 123, 171 }, - { 1, 78, 125 }, - { 1, 51, 86 }, - { 1, 31, 54 }, - { 1, 14, 23 } - }, { /* Coeff Band 5 */ - { 15, 199, 227 }, - { 3, 150, 204 }, - { 1, 91, 146 }, - { 1, 55, 95 }, - { 1, 30, 53 }, - { 1, 11, 20 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 19, 55, 240 }, - { 19, 59, 196 }, - { 3, 52, 105 } - }, { /* Coeff Band 1 */ - { 41, 166, 207 }, - { 104, 153, 199 }, - { 31, 123, 181 }, - { 14, 101, 152 }, - { 5, 72, 106 }, - { 1, 36, 52 } - }, { /* Coeff Band 2 */ - { 35, 176, 211 }, - { 12, 131, 190 }, - { 2, 88, 144 }, - { 1, 60, 101 }, - { 1, 36, 60 }, - { 1, 16, 28 } - }, { /* Coeff Band 3 */ - { 28, 183, 213 }, - { 8, 134, 191 }, - { 1, 86, 142 }, - { 1, 56, 96 }, - { 1, 30, 53 }, - { 1, 12, 20 } - }, { /* Coeff Band 4 */ - { 20, 190, 215 }, - { 4, 135, 192 }, - { 1, 84, 139 }, - { 1, 53, 91 }, - { 1, 28, 49 }, - { 1, 11, 20 } - }, { /* Coeff Band 5 */ - { 13, 196, 216 }, - { 2, 137, 192 }, - { 1, 86, 143 }, - { 1, 57, 99 }, - { 1, 32, 56 }, - { 1, 13, 24 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 211, 29, 217 }, - { 96, 47, 156 }, - { 22, 43, 87 } - }, { /* Coeff Band 1 */ - { 78, 120, 193 }, - { 111, 116, 186 }, - { 46, 102, 164 }, - { 15, 80, 128 }, - { 2, 49, 76 }, - { 1, 18, 28 } - }, { /* Coeff Band 2 */ - { 71, 161, 203 }, - { 42, 132, 192 }, - { 10, 98, 150 }, - { 3, 69, 109 }, - { 1, 44, 70 }, - { 1, 18, 29 } - }, { /* Coeff Band 3 */ - { 57, 186, 211 }, - { 30, 140, 196 }, - { 4, 93, 146 }, - { 1, 62, 102 }, - { 1, 38, 65 }, - { 1, 16, 27 } - }, { /* Coeff Band 4 */ - { 47, 199, 217 }, - { 14, 145, 196 }, - { 1, 88, 142 }, - { 1, 57, 98 }, - { 1, 36, 62 }, - { 1, 15, 26 } - }, { /* Coeff Band 5 */ - { 26, 219, 229 }, - { 5, 155, 207 }, - { 1, 94, 151 }, - { 1, 60, 104 }, - { 1, 36, 62 }, - { 1, 16, 28 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 233, 29, 248 }, - { 146, 47, 220 }, - { 43, 52, 140 } - }, { /* Coeff Band 1 */ - { 100, 163, 232 }, - { 179, 161, 222 }, - { 63, 142, 204 }, - { 37, 113, 174 }, - { 26, 89, 137 }, - { 18, 68, 97 } - }, { /* Coeff Band 2 */ - { 85, 181, 230 }, - { 32, 146, 209 }, - { 7, 100, 164 }, - { 3, 71, 121 }, - { 1, 45, 77 }, - { 1, 18, 30 } - }, { /* Coeff Band 3 */ - { 65, 187, 230 }, - { 20, 148, 207 }, - { 2, 97, 159 }, - { 1, 68, 116 }, - { 1, 40, 70 }, - { 1, 14, 29 } - }, { /* Coeff Band 4 */ - { 40, 194, 227 }, - { 8, 147, 204 }, - { 1, 94, 155 }, - { 1, 65, 112 }, - { 1, 39, 66 }, - { 1, 14, 26 } - }, { /* Coeff Band 5 */ - { 16, 208, 228 }, - { 3, 151, 207 }, - { 1, 98, 160 }, - { 1, 67, 117 }, - { 1, 41, 74 }, - { 1, 17, 31 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 17, 38, 140 }, - { 7, 34, 80 }, - { 1, 17, 29 } - }, { /* Coeff Band 1 */ - { 37, 75, 128 }, - { 41, 76, 128 }, - { 26, 66, 116 }, - { 12, 52, 94 }, - { 2, 32, 55 }, - { 1, 10, 16 } - }, { /* Coeff Band 2 */ - { 50, 127, 154 }, - { 37, 109, 152 }, - { 16, 82, 121 }, - { 5, 59, 85 }, - { 1, 35, 54 }, - { 1, 13, 20 } - }, { /* Coeff Band 3 */ - { 40, 142, 167 }, - { 17, 110, 157 }, - { 2, 71, 112 }, - { 1, 44, 72 }, - { 1, 27, 45 }, - { 1, 11, 17 } - }, { /* Coeff Band 4 */ - { 30, 175, 188 }, - { 9, 124, 169 }, - { 1, 74, 116 }, - { 1, 48, 78 }, - { 1, 30, 49 }, - { 1, 
11, 18 } - }, { /* Coeff Band 5 */ - { 10, 222, 223 }, - { 2, 150, 194 }, - { 1, 83, 128 }, - { 1, 48, 79 }, - { 1, 27, 45 }, - { 1, 11, 17 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 36, 41, 235 }, - { 29, 36, 193 }, - { 10, 27, 111 } - }, { /* Coeff Band 1 */ - { 85, 165, 222 }, - { 177, 162, 215 }, - { 110, 135, 195 }, - { 57, 113, 168 }, - { 23, 83, 120 }, - { 10, 49, 61 } - }, { /* Coeff Band 2 */ - { 85, 190, 223 }, - { 36, 139, 200 }, - { 5, 90, 146 }, - { 1, 60, 103 }, - { 1, 38, 65 }, - { 1, 18, 30 } - }, { /* Coeff Band 3 */ - { 72, 202, 223 }, - { 23, 141, 199 }, - { 2, 86, 140 }, - { 1, 56, 97 }, - { 1, 36, 61 }, - { 1, 16, 27 } - }, { /* Coeff Band 4 */ - { 55, 218, 225 }, - { 13, 145, 200 }, - { 1, 86, 141 }, - { 1, 57, 99 }, - { 1, 35, 61 }, - { 1, 13, 22 } - }, { /* Coeff Band 5 */ - { 15, 235, 212 }, - { 1, 132, 184 }, - { 1, 84, 139 }, - { 1, 57, 97 }, - { 1, 34, 56 }, - { 1, 14, 23 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 181, 21, 201 }, - { 61, 37, 123 }, - { 10, 38, 71 } - }, { /* Coeff Band 1 */ - { 47, 106, 172 }, - { 95, 104, 173 }, - { 42, 93, 159 }, - { 18, 77, 131 }, - { 4, 50, 81 }, - { 1, 17, 23 } - }, { /* Coeff Band 2 */ - { 62, 147, 199 }, - { 44, 130, 189 }, - { 28, 102, 154 }, - { 18, 75, 115 }, - { 2, 44, 65 }, - { 1, 12, 19 } - }, { /* Coeff Band 3 */ - { 55, 153, 210 }, - { 24, 130, 194 }, - { 3, 93, 146 }, - { 1, 61, 97 }, - { 1, 31, 50 }, - { 1, 10, 16 } - }, { /* Coeff Band 4 */ - { 49, 186, 223 }, - { 17, 148, 204 }, - { 1, 96, 142 }, - { 1, 53, 83 }, - { 1, 26, 44 }, - { 1, 11, 17 } - }, { /* Coeff Band 5 */ - { 13, 217, 212 }, - { 2, 136, 180 }, - { 1, 78, 124 }, - { 1, 50, 83 }, - { 1, 29, 49 }, - { 1, 14, 23 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 197, 13, 247 }, - { 82, 17, 222 }, - { 25, 17, 162 } - }, { /* Coeff Band 1 */ - { 126, 186, 247 }, - { 234, 191, 243 }, - { 176, 177, 234 }, - { 104, 158, 220 }, - { 66, 128, 186 }, - { 55, 90, 137 } - }, { /* Coeff Band 2 */ - { 111, 197, 242 }, - { 46, 158, 219 }, - { 9, 104, 171 }, - { 2, 65, 125 }, - { 1, 44, 80 }, - { 1, 17, 91 } - }, { /* Coeff Band 3 */ - { 104, 208, 245 }, - { 39, 168, 224 }, - { 3, 109, 162 }, - { 1, 79, 124 }, - { 1, 50, 102 }, - { 1, 43, 102 } - }, { /* Coeff Band 4 */ - { 84, 220, 246 }, - { 31, 177, 231 }, - { 2, 115, 180 }, - { 1, 79, 134 }, - { 1, 55, 77 }, - { 1, 60, 79 } - }, { /* Coeff Band 5 */ - { 43, 243, 240 }, - { 8, 180, 217 }, - { 1, 115, 166 }, - { 1, 84, 121 }, - { 1, 51, 67 }, - { 1, 16, 6 } - } - } - } -}; - -#endif // VP9_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c index 2640ac72b28..bc12f9aa2f8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c @@ -15,78 +15,87 @@ #include "vpx_mem/vpx_mem.h" #include "vpx/vpx_integer.h" -#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) -DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = { - 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -DECLARE_ALIGNED(16, const uint8_t, - vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]) = { +const uint8_t vp9_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 5 + 4, 4, 4, 4, 4, 5, + // beyond MAXBAND_INDEX+1 all values are filled as 5 + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 
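// The run of 5s below pads the band-translation table out to a full 32x32
// scan (1024 positions), so the pre-patch clamped lookup can become a plain
// array index. A sketch of the two equivalent lookups (the _old/_new names
// are illustrative; the removed clamp appears verbatim in vp9_entropy.h
// later in this patch):

#define MAXBAND_INDEX 21
#define COEF_BANDS 6

static int get_coef_band_old(const unsigned char *tbl, int coef_index) {
  // Pre-patch: the table had only MAXBAND_INDEX + 1 entries, so clamp first.
  return coef_index > MAXBAND_INDEX ? COEF_BANDS - 1 : tbl[coef_index];
}

static int get_coef_band_new(const unsigned char *tbl, int coef_index) {
  // Post-patch: entries up to index 1023 are pre-filled with 5 (COEF_BANDS - 1).
  return tbl[coef_index];
}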
5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; -DECLARE_ALIGNED(16, const uint8_t, - vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]) = { +const uint8_t vp9_coefband_trans_4x4[16] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, - 5, 5, 5, 5, 5, 5 }; -DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = { +const uint8_t vp9_pt_energy_class[ENTROPY_TOKENS] = { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 }; - - -/* Array indices are identical to previously-existing CONTEXT_NODE indices */ - -const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = { - -DCT_EOB_TOKEN, 2, /* 0 = EOB */ - -ZERO_TOKEN, 4, /* 1 = ZERO */ - -ONE_TOKEN, 6, /* 2 = ONE */ - 8, 12, /* 3 = LOW_VAL */ - -TWO_TOKEN, 10, /* 4 = TWO */ - -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ - 14, 16, /* 6 = HIGH_LOW */ - -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ - 18, 20, /* 8 = CAT_THREEFOUR */ - -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ - -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ -}; - -struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS]; - -/* Trees for extra bits. Probabilities are constant and - do not depend on previously encoded bits */ - -static const vp9_prob Pcat1[] = { 159}; -static const vp9_prob Pcat2[] = { 165, 145}; -static const vp9_prob Pcat3[] = { 173, 148, 140}; -static const vp9_prob Pcat4[] = { 176, 155, 140, 135}; -static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130}; -static const vp9_prob Pcat6[] = { - 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 -}; - -const vp9_tree_index vp9_coefmodel_tree[6] = { - -DCT_EOB_MODEL_TOKEN, 2, /* 0 = EOB */ - -ZERO_TOKEN, 4, /* 1 = ZERO */ +const vp9_tree_index vp9_coefmodel_tree[TREE_SIZE(UNCONSTRAINED_NODES + 1)] = { + -EOB_MODEL_TOKEN, 2, + -ZERO_TOKEN, 4, -ONE_TOKEN, -TWO_TOKEN, }; @@ -99,198 +108,617 @@ const vp9_tree_index vp9_coefmodel_tree[6] = { // the probabilities for the rest of the nodes. 
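// The replacement table below materializes all COEFF_PROB_MODELS = 256 rows,
// one per pivot-probability value, instead of keeping 128 rows and
// interpolating at run time as the removed extend_model_to_full_distribution()
// did. As the comment just below notes, each odd row is the average of its
// even neighbours; a standalone sketch of that relation (make_odd_row is an
// illustrative name):

#include <stdint.h>

typedef uint8_t vp9_prob;
#define MODEL_NODES 8  // ENTROPY_NODES - UNCONSTRAINED_NODES

static void make_odd_row(const vp9_prob below[MODEL_NODES],
                         const vp9_prob above[MODEL_NODES],
                         vp9_prob out[MODEL_NODES]) {
  int n;
  for (n = 0; n < MODEL_NODES; ++n)
    out[n] = (vp9_prob)((below[n] + above[n]) >> 1);
}

// One caveat worth noting: the new extend_to_full_distribution() later in
// this hunk indexes the table as vp9_pareto8_full[p = 0 ? 0 : p - 1].
// Since ?: binds tighter than =, this parses as p = (p - 1) and always uses
// row p - 1; that appears harmless here because the pivot probability is
// never 0, but p == 0 is presumably what was intended.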
// beta = 8 -static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = { + +// Every odd line in this table can be generated from the even lines +// by averaging : +// vp9_pareto8_full[l][node] = (vp9_pareto8_full[l-1][node] + +// vp9_pareto8_full[l+1][node] ) >> 1; +const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = { { 3, 86, 128, 6, 86, 23, 88, 29}, + { 6, 86, 128, 11, 87, 42, 91, 52}, { 9, 86, 129, 17, 88, 61, 94, 76}, + { 12, 86, 129, 22, 88, 77, 97, 93}, { 15, 87, 129, 28, 89, 93, 100, 110}, + { 17, 87, 129, 33, 90, 105, 103, 123}, { 20, 88, 130, 38, 91, 118, 106, 136}, + { 23, 88, 130, 43, 91, 128, 108, 146}, { 26, 89, 131, 48, 92, 139, 111, 156}, + { 28, 89, 131, 53, 93, 147, 114, 163}, { 31, 90, 131, 58, 94, 156, 117, 171}, + { 34, 90, 131, 62, 94, 163, 119, 177}, { 37, 90, 132, 66, 95, 171, 122, 184}, + { 39, 90, 132, 70, 96, 177, 124, 189}, { 42, 91, 132, 75, 97, 183, 127, 194}, + { 44, 91, 132, 79, 97, 188, 129, 198}, { 47, 92, 133, 83, 98, 193, 132, 202}, + { 49, 92, 133, 86, 99, 197, 134, 205}, { 52, 93, 133, 90, 100, 201, 137, 208}, + { 54, 93, 133, 94, 100, 204, 139, 211}, { 57, 94, 134, 98, 101, 208, 142, 214}, + { 59, 94, 134, 101, 102, 211, 144, 216}, { 62, 94, 135, 105, 103, 214, 146, 218}, + { 64, 94, 135, 108, 103, 216, 148, 220}, { 66, 95, 135, 111, 104, 219, 151, 222}, + { 68, 95, 135, 114, 105, 221, 153, 223}, { 71, 96, 136, 117, 106, 224, 155, 225}, + { 73, 96, 136, 120, 106, 225, 157, 226}, { 76, 97, 136, 123, 107, 227, 159, 228}, + { 78, 97, 136, 126, 108, 229, 160, 229}, { 80, 98, 137, 129, 109, 231, 162, 231}, + { 82, 98, 137, 131, 109, 232, 164, 232}, { 84, 98, 138, 134, 110, 234, 166, 233}, + { 86, 98, 138, 137, 111, 235, 168, 234}, { 89, 99, 138, 140, 112, 236, 170, 235}, + { 91, 99, 138, 142, 112, 237, 171, 235}, { 93, 100, 139, 145, 113, 238, 173, 236}, + { 95, 100, 139, 147, 114, 239, 174, 237}, { 97, 101, 140, 149, 115, 240, 176, 238}, + { 99, 101, 140, 151, 115, 241, 177, 238}, {101, 102, 140, 154, 116, 242, 179, 239}, + {103, 102, 140, 156, 117, 242, 180, 239}, {105, 103, 141, 158, 118, 243, 182, 240}, + {107, 103, 141, 160, 118, 243, 183, 240}, {109, 104, 141, 162, 119, 244, 185, 241}, + {111, 104, 141, 164, 119, 244, 186, 241}, {113, 104, 142, 166, 120, 245, 187, 242}, + {114, 104, 142, 168, 121, 245, 188, 242}, {116, 105, 143, 170, 122, 246, 190, 243}, + {118, 105, 143, 171, 122, 246, 191, 243}, {120, 106, 143, 173, 123, 247, 192, 244}, + {121, 106, 143, 175, 124, 247, 193, 244}, {123, 107, 144, 177, 125, 248, 195, 244}, + {125, 107, 144, 178, 125, 248, 196, 244}, {127, 108, 145, 180, 126, 249, 197, 245}, + {128, 108, 145, 181, 127, 249, 198, 245}, {130, 109, 145, 183, 128, 249, 199, 245}, + {132, 109, 145, 184, 128, 249, 200, 245}, {134, 110, 146, 186, 129, 250, 201, 246}, + {135, 110, 146, 187, 130, 250, 202, 246}, {137, 111, 147, 189, 131, 251, 203, 246}, + {138, 111, 147, 190, 131, 251, 204, 246}, {140, 112, 147, 192, 132, 251, 205, 247}, + {141, 112, 147, 193, 132, 251, 206, 247}, {143, 113, 148, 194, 133, 251, 207, 247}, + {144, 113, 148, 195, 134, 251, 207, 247}, {146, 114, 149, 197, 135, 252, 208, 248}, + {147, 114, 149, 198, 135, 252, 209, 248}, {149, 115, 149, 199, 136, 252, 210, 248}, + {150, 115, 149, 200, 137, 252, 210, 248}, {152, 115, 150, 201, 138, 252, 211, 248}, + {153, 115, 150, 202, 138, 252, 212, 248}, {155, 116, 151, 204, 139, 253, 213, 249}, + {156, 116, 151, 205, 139, 253, 213, 249}, {158, 117, 151, 206, 140, 253, 214, 249}, + {159, 117, 151, 207, 141, 253, 215, 249}, {161, 118, 152, 208, 
142, 253, 216, 249}, + {162, 118, 152, 209, 142, 253, 216, 249}, {163, 119, 153, 210, 143, 253, 217, 249}, + {164, 119, 153, 211, 143, 253, 217, 249}, {166, 120, 153, 212, 144, 254, 218, 250}, + {167, 120, 153, 212, 145, 254, 219, 250}, {168, 121, 154, 213, 146, 254, 220, 250}, + {169, 121, 154, 214, 146, 254, 220, 250}, {171, 122, 155, 215, 147, 254, 221, 250}, + {172, 122, 155, 216, 147, 254, 221, 250}, {173, 123, 155, 217, 148, 254, 222, 250}, + {174, 123, 155, 217, 149, 254, 222, 250}, {176, 124, 156, 218, 150, 254, 223, 250}, + {177, 124, 156, 219, 150, 254, 223, 250}, {178, 125, 157, 220, 151, 254, 224, 251}, + {179, 125, 157, 220, 151, 254, 224, 251}, {180, 126, 157, 221, 152, 254, 225, 251}, + {181, 126, 157, 221, 152, 254, 225, 251}, {183, 127, 158, 222, 153, 254, 226, 251}, + {184, 127, 158, 223, 154, 254, 226, 251}, {185, 128, 159, 224, 155, 255, 227, 251}, + {186, 128, 159, 224, 155, 255, 227, 251}, {187, 129, 160, 225, 156, 255, 228, 251}, + {188, 130, 160, 225, 156, 255, 228, 251}, {189, 131, 160, 226, 157, 255, 228, 251}, + {190, 131, 160, 226, 158, 255, 228, 251}, {191, 132, 161, 227, 159, 255, 229, 251}, + {192, 132, 161, 227, 159, 255, 229, 251}, {193, 133, 162, 228, 160, 255, 230, 252}, + {194, 133, 162, 229, 160, 255, 230, 252}, {195, 134, 163, 230, 161, 255, 231, 252}, + {196, 134, 163, 230, 161, 255, 231, 252}, {197, 135, 163, 231, 162, 255, 231, 252}, + {198, 135, 163, 231, 162, 255, 231, 252}, {199, 136, 164, 232, 163, 255, 232, 252}, + {200, 136, 164, 232, 164, 255, 232, 252}, + {201, 137, 165, 233, 165, 255, 233, 252}, {201, 137, 165, 233, 165, 255, 233, 252}, {202, 138, 166, 233, 166, 255, 233, 252}, + {203, 138, 166, 233, 166, 255, 233, 252}, {204, 139, 166, 234, 167, 255, 234, 252}, + {205, 139, 166, 234, 167, 255, 234, 252}, + {206, 140, 167, 235, 168, 255, 235, 252}, {206, 140, 167, 235, 168, 255, 235, 252}, {207, 141, 168, 236, 169, 255, 235, 252}, + {208, 141, 168, 236, 170, 255, 235, 252}, {209, 142, 169, 237, 171, 255, 236, 252}, + {209, 143, 169, 237, 171, 255, 236, 252}, {210, 144, 169, 237, 172, 255, 236, 252}, + {211, 144, 169, 237, 172, 255, 236, 252}, {212, 145, 170, 238, 173, 255, 237, 252}, + {213, 145, 170, 238, 173, 255, 237, 252}, {214, 146, 171, 239, 174, 255, 237, 253}, + {214, 146, 171, 239, 174, 255, 237, 253}, + {215, 147, 172, 240, 175, 255, 238, 253}, {215, 147, 172, 240, 175, 255, 238, 253}, {216, 148, 173, 240, 176, 255, 238, 253}, + {217, 148, 173, 240, 176, 255, 238, 253}, {218, 149, 173, 241, 177, 255, 239, 253}, + {218, 149, 173, 241, 178, 255, 239, 253}, {219, 150, 174, 241, 179, 255, 239, 253}, + {219, 151, 174, 241, 179, 255, 239, 253}, {220, 152, 175, 242, 180, 255, 240, 253}, + {221, 152, 175, 242, 180, 255, 240, 253}, {222, 153, 176, 242, 181, 255, 240, 253}, + {222, 153, 176, 242, 181, 255, 240, 253}, + {223, 154, 177, 243, 182, 255, 240, 253}, {223, 154, 177, 243, 182, 255, 240, 253}, {224, 155, 178, 244, 183, 255, 241, 253}, + {224, 155, 178, 244, 183, 255, 241, 253}, {225, 156, 178, 244, 184, 255, 241, 253}, + {225, 157, 178, 244, 184, 255, 241, 253}, {226, 158, 179, 244, 185, 255, 242, 253}, + {227, 158, 179, 244, 185, 255, 242, 253}, + {228, 159, 180, 245, 186, 255, 242, 253}, {228, 159, 180, 245, 186, 255, 242, 253}, {229, 160, 181, 245, 187, 255, 242, 253}, + {229, 160, 181, 245, 187, 255, 242, 253}, {230, 161, 182, 246, 188, 255, 243, 253}, + {230, 162, 182, 246, 188, 255, 243, 253}, + {231, 163, 183, 246, 189, 255, 243, 253}, {231, 163, 183, 246, 189, 255, 243, 253}, {232, 164, 184, 247, 190, 255, 243, 253}, + {232, 
164, 184, 247, 190, 255, 243, 253}, + {233, 165, 185, 247, 191, 255, 244, 253}, {233, 165, 185, 247, 191, 255, 244, 253}, {234, 166, 185, 247, 192, 255, 244, 253}, + {234, 167, 185, 247, 192, 255, 244, 253}, {235, 168, 186, 248, 193, 255, 244, 253}, + {235, 168, 186, 248, 193, 255, 244, 253}, + {236, 169, 187, 248, 194, 255, 244, 253}, {236, 169, 187, 248, 194, 255, 244, 253}, {236, 170, 188, 248, 195, 255, 245, 253}, + {236, 170, 188, 248, 195, 255, 245, 253}, {237, 171, 189, 249, 196, 255, 245, 254}, + {237, 172, 189, 249, 196, 255, 245, 254}, + {238, 173, 190, 249, 197, 255, 245, 254}, {238, 173, 190, 249, 197, 255, 245, 254}, {239, 174, 191, 249, 198, 255, 245, 254}, + {239, 174, 191, 249, 198, 255, 245, 254}, {240, 175, 192, 249, 199, 255, 246, 254}, + {240, 176, 192, 249, 199, 255, 246, 254}, + {240, 177, 193, 250, 200, 255, 246, 254}, {240, 177, 193, 250, 200, 255, 246, 254}, {241, 178, 194, 250, 201, 255, 246, 254}, + {241, 178, 194, 250, 201, 255, 246, 254}, {242, 179, 195, 250, 202, 255, 246, 254}, + {242, 180, 195, 250, 202, 255, 246, 254}, + {242, 181, 196, 250, 203, 255, 247, 254}, {242, 181, 196, 250, 203, 255, 247, 254}, {243, 182, 197, 251, 204, 255, 247, 254}, + {243, 183, 197, 251, 204, 255, 247, 254}, {244, 184, 198, 251, 205, 255, 247, 254}, + {244, 184, 198, 251, 205, 255, 247, 254}, + {244, 185, 199, 251, 206, 255, 247, 254}, {244, 185, 199, 251, 206, 255, 247, 254}, {245, 186, 200, 251, 207, 255, 247, 254}, + {245, 187, 200, 251, 207, 255, 247, 254}, + {246, 188, 201, 252, 207, 255, 248, 254}, {246, 188, 201, 252, 207, 255, 248, 254}, {246, 189, 202, 252, 208, 255, 248, 254}, + {246, 190, 202, 252, 208, 255, 248, 254}, + {247, 191, 203, 252, 209, 255, 248, 254}, {247, 191, 203, 252, 209, 255, 248, 254}, {247, 192, 204, 252, 210, 255, 248, 254}, + {247, 193, 204, 252, 210, 255, 248, 254}, + {248, 194, 205, 252, 211, 255, 248, 254}, {248, 194, 205, 252, 211, 255, 248, 254}, {248, 195, 206, 252, 212, 255, 249, 254}, + {248, 196, 206, 252, 212, 255, 249, 254}, + {249, 197, 207, 253, 213, 255, 249, 254}, {249, 197, 207, 253, 213, 255, 249, 254}, {249, 198, 208, 253, 214, 255, 249, 254}, + {249, 199, 209, 253, 214, 255, 249, 254}, + {250, 200, 210, 253, 215, 255, 249, 254}, {250, 200, 210, 253, 215, 255, 249, 254}, {250, 201, 211, 253, 215, 255, 249, 254}, + {250, 202, 211, 253, 215, 255, 249, 254}, + {250, 203, 212, 253, 216, 255, 249, 254}, {250, 203, 212, 253, 216, 255, 249, 254}, {251, 204, 213, 253, 217, 255, 250, 254}, + {251, 205, 213, 253, 217, 255, 250, 254}, {251, 206, 214, 254, 218, 255, 250, 254}, + {251, 206, 215, 254, 218, 255, 250, 254}, {252, 207, 216, 254, 219, 255, 250, 254}, + {252, 208, 216, 254, 219, 255, 250, 254}, {252, 209, 217, 254, 220, 255, 250, 254}, + {252, 210, 217, 254, 220, 255, 250, 254}, {252, 211, 218, 254, 221, 255, 250, 254}, + {252, 212, 218, 254, 221, 255, 250, 254}, {253, 213, 219, 254, 222, 255, 250, 254}, + {253, 213, 220, 254, 222, 255, 250, 254}, {253, 214, 221, 254, 223, 255, 250, 254}, + {253, 215, 221, 254, 223, 255, 250, 254}, {253, 216, 222, 254, 224, 255, 251, 254}, + {253, 217, 223, 254, 224, 255, 251, 254}, {253, 218, 224, 254, 225, 255, 251, 254}, + {253, 219, 224, 254, 225, 255, 251, 254}, {254, 220, 225, 254, 225, 255, 251, 254}, + {254, 221, 226, 254, 225, 255, 251, 254}, {254, 222, 227, 255, 226, 255, 251, 254}, + {254, 223, 227, 255, 226, 255, 251, 254}, {254, 224, 228, 255, 227, 255, 251, 254}, + {254, 225, 229, 255, 227, 255, 251, 254}, {254, 226, 230, 255, 228, 255, 251, 254}, + {254, 227, 230, 255, 229, 255, 
251, 254}, {255, 228, 231, 255, 230, 255, 251, 254}, + {255, 229, 232, 255, 230, 255, 251, 254}, {255, 230, 233, 255, 231, 255, 252, 254}, + {255, 231, 234, 255, 231, 255, 252, 254}, {255, 232, 235, 255, 232, 255, 252, 254}, + {255, 233, 236, 255, 232, 255, 252, 254}, {255, 235, 237, 255, 233, 255, 252, 254}, + {255, 236, 238, 255, 234, 255, 252, 254}, {255, 238, 240, 255, 235, 255, 252, 255}, + {255, 239, 241, 255, 235, 255, 252, 254}, {255, 241, 243, 255, 236, 255, 252, 254}, - {255, 246, 247, 255, 239, 255, 253, 255} + {255, 243, 245, 255, 237, 255, 252, 254}, + {255, 246, 247, 255, 239, 255, 253, 255}, + {255, 246, 247, 255, 239, 255, 253, 255}, }; -static void extend_model_to_full_distribution(vp9_prob p, - vp9_prob *tree_probs) { - const int l = (p - 1) / 2; - const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8; - if (p & 1) { - vpx_memcpy(tree_probs + UNCONSTRAINED_NODES, - model[l], MODEL_NODES * sizeof(vp9_prob)); - } else { - // interpolate - int i; - for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i) - tree_probs[i] = (model[l][i - UNCONSTRAINED_NODES] + - model[l + 1][i - UNCONSTRAINED_NODES]) >> 1; +static const vp9_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 195, 29, 183 }, { 84, 49, 136 }, { 8, 42, 71 } + }, { // Band 1 + { 31, 107, 169 }, { 35, 99, 159 }, { 17, 82, 140 }, + { 8, 66, 114 }, { 2, 44, 76 }, { 1, 19, 32 } + }, { // Band 2 + { 40, 132, 201 }, { 29, 114, 187 }, { 13, 91, 157 }, + { 7, 75, 127 }, { 3, 58, 95 }, { 1, 28, 47 } + }, { // Band 3 + { 69, 142, 221 }, { 42, 122, 201 }, { 15, 91, 159 }, + { 6, 67, 121 }, { 1, 42, 77 }, { 1, 17, 31 } + }, { // Band 4 + { 102, 148, 228 }, { 67, 117, 204 }, { 17, 82, 154 }, + { 6, 59, 114 }, { 2, 39, 75 }, { 1, 15, 29 } + }, { // Band 5 + { 156, 57, 233 }, { 119, 57, 212 }, { 58, 48, 163 }, + { 29, 40, 124 }, { 12, 30, 81 }, { 3, 12, 31 } + } + }, { // Inter + { // Band 0 + { 191, 107, 226 }, { 124, 117, 204 }, { 25, 99, 155 } + }, { // Band 1 + { 29, 148, 210 }, { 37, 126, 194 }, { 8, 93, 157 }, + { 2, 68, 118 }, { 1, 39, 69 }, { 1, 17, 33 } + }, { // Band 2 + { 41, 151, 213 }, { 27, 123, 193 }, { 3, 82, 144 }, + { 1, 58, 105 }, { 1, 32, 60 }, { 1, 13, 26 } + }, { // Band 3 + { 59, 159, 220 }, { 23, 126, 198 }, { 4, 88, 151 }, + { 1, 66, 114 }, { 1, 38, 71 }, { 1, 18, 34 } + }, { // Band 4 + { 114, 136, 232 }, { 51, 114, 207 }, { 11, 83, 155 }, + { 3, 56, 105 }, { 1, 33, 65 }, { 1, 17, 34 } + }, { // Band 5 + { 149, 65, 234 }, { 121, 57, 215 }, { 61, 49, 166 }, + { 28, 36, 114 }, { 12, 25, 76 }, { 3, 16, 42 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 214, 49, 220 }, { 132, 63, 188 }, { 42, 65, 137 } + }, { // Band 1 + { 85, 137, 221 }, { 104, 131, 216 }, { 49, 111, 192 }, + { 21, 87, 155 }, { 2, 49, 87 }, { 1, 16, 28 } + }, { // Band 2 + { 89, 163, 230 }, { 90, 137, 220 }, { 29, 100, 183 }, + { 10, 70, 135 }, { 2, 42, 81 }, { 1, 17, 33 } + }, { // Band 3 + { 108, 167, 237 }, { 55, 133, 222 }, { 15, 97, 179 }, + { 4, 72, 135 }, { 1, 45, 85 }, { 1, 19, 38 } + }, { // Band 4 + { 124, 146, 240 }, { 66, 124, 224 }, { 17, 88, 175 }, + { 4, 58, 122 }, { 1, 36, 75 }, { 1, 18, 37 } + }, { // Band 5 + { 141, 79, 241 }, { 126, 70, 227 }, { 66, 58, 182 }, + { 30, 44, 136 }, { 12, 34, 96 }, { 2, 20, 47 } + } + }, { // Inter + { // Band 0 + { 229, 99, 249 }, { 143, 111, 235 }, { 46, 109, 192 } + }, { // Band 1 + { 82, 158, 236 }, { 94, 146, 224 }, { 25, 117, 191 }, + { 9, 87, 149 }, { 3, 56, 99 }, { 1, 33, 57 } + }, { // Band 2 + { 83, 167, 237 }, { 
68, 145, 222 }, { 10, 103, 177 }, + { 2, 72, 131 }, { 1, 41, 79 }, { 1, 20, 39 } + }, { // Band 3 + { 99, 167, 239 }, { 47, 141, 224 }, { 10, 104, 178 }, + { 2, 73, 133 }, { 1, 44, 85 }, { 1, 22, 47 } + }, { // Band 4 + { 127, 145, 243 }, { 71, 129, 228 }, { 17, 93, 177 }, + { 3, 61, 124 }, { 1, 41, 84 }, { 1, 21, 52 } + }, { // Band 5 + { 157, 78, 244 }, { 140, 72, 231 }, { 69, 58, 184 }, + { 31, 44, 137 }, { 14, 38, 105 }, { 8, 23, 61 } + } + } } -} - -void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) { - if (full != model) - vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES); - extend_model_to_full_distribution(model[PIVOT_NODE], full); -} +}; -static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28]; +static const vp9_coeff_probs_model default_coef_probs_8x8[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 125, 34, 187 }, { 52, 41, 133 }, { 6, 31, 56 } + }, { // Band 1 + { 37, 109, 153 }, { 51, 102, 147 }, { 23, 87, 128 }, + { 8, 67, 101 }, { 1, 41, 63 }, { 1, 19, 29 } + }, { // Band 2 + { 31, 154, 185 }, { 17, 127, 175 }, { 6, 96, 145 }, + { 2, 73, 114 }, { 1, 51, 82 }, { 1, 28, 45 } + }, { // Band 3 + { 23, 163, 200 }, { 10, 131, 185 }, { 2, 93, 148 }, + { 1, 67, 111 }, { 1, 41, 69 }, { 1, 14, 24 } + }, { // Band 4 + { 29, 176, 217 }, { 12, 145, 201 }, { 3, 101, 156 }, + { 1, 69, 111 }, { 1, 39, 63 }, { 1, 14, 23 } + }, { // Band 5 + { 57, 192, 233 }, { 25, 154, 215 }, { 6, 109, 167 }, + { 3, 78, 118 }, { 1, 48, 69 }, { 1, 21, 29 } + } + }, { // Inter + { // Band 0 + { 202, 105, 245 }, { 108, 106, 216 }, { 18, 90, 144 } + }, { // Band 1 + { 33, 172, 219 }, { 64, 149, 206 }, { 14, 117, 177 }, + { 5, 90, 141 }, { 2, 61, 95 }, { 1, 37, 57 } + }, { // Band 2 + { 33, 179, 220 }, { 11, 140, 198 }, { 1, 89, 148 }, + { 1, 60, 104 }, { 1, 33, 57 }, { 1, 12, 21 } + }, { // Band 3 + { 30, 181, 221 }, { 8, 141, 198 }, { 1, 87, 145 }, + { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 20 } + }, { // Band 4 + { 32, 186, 224 }, { 7, 142, 198 }, { 1, 86, 143 }, + { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 22 } + }, { // Band 5 + { 57, 192, 227 }, { 20, 143, 204 }, { 3, 96, 154 }, + { 1, 68, 112 }, { 1, 42, 69 }, { 1, 19, 32 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 212, 35, 215 }, { 113, 47, 169 }, { 29, 48, 105 } + }, { // Band 1 + { 74, 129, 203 }, { 106, 120, 203 }, { 49, 107, 178 }, + { 19, 84, 144 }, { 4, 50, 84 }, { 1, 15, 25 } + }, { // Band 2 + { 71, 172, 217 }, { 44, 141, 209 }, { 15, 102, 173 }, + { 6, 76, 133 }, { 2, 51, 89 }, { 1, 24, 42 } + }, { // Band 3 + { 64, 185, 231 }, { 31, 148, 216 }, { 8, 103, 175 }, + { 3, 74, 131 }, { 1, 46, 81 }, { 1, 18, 30 } + }, { // Band 4 + { 65, 196, 235 }, { 25, 157, 221 }, { 5, 105, 174 }, + { 1, 67, 120 }, { 1, 38, 69 }, { 1, 15, 30 } + }, { // Band 5 + { 65, 204, 238 }, { 30, 156, 224 }, { 7, 107, 177 }, + { 2, 70, 124 }, { 1, 42, 73 }, { 1, 18, 34 } + } + }, { // Inter + { // Band 0 + { 225, 86, 251 }, { 144, 104, 235 }, { 42, 99, 181 } + }, { // Band 1 + { 85, 175, 239 }, { 112, 165, 229 }, { 29, 136, 200 }, + { 12, 103, 162 }, { 6, 77, 123 }, { 2, 53, 84 } + }, { // Band 2 + { 75, 183, 239 }, { 30, 155, 221 }, { 3, 106, 171 }, + { 1, 74, 128 }, { 1, 44, 76 }, { 1, 17, 28 } + }, { // Band 3 + { 73, 185, 240 }, { 27, 159, 222 }, { 2, 107, 172 }, + { 1, 75, 127 }, { 1, 42, 73 }, { 1, 17, 29 } + }, { // Band 4 + { 62, 190, 238 }, { 21, 159, 222 }, { 2, 107, 172 }, + { 1, 72, 122 }, { 1, 40, 71 }, { 1, 18, 32 } + }, { // Band 5 + { 61, 199, 240 }, { 27, 161, 226 }, { 4, 113, 180 
}, + { 1, 76, 129 }, { 1, 46, 80 }, { 1, 23, 41 } + } + } + } +}; -static void init_bit_tree(vp9_tree_index *p, int n) { - int i = 0; +static const vp9_coeff_probs_model default_coef_probs_16x16[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 7, 27, 153 }, { 5, 30, 95 }, { 1, 16, 30 } + }, { // Band 1 + { 50, 75, 127 }, { 57, 75, 124 }, { 27, 67, 108 }, + { 10, 54, 86 }, { 1, 33, 52 }, { 1, 12, 18 } + }, { // Band 2 + { 43, 125, 151 }, { 26, 108, 148 }, { 7, 83, 122 }, + { 2, 59, 89 }, { 1, 38, 60 }, { 1, 17, 27 } + }, { // Band 3 + { 23, 144, 163 }, { 13, 112, 154 }, { 2, 75, 117 }, + { 1, 50, 81 }, { 1, 31, 51 }, { 1, 14, 23 } + }, { // Band 4 + { 18, 162, 185 }, { 6, 123, 171 }, { 1, 78, 125 }, + { 1, 51, 86 }, { 1, 31, 54 }, { 1, 14, 23 } + }, { // Band 5 + { 15, 199, 227 }, { 3, 150, 204 }, { 1, 91, 146 }, + { 1, 55, 95 }, { 1, 30, 53 }, { 1, 11, 20 } + } + }, { // Inter + { // Band 0 + { 19, 55, 240 }, { 19, 59, 196 }, { 3, 52, 105 } + }, { // Band 1 + { 41, 166, 207 }, { 104, 153, 199 }, { 31, 123, 181 }, + { 14, 101, 152 }, { 5, 72, 106 }, { 1, 36, 52 } + }, { // Band 2 + { 35, 176, 211 }, { 12, 131, 190 }, { 2, 88, 144 }, + { 1, 60, 101 }, { 1, 36, 60 }, { 1, 16, 28 } + }, { // Band 3 + { 28, 183, 213 }, { 8, 134, 191 }, { 1, 86, 142 }, + { 1, 56, 96 }, { 1, 30, 53 }, { 1, 12, 20 } + }, { // Band 4 + { 20, 190, 215 }, { 4, 135, 192 }, { 1, 84, 139 }, + { 1, 53, 91 }, { 1, 28, 49 }, { 1, 11, 20 } + }, { // Band 5 + { 13, 196, 216 }, { 2, 137, 192 }, { 1, 86, 143 }, + { 1, 57, 99 }, { 1, 32, 56 }, { 1, 13, 24 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 211, 29, 217 }, { 96, 47, 156 }, { 22, 43, 87 } + }, { // Band 1 + { 78, 120, 193 }, { 111, 116, 186 }, { 46, 102, 164 }, + { 15, 80, 128 }, { 2, 49, 76 }, { 1, 18, 28 } + }, { // Band 2 + { 71, 161, 203 }, { 42, 132, 192 }, { 10, 98, 150 }, + { 3, 69, 109 }, { 1, 44, 70 }, { 1, 18, 29 } + }, { // Band 3 + { 57, 186, 211 }, { 30, 140, 196 }, { 4, 93, 146 }, + { 1, 62, 102 }, { 1, 38, 65 }, { 1, 16, 27 } + }, { // Band 4 + { 47, 199, 217 }, { 14, 145, 196 }, { 1, 88, 142 }, + { 1, 57, 98 }, { 1, 36, 62 }, { 1, 15, 26 } + }, { // Band 5 + { 26, 219, 229 }, { 5, 155, 207 }, { 1, 94, 151 }, + { 1, 60, 104 }, { 1, 36, 62 }, { 1, 16, 28 } + } + }, { // Inter + { // Band 0 + { 233, 29, 248 }, { 146, 47, 220 }, { 43, 52, 140 } + }, { // Band 1 + { 100, 163, 232 }, { 179, 161, 222 }, { 63, 142, 204 }, + { 37, 113, 174 }, { 26, 89, 137 }, { 18, 68, 97 } + }, { // Band 2 + { 85, 181, 230 }, { 32, 146, 209 }, { 7, 100, 164 }, + { 3, 71, 121 }, { 1, 45, 77 }, { 1, 18, 30 } + }, { // Band 3 + { 65, 187, 230 }, { 20, 148, 207 }, { 2, 97, 159 }, + { 1, 68, 116 }, { 1, 40, 70 }, { 1, 14, 29 } + }, { // Band 4 + { 40, 194, 227 }, { 8, 147, 204 }, { 1, 94, 155 }, + { 1, 65, 112 }, { 1, 39, 66 }, { 1, 14, 26 } + }, { // Band 5 + { 16, 208, 228 }, { 3, 151, 207 }, { 1, 98, 160 }, + { 1, 67, 117 }, { 1, 41, 74 }, { 1, 17, 31 } + } + } + } +}; - while (++i < n) { - p[0] = p[1] = i << 1; - p += 2; +static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 17, 38, 140 }, { 7, 34, 80 }, { 1, 17, 29 } + }, { // Band 1 + { 37, 75, 128 }, { 41, 76, 128 }, { 26, 66, 116 }, + { 12, 52, 94 }, { 2, 32, 55 }, { 1, 10, 16 } + }, { // Band 2 + { 50, 127, 154 }, { 37, 109, 152 }, { 16, 82, 121 }, + { 5, 59, 85 }, { 1, 35, 54 }, { 1, 13, 20 } + }, { // Band 3 + { 40, 142, 167 }, { 17, 110, 157 }, { 2, 71, 112 }, + { 1, 44, 72 }, { 1, 27, 45 }, { 1, 11, 17 } + }, { // Band 4 + { 
30, 175, 188 }, { 9, 124, 169 }, { 1, 74, 116 }, + { 1, 48, 78 }, { 1, 30, 49 }, { 1, 11, 18 } + }, { // Band 5 + { 10, 222, 223 }, { 2, 150, 194 }, { 1, 83, 128 }, + { 1, 48, 79 }, { 1, 27, 45 }, { 1, 11, 17 } + } + }, { // Inter + { // Band 0 + { 36, 41, 235 }, { 29, 36, 193 }, { 10, 27, 111 } + }, { // Band 1 + { 85, 165, 222 }, { 177, 162, 215 }, { 110, 135, 195 }, + { 57, 113, 168 }, { 23, 83, 120 }, { 10, 49, 61 } + }, { // Band 2 + { 85, 190, 223 }, { 36, 139, 200 }, { 5, 90, 146 }, + { 1, 60, 103 }, { 1, 38, 65 }, { 1, 18, 30 } + }, { // Band 3 + { 72, 202, 223 }, { 23, 141, 199 }, { 2, 86, 140 }, + { 1, 56, 97 }, { 1, 36, 61 }, { 1, 16, 27 } + }, { // Band 4 + { 55, 218, 225 }, { 13, 145, 200 }, { 1, 86, 141 }, + { 1, 57, 99 }, { 1, 35, 61 }, { 1, 13, 22 } + }, { // Band 5 + { 15, 235, 212 }, { 1, 132, 184 }, { 1, 84, 139 }, + { 1, 57, 97 }, { 1, 34, 56 }, { 1, 14, 23 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 181, 21, 201 }, { 61, 37, 123 }, { 10, 38, 71 } + }, { // Band 1 + { 47, 106, 172 }, { 95, 104, 173 }, { 42, 93, 159 }, + { 18, 77, 131 }, { 4, 50, 81 }, { 1, 17, 23 } + }, { // Band 2 + { 62, 147, 199 }, { 44, 130, 189 }, { 28, 102, 154 }, + { 18, 75, 115 }, { 2, 44, 65 }, { 1, 12, 19 } + }, { // Band 3 + { 55, 153, 210 }, { 24, 130, 194 }, { 3, 93, 146 }, + { 1, 61, 97 }, { 1, 31, 50 }, { 1, 10, 16 } + }, { // Band 4 + { 49, 186, 223 }, { 17, 148, 204 }, { 1, 96, 142 }, + { 1, 53, 83 }, { 1, 26, 44 }, { 1, 11, 17 } + }, { // Band 5 + { 13, 217, 212 }, { 2, 136, 180 }, { 1, 78, 124 }, + { 1, 50, 83 }, { 1, 29, 49 }, { 1, 14, 23 } + } + }, { // Inter + { // Band 0 + { 197, 13, 247 }, { 82, 17, 222 }, { 25, 17, 162 } + }, { // Band 1 + { 126, 186, 247 }, { 234, 191, 243 }, { 176, 177, 234 }, + { 104, 158, 220 }, { 66, 128, 186 }, { 55, 90, 137 } + }, { // Band 2 + { 111, 197, 242 }, { 46, 158, 219 }, { 9, 104, 171 }, + { 2, 65, 125 }, { 1, 44, 80 }, { 1, 17, 91 } + }, { // Band 3 + { 104, 208, 245 }, { 39, 168, 224 }, { 3, 109, 162 }, + { 1, 79, 124 }, { 1, 50, 102 }, { 1, 43, 102 } + }, { // Band 4 + { 84, 220, 246 }, { 31, 177, 231 }, { 2, 115, 180 }, + { 1, 79, 134 }, { 1, 55, 77 }, { 1, 60, 79 } + }, { // Band 5 + { 43, 243, 240 }, { 8, 180, 217 }, { 1, 115, 166 }, + { 1, 84, 121 }, { 1, 51, 67 }, { 1, 16, 6 } + } + } } +}; - p[0] = p[1] = 0; +static void extend_to_full_distribution(vp9_prob *probs, vp9_prob p) { + vpx_memcpy(probs, vp9_pareto8_full[p = 0 ? 
0 : p - 1], + MODEL_NODES * sizeof(vp9_prob)); } -static void init_bit_trees() { - init_bit_tree(cat1, 1); - init_bit_tree(cat2, 2); - init_bit_tree(cat3, 3); - init_bit_tree(cat4, 4); - init_bit_tree(cat5, 5); - init_bit_tree(cat6, 14); +void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) { + if (full != model) + vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES); + extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]); } -const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = { - { 0, 0, 0, 0}, - { 0, 0, 0, 1}, - { 0, 0, 0, 2}, - { 0, 0, 0, 3}, - { 0, 0, 0, 4}, - { cat1, Pcat1, 1, 5}, - { cat2, Pcat2, 2, 7}, - { cat3, Pcat3, 3, 11}, - { cat4, Pcat4, 4, 19}, - { cat5, Pcat5, 5, 35}, - { cat6, Pcat6, 14, 67}, - { 0, 0, 0, 0} -}; - -#include "vp9/common/vp9_default_coef_probs.h" - void vp9_default_coef_probs(VP9_COMMON *cm) { vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4); vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8); @@ -298,13 +726,6 @@ void vp9_default_coef_probs(VP9_COMMON *cm) { vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32); } -void vp9_coef_tree_initialize() { - init_bit_trees(); - vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree); -} - -// #define COEF_COUNT_TESTING - #define COEF_COUNT_SAT 24 #define COEF_MAX_UPDATE_FACTOR 112 #define COEF_COUNT_SAT_KEY 24 @@ -316,31 +737,30 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, unsigned int count_sat, unsigned int update_factor) { const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; - - vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size]; - const vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size]; - vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size]; - unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = + vp9_coeff_probs_model *const probs = cm->fc.coef_probs[tx_size]; + const vp9_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size]; + vp9_coeff_count_model *counts = cm->counts.coef[tx_size]; + unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] = cm->counts.eob_branch[tx_size]; - int t, i, j, k, l; - unsigned int branch_ct[UNCONSTRAINED_NODES][2]; - vp9_prob coef_probs[UNCONSTRAINED_NODES]; + int i, j, k, l, m; - for (i = 0; i < BLOCK_TYPES; ++i) + for (i = 0; i < PLANE_TYPES; ++i) for (j = 0; j < REF_TYPES; ++j) for (k = 0; k < COEF_BANDS; ++k) - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { - if (l >= 3 && k == 0) - continue; - vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs, - branch_ct, coef_counts[i][j][k][l], - 0); - branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; - coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); - for (t = 0; t < UNCONSTRAINED_NODES; ++t) - dst_coef_probs[i][j][k][l][t] = merge_probs( - pre_coef_probs[i][j][k][l][t], coef_probs[t], - branch_ct[t], count_sat, update_factor); + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + const int n0 = counts[i][j][k][l][ZERO_TOKEN]; + const int n1 = counts[i][j][k][l][ONE_TOKEN]; + const int n2 = counts[i][j][k][l][TWO_TOKEN]; + const int neob = counts[i][j][k][l][EOB_MODEL_TOKEN]; + const unsigned int branch_ct[UNCONSTRAINED_NODES][2] = { + { neob, eob_counts[i][j][k][l] - neob }, + { n0, n1 + n2 }, + { n1, n2 } + }; + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + probs[i][j][k][l][m] = merge_probs(pre_probs[i][j][k][l][m], + branch_ct[m], + count_sat, update_factor); } } diff --git 
a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h index ec7d09a00b0..3dc98a835fc 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h @@ -16,57 +16,50 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_scan.h" -#include "vp9/common/vp9_treecoder.h" -#define DIFF_UPDATE_PROB 252 +#ifdef __cplusplus +extern "C" { +#endif -/* Coefficient token alphabet */ +#define DIFF_UPDATE_PROB 252 -#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */ -#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */ -#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */ -#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */ -#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */ -#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */ -#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */ -#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */ -#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */ -#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ -#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 14+1 */ -#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ -#define MAX_ENTROPY_TOKENS 12 -#define ENTROPY_NODES 11 -#define EOSB_TOKEN 127 /* Not signalled, encoder only */ +// Coefficient token alphabet +#define ZERO_TOKEN 0 // 0 Extra Bits 0+0 +#define ONE_TOKEN 1 // 1 Extra Bits 0+1 +#define TWO_TOKEN 2 // 2 Extra Bits 0+1 +#define THREE_TOKEN 3 // 3 Extra Bits 0+1 +#define FOUR_TOKEN 4 // 4 Extra Bits 0+1 +#define CATEGORY1_TOKEN 5 // 5-6 Extra Bits 1+1 +#define CATEGORY2_TOKEN 6 // 7-10 Extra Bits 2+1 +#define CATEGORY3_TOKEN 7 // 11-18 Extra Bits 3+1 +#define CATEGORY4_TOKEN 8 // 19-34 Extra Bits 4+1 +#define CATEGORY5_TOKEN 9 // 35-66 Extra Bits 5+1 +#define CATEGORY6_TOKEN 10 // 67+ Extra Bits 14+1 +#define EOB_TOKEN 11 // EOB Extra Bits 0+0 -#define INTER_MODE_CONTEXTS 7 +#define ENTROPY_TOKENS 12 -extern DECLARE_ALIGNED(16, const uint8_t, - vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); +#define ENTROPY_NODES 11 -extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)]; +DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]); -#define DCT_EOB_MODEL_TOKEN 3 /* EOB Extra Bits 0+0 */ +#define EOB_MODEL_TOKEN 3 extern const vp9_tree_index vp9_coefmodel_tree[]; -extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS]; - typedef struct { - vp9_tree_index *tree; + const vp9_tree_index *tree; const vp9_prob *prob; int len; int base_val; } vp9_extra_bit; // indexed by token value -extern const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS]; +extern const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS]; -#define MAX_PROB 255 #define DCT_MAX_VALUE 16384 /* Coefficients are predicted via a 3-dimensional probability table. */ -/* Outside dimension. 0 = Y with DC, 1 = UV */ -#define BLOCK_TYPES 2 #define REF_TYPES 2 // intra=0, inter=1 /* Middle dimension reflects the coefficient position within the transform. */ @@ -88,13 +81,14 @@ extern const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS]; coefficient band (and since zigzag positions 0, 1, and 2 are in distinct bands). */ -#define PREV_COEF_CONTEXTS 6 +#define COEFF_CONTEXTS 6 +#define BAND_COEFF_CONTEXTS(band) ((band) == 0 ? 
3 : COEFF_CONTEXTS) // #define ENTROPY_STATS -typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS]; -typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] +typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] + [ENTROPY_TOKENS]; +typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] [ENTROPY_NODES][2]; #define SUBEXP_PARAM 4 /* Subexponential code parameter */ @@ -102,8 +96,6 @@ typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] struct VP9Common; void vp9_default_coef_probs(struct VP9Common *cm); - -void vp9_coef_tree_initialize(); void vp9_adapt_coef_probs(struct VP9Common *cm); static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { @@ -120,41 +112,41 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { // This is the index in the scan order beyond which all coefficients for // 8x8 transform and above are in the top band. -// For 4x4 blocks the index is less but to keep things common the lookup -// table for 4x4 is padded out to this index. +// This macro is currently unused but may be used by certain implementations #define MAXBAND_INDEX 21 -extern const uint8_t vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]; -extern const uint8_t vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]; - +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_8x8plus[1024]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]); -static int get_coef_band(const uint8_t * band_translate, int coef_index) { - return (coef_index > MAXBAND_INDEX) - ? (COEF_BANDS-1) : band_translate[coef_index]; +static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { + return tx_size == TX_4X4 ? 
vp9_coefband_trans_4x4 + : vp9_coefband_trans_8x8plus; } // 128 lists of probabilities are stored for the following ONE node probs: // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly -#define COEFPROB_MODELS 128 +#define COEFF_PROB_MODELS 256 #define UNCONSTRAINED_NODES 3 #define PIVOT_NODE 2 // which node is pivot +#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) +extern const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES]; + typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS] - [UNCONSTRAINED_NODES]; + [COEFF_CONTEXTS][UNCONSTRAINED_NODES]; typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS] + [COEFF_CONTEXTS] [UNCONSTRAINED_NODES + 1]; void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); -static int get_entropy_context(TX_SIZE tx_size, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { +static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; switch (tx_size) { @@ -163,49 +155,38 @@ static int get_entropy_context(TX_SIZE tx_size, left_ec = l[0] != 0; break; case TX_8X8: - above_ec = !!*(uint16_t *)a; - left_ec = !!*(uint16_t *)l; + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint16_t *)l; break; case TX_16X16: - above_ec = !!*(uint32_t *)a; - left_ec = !!*(uint32_t *)l; + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint32_t *)l; break; case TX_32X32: - above_ec = !!*(uint64_t *)a; - left_ec = !!*(uint64_t *)l; + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint64_t *)l; break; default: - assert(!"Invalid transform size."); + assert(0 && "Invalid transform size."); } return combine_entropy_contexts(above_ec, left_ec); } -static const uint8_t *get_band_translate(TX_SIZE tx_size) { - return tx_size == TX_4X4 ? 
vp9_coefband_trans_4x4 - : vp9_coefband_trans_8x8plus; -} +static const INLINE scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { + const MODE_INFO *const mi = xd->mi[0]; -static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx, - const int16_t **scan, const int16_t **scan_nb) { - switch (tx_size) { - case TX_4X4: - get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb); - break; - case TX_8X8: - get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb); - break; - case TX_16X16: - get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb); - break; - case TX_32X32: - *scan = vp9_default_scan_32x32; - *scan_nb = vp9_default_scan_32x32_neighbors; - break; - default: - assert(!"Invalid transform size."); + if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) { + return &vp9_default_scan_orders[tx_size]; + } else { + const PREDICTION_MODE mode = get_y_mode(mi, block_idx); + return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]]; } } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ENTROPY_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c index 21c91d6e35e..5b00b0082a1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c @@ -10,7 +10,6 @@ #include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_seg_common.h" @@ -161,51 +160,52 @@ static const vp9_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = { { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm }; -static const vp9_prob default_partition_probs[FRAME_TYPES][PARTITION_CONTEXTS] +const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { + // 8x8 -> 4x4 + { 158, 97, 94 }, // a/l both not split + { 93, 24, 99 }, // a split, l not split + { 85, 119, 44 }, // l split, a not split + { 62, 59, 67 }, // a/l both split + // 16x16 -> 8x8 + { 149, 53, 53 }, // a/l both not split + { 94, 20, 48 }, // a split, l not split + { 83, 53, 24 }, // l split, a not split + { 52, 18, 18 }, // a/l both split + // 32x32 -> 16x16 + { 150, 40, 39 }, // a/l both not split + { 78, 12, 26 }, // a split, l not split + { 67, 33, 11 }, // l split, a not split + { 24, 7, 5 }, // a/l both split + // 64x64 -> 32x32 + { 174, 35, 49 }, // a/l both not split + { 68, 11, 27 }, // a split, l not split + { 57, 15, 9 }, // l split, a not split + { 12, 3, 3 }, // a/l both split +}; + +static const vp9_prob default_partition_probs[PARTITION_CONTEXTS] [PARTITION_TYPES - 1] = { - { // frame_type = keyframe - // 8x8 -> 4x4 - { 158, 97, 94 }, // a/l both not split - { 93, 24, 99 }, // a split, l not split - { 85, 119, 44 }, // l split, a not split - { 62, 59, 67 }, // a/l both split - // 16x16 -> 8x8 - { 149, 53, 53 }, // a/l both not split - { 94, 20, 48 }, // a split, l not split - { 83, 53, 24 }, // l split, a not split - { 52, 18, 18 }, // a/l both split - // 32x32 -> 16x16 - { 150, 40, 39 }, // a/l both not split - { 78, 12, 26 }, // a split, l not split - { 67, 33, 11 }, // l split, a not split - { 24, 7, 5 }, // a/l both split - // 64x64 -> 32x32 - { 174, 35, 49 }, // a/l both not split - { 68, 11, 27 }, // a split, l not split - { 57, 15, 9 }, // l split, a not split - { 12, 3, 3 }, // a/l both split - 
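// The four contexts per block-size group encode whether the above (a) and
// left (l) neighbours are themselves split into smaller partitions. A
// simplified sketch of the index computation these comments describe
// (partition_ctx is an illustrative name; the real code derives the split
// flags from its above/left partition context planes):

#define PARTITION_PLOFFSET 4  // contexts per block-size group

static int partition_ctx(int bsl, int above_split, int left_split) {
  // bsl: 0 for the 8x8 group up to 3 for the 64x64 group; flags are 0 or 1.
  return bsl * PARTITION_PLOFFSET + left_split * 2 + above_split;
}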
}, { // frame_type = interframe - // 8x8 -> 4x4 - { 199, 122, 141 }, // a/l both not split - { 147, 63, 159 }, // a split, l not split - { 148, 133, 118 }, // l split, a not split - { 121, 104, 114 }, // a/l both split - // 16x16 -> 8x8 - { 174, 73, 87 }, // a/l both not split - { 92, 41, 83 }, // a split, l not split - { 82, 99, 50 }, // l split, a not split - { 53, 39, 39 }, // a/l both split - // 32x32 -> 16x16 - { 177, 58, 59 }, // a/l both not split - { 68, 26, 63 }, // a split, l not split - { 52, 79, 25 }, // l split, a not split - { 17, 14, 12 }, // a/l both split - // 64x64 -> 32x32 - { 222, 34, 30 }, // a/l both not split - { 72, 16, 44 }, // a split, l not split - { 58, 32, 12 }, // l split, a not split - { 10, 7, 6 }, // a/l both split - } + // 8x8 -> 4x4 + { 199, 122, 141 }, // a/l both not split + { 147, 63, 159 }, // a split, l not split + { 148, 133, 118 }, // l split, a not split + { 121, 104, 114 }, // a/l both split + // 16x16 -> 8x8 + { 174, 73, 87 }, // a/l both not split + { 92, 41, 83 }, // a split, l not split + { 82, 99, 50 }, // l split, a not split + { 53, 39, 39 }, // a/l both split + // 32x32 -> 16x16 + { 177, 58, 59 }, // a/l both not split + { 68, 26, 63 }, // a split, l not split + { 52, 79, 25 }, // l split, a not split + { 17, 14, 12 }, // a/l both split + // 64x64 -> 32x32 + { 222, 34, 30 }, // a/l both not split + { 72, 16, 44 }, // a split, l not split + { 58, 32, 12 }, // l split, a not split + { 10, 7, 6 }, // a/l both split }; static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS] @@ -231,21 +231,18 @@ const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = { -D63_PRED, 16, /* 7 = D63_NODE */ -D153_PRED, -D207_PRED /* 8 = D153_NODE */ }; -struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = { - -ZEROMV, 2, - -NEARESTMV, 4, - -NEARMV, -NEWMV + -INTER_OFFSET(ZEROMV), 2, + -INTER_OFFSET(NEARESTMV), 4, + -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV) }; -struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = { -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT }; -struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { 9, 102, 187, 225 @@ -305,7 +302,7 @@ void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, ct_8x8p[0][1] = tx_count_8x8p[TX_8X8]; } -static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = { +static const vp9_prob default_skip_probs[SKIP_CONTEXTS] = { 192, 128, 64 }; @@ -317,17 +314,18 @@ static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] { 149, 144, }, }; -void vp9_init_mbmode_probs(VP9_COMMON *cm) { - vp9_copy(cm->fc.uv_mode_prob, default_if_uv_probs); - vp9_copy(cm->fc.y_mode_prob, default_if_y_probs); - vp9_copy(cm->fc.switchable_interp_prob, default_switchable_interp_prob); - vp9_copy(cm->fc.partition_prob, default_partition_probs); - vp9_copy(cm->fc.intra_inter_prob, default_intra_inter_p); - vp9_copy(cm->fc.comp_inter_prob, default_comp_inter_p); - vp9_copy(cm->fc.comp_ref_prob, default_comp_ref_p); - vp9_copy(cm->fc.single_ref_prob, default_single_ref_p); - cm->fc.tx_probs = default_tx_probs; - vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs); +void vp9_init_mode_probs(FRAME_CONTEXT *fc) { + vp9_copy(fc->uv_mode_prob, default_if_uv_probs); + vp9_copy(fc->y_mode_prob, default_if_y_probs); + 
vp9_copy(fc->switchable_interp_prob, default_switchable_interp_prob); + vp9_copy(fc->partition_prob, default_partition_probs); + vp9_copy(fc->intra_inter_prob, default_intra_inter_p); + vp9_copy(fc->comp_inter_prob, default_comp_inter_p); + vp9_copy(fc->comp_ref_prob, default_comp_ref_p); + vp9_copy(fc->single_ref_prob, default_single_ref_p); + fc->tx_probs = default_tx_probs; + vp9_copy(fc->skip_probs, default_skip_probs); + vp9_copy(fc->inter_mode_probs, default_inter_mode_probs); } const vp9_tree_index vp9_switchable_interp_tree @@ -335,43 +333,19 @@ const vp9_tree_index vp9_switchable_interp_tree -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; -struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; - -void vp9_entropy_mode_init() { - vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree); - vp9_tokens_from_tree(vp9_switchable_interp_encodings, - vp9_switchable_interp_tree); - vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree); - vp9_tokens_from_tree_offset(vp9_inter_mode_encodings, - vp9_inter_mode_tree, NEARESTMV); -} #define COUNT_SAT 20 #define MAX_UPDATE_FACTOR 128 -static int update_ct(vp9_prob pre_prob, vp9_prob prob, - const unsigned int ct[2]) { - return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); -} - -static int update_ct2(vp9_prob pre_prob, const unsigned int ct[2]) { - return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); +static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) { + return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); } -static void update_mode_probs(int n_modes, - const vp9_tree_index *tree, - const unsigned int *cnt, - const vp9_prob *pre_probs, vp9_prob *dst_probs, - unsigned int tok0_offset) { -#define MAX_PROBS 32 - vp9_prob probs[MAX_PROBS]; - unsigned int branch_ct[MAX_PROBS][2]; - int t; - - assert(n_modes - 1 < MAX_PROBS); - vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset); - for (t = 0; t < n_modes - 1; ++t) - dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]); +static void adapt_probs(const vp9_tree_index *tree, + const vp9_prob *pre_probs, const unsigned int *counts, + vp9_prob *probs) { + vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, + probs); } void vp9_adapt_mode_probs(VP9_COMMON *cm) { @@ -381,46 +355,39 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { const FRAME_COUNTS *counts = &cm->counts; for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i], + fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i], counts->intra_inter[i]); for (i = 0; i < COMP_INTER_CONTEXTS; i++) - fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i], + fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i], counts->comp_inter[i]); for (i = 0; i < REF_CONTEXTS; i++) - fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i], + fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i], counts->comp_ref[i]); for (i = 0; i < REF_CONTEXTS; i++) for (j = 0; j < 2; j++) - fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j], + fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]); for (i = 0; i < INTER_MODE_CONTEXTS; i++) - update_mode_probs(INTER_MODES, vp9_inter_mode_tree, - counts->inter_mode[i], pre_fc->inter_mode_probs[i], - fc->inter_mode_probs[i], NEARESTMV); + adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i], + counts->inter_mode[i], 
fc->inter_mode_probs[i]); for (i = 0; i < BLOCK_SIZE_GROUPS; i++) - update_mode_probs(INTRA_MODES, vp9_intra_mode_tree, - counts->y_mode[i], pre_fc->y_mode_prob[i], - fc->y_mode_prob[i], 0); + adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i], + counts->y_mode[i], fc->y_mode_prob[i]); for (i = 0; i < INTRA_MODES; ++i) - update_mode_probs(INTRA_MODES, vp9_intra_mode_tree, - counts->uv_mode[i], pre_fc->uv_mode_prob[i], - fc->uv_mode_prob[i], 0); + adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i], + counts->uv_mode[i], fc->uv_mode_prob[i]); for (i = 0; i < PARTITION_CONTEXTS; i++) - update_mode_probs(PARTITION_TYPES, vp9_partition_tree, - counts->partition[i], - pre_fc->partition_prob[INTER_FRAME][i], - fc->partition_prob[INTER_FRAME][i], 0); + adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i], + counts->partition[i], fc->partition_prob[i]); - if (cm->mcomp_filter_type == SWITCHABLE) { + if (cm->interp_filter == SWITCHABLE) { for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) - update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree, - counts->switchable_interp[i], - pre_fc->switchable_interp_prob[i], - fc->switchable_interp_prob[i], 0); + adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i], + counts->switchable_interp[i], fc->switchable_interp_prob[i]); } if (cm->tx_mode == TX_MODE_SELECT) { @@ -432,24 +399,23 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p); for (j = 0; j < TX_SIZES - 3; ++j) - fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j], + fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]); tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); for (j = 0; j < TX_SIZES - 2; ++j) - fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j], + fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]); tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); for (j = 0; j < TX_SIZES - 1; ++j) - fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j], + fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]); } } - for (i = 0; i < MBSKIP_CONTEXTS; ++i) - fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i], - counts->mbskip[i]); + for (i = 0; i < SKIP_CONTEXTS; ++i) + fc->skip_probs[i] = adapt_prob(pre_fc->skip_probs[i], counts->skip[i]); } static void set_default_lf_deltas(struct loopfilter *lf) { @@ -485,27 +451,24 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { lf->last_sharpness_level = -1; vp9_default_coef_probs(cm); - vp9_init_mbmode_probs(cm); + vp9_init_mode_probs(&cm->fc); vp9_init_mv_probs(cm); - vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs); if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->reset_frame_context == 3) { // Reset all frame contexts. - for (i = 0; i < NUM_FRAME_CONTEXTS; ++i) + for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = cm->fc; } else if (cm->reset_frame_context == 2) { // Reset only the frame context specified in the frame header. 
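
A note on the adapt_prob()/adapt_probs() refactor above: merge_probs() performs a count-saturated weighted average between the previous frame's probability and the probability implied by this frame's branch counts, and vp9_tree_merge_probs() applies the same merge at every internal node of a vp9_tree_index tree. A minimal sketch of that merge, assuming the usual 8-bit vp9_prob conventions; the _sketch helper names are illustrative, not the library's:

#include <stdint.h>

typedef uint8_t vp9_prob;

/* Probability (x/256) that the 0-branch is taken, clamped to [1, 255]
 * so no branch ever becomes impossible to code. */
static vp9_prob get_binary_prob_sketch(unsigned int n0, unsigned int n1) {
  const unsigned int den = n0 + n1;
  unsigned int p;
  if (den == 0) return 128;  /* no evidence: stay neutral */
  p = (n0 * 256 + den / 2) / den;
  return (vp9_prob)(p < 1 ? 1 : (p > 255 ? 255 : p));
}

/* Count-saturated merge: with few samples we mostly keep the previous
 * frame's probability; once the count reaches count_sat we move toward
 * the observed probability by max_update_factor/256. */
static vp9_prob merge_probs_sketch(vp9_prob pre_prob, const unsigned int ct[2],
                                   unsigned int count_sat,
                                   unsigned int max_update_factor) {
  const vp9_prob prob = get_binary_prob_sketch(ct[0], ct[1]);
  const unsigned int count =
      ct[0] + ct[1] < count_sat ? ct[0] + ct[1] : count_sat;
  const unsigned int factor = max_update_factor * count / count_sat;
  return (vp9_prob)((pre_prob * (256 - factor) + prob * factor + 128) >> 8);
}

With COUNT_SAT = 20 and MAX_UPDATE_FACTOR = 128 as defined above, a fully saturated count moves at most halfway (128/256) from the previous probability toward the newly observed one.
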
cm->frame_contexts[cm->frame_context_idx] = cm->fc; } - vpx_memset(cm->prev_mip, 0, - cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); - vpx_memset(cm->mip, 0, - cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + if (frame_is_intra_only(cm)) + vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) * + sizeof(*cm->prev_mip)); - vp9_update_mode_info_border(cm, cm->mip); - vp9_update_mode_info_border(cm, cm->prev_mip); + vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); vp9_zero(cm->ref_frame_sign_bias); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h index ea9655577be..533757bef03 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h @@ -12,14 +12,17 @@ #define VP9_COMMON_VP9_ENTROPYMODE_H_ #include "vp9/common/vp9_blockd.h" -#include "vp9/common/vp9_treecoder.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymv.h" + +#ifdef __cplusplus +extern "C" { +#endif #define TX_SIZE_CONTEXTS 2 #define SWITCHABLE_FILTERS 3 // number of switchable filters #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) -// #define MODE_STATS - struct VP9Common; struct tx_probs { @@ -34,28 +37,56 @@ struct tx_counts { unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2]; }; +typedef struct frame_contexts { + vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; + vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; + vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1]; + vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES]; + vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS - 1]; + vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; + vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; + vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; + vp9_prob single_ref_prob[REF_CONTEXTS][2]; + vp9_prob comp_ref_prob[REF_CONTEXTS]; + struct tx_probs tx_probs; + vp9_prob skip_probs[SKIP_CONTEXTS]; + nmv_context nmvc; +} FRAME_CONTEXT; + +typedef struct { + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[INTRA_MODES][INTRA_MODES]; + unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES]; + vp9_coeff_count_model coef[TX_SIZES][PLANE_TYPES]; + unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES] + [COEF_BANDS][COEFF_CONTEXTS]; + unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; + unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; + unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; + unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; + unsigned int single_ref[REF_CONTEXTS][2][2]; + unsigned int comp_ref[REF_CONTEXTS][2]; + struct tx_counts tx; + unsigned int skip[SKIP_CONTEXTS][2]; + nmv_context_counts mv; +} FRAME_COUNTS; + extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES] [INTRA_MODES - 1]; - +extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1]; extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)]; -extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; - extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)]; -extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; - extern const vp9_tree_index 
vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)]; -extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; - extern const vp9_tree_index vp9_switchable_interp_tree [TREE_SIZE(SWITCHABLE_FILTERS)]; -extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; - -void vp9_entropy_mode_init(); void vp9_setup_past_independence(struct VP9Common *cm); -void vp9_init_mbmode_probs(struct VP9Common *cm); +void vp9_init_mode_probs(FRAME_CONTEXT *fc); void vp9_adapt_mode_probs(struct VP9Common *cm); @@ -66,4 +97,17 @@ void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, unsigned int (*ct_8x8p)[2]); +static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi, + const MODE_INFO *above_mi, + const MODE_INFO *left_mi, + int block) { + const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block); + const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block); + return vp9_kf_y_mode_prob[above][left]; +} + +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c index f70b571efe3..5bb048202b7 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c @@ -8,14 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_entropymv.h" #define MV_COUNT_SAT 20 #define MV_MAX_UPDATE_FACTOR 128 -/* Integer pel reference mv threshold for use of high-precision 1/8 mv */ +// Integer pel reference mv threshold for use of high-precision 1/8 mv #define COMPANDED_MVREF_THRESH 8 const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = { @@ -23,7 +22,6 @@ const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = { -MV_JOINT_HNZVZ, 4, -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ }; -struct vp9_token vp9_mv_joint_encodings[MV_JOINTS]; const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { -MV_CLASS_0, 2, @@ -37,48 +35,43 @@ const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10, }; -struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1, }; -struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE]; -const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)] = { +const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1, 4, -2, -3 }; -struct vp9_token vp9_mv_fp_encodings[4]; static const nmv_context default_nmv_context = { {32, 64, 96}, - { // NOLINT - { /* vert component */ // NOLINT - 128, /* sign */ - {224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, /* class */ - {216}, /* class0 */ - {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */ - {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ - {64, 96, 64}, /* fp */ - 160, /* class0_hp bit */ - 128, /* hp */ + { + { // Vertical component + 128, // sign + {224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, // class + {216}, // class0 + {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, // bits + {{128, 128, 64}, {96, 112, 64}}, // class0_fp + {64, 96, 64}, // fp + 160, // class0_hp bit + 128, // hp }, - { /* hor component */ // NOLINT - 128, /* sign */ - {216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, /* class */ - 
{208}, /* class0 */ - {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */ - {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ - {64, 96, 64}, /* fp */ - 160, /* class0_hp bit */ - 128, /* hp */ + { // Horizontal component + 128, // sign + {216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, // class + {208}, // class0 + {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, // bits + {{128, 128, 64}, {96, 112, 64}}, // class0_fp + {64, 96, 64}, // fp + 160, // class0_hp bit + 128, // hp } }, }; -#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0) - static const uint8_t log_in_base_2[] = { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, @@ -125,13 +118,13 @@ static const uint8_t log_in_base_2[] = { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 }; -MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { - MV_CLASS_TYPE c = MV_CLASS_0; - if (z >= CLASS0_SIZE * 4096) - c = MV_CLASS_10; - else - c = log_in_base_2[z >> 3]; +static INLINE int mv_class_base(MV_CLASS_TYPE c) { + return c ? CLASS0_SIZE << (c + 2) : 0; +} +MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { + const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096) ? + MV_CLASS_10 : (MV_CLASS_TYPE)log_in_base_2[z >> 3]; if (offset) *offset = z - mv_class_base(c); return c; @@ -191,71 +184,50 @@ void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) { } static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { - return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); + return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); } -static unsigned int adapt_probs(unsigned int i, - vp9_tree tree, - vp9_prob this_probs[], - const vp9_prob last_probs[], - const unsigned int num_events[]) { - const unsigned int left = tree[i] <= 0 - ? num_events[-tree[i]] - : adapt_probs(tree[i], tree, this_probs, last_probs, num_events); - - const unsigned int right = tree[i + 1] <= 0 - ? 
num_events[-tree[i + 1]] - : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events); - const unsigned int ct[2] = { left, right }; - this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct); - return left + right; +static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, vp9_prob *probs) { + vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, + MV_MAX_UPDATE_FACTOR, probs); } - void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { int i, j; - const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; - - nmv_context *ctx = &cm->fc.nmvc; - const nmv_context *pre_ctx = &pre_fc->nmvc; - const nmv_context_counts *cts = &cm->counts.mv; + nmv_context *fc = &cm->fc.nmvc; + const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc; + const nmv_context_counts *counts = &cm->counts.mv; - adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints); + adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, fc->joints); for (i = 0; i < 2; ++i) { - ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign); - adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes, - pre_ctx->comps[i].classes, cts->comps[i].classes); - adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0, - pre_ctx->comps[i].class0, cts->comps[i].class0); + nmv_component *comp = &fc->comps[i]; + const nmv_component *pre_comp = &pre_fc->comps[i]; + const nmv_component_counts *c = &counts->comps[i]; + + comp->sign = adapt_prob(pre_comp->sign, c->sign); + adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes, + comp->classes); + adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0); for (j = 0; j < MV_OFFSET_BITS; ++j) - ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j], - cts->comps[i].bits[j]); + comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]); for (j = 0; j < CLASS0_SIZE; ++j) - adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j], - pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]); + adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j], + comp->class0_fp[j]); - adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp, - cts->comps[i].fp); + adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp); if (allow_hp) { - ctx->comps[i].class0_hp = adapt_prob(pre_ctx->comps[i].class0_hp, - cts->comps[i].class0_hp); - ctx->comps[i].hp = adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp); + comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp); + comp->hp = adapt_prob(pre_comp->hp, c->hp); } } } -void vp9_entropy_mv_init() { - vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree); - vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree); - vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree); - vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree); -} - void vp9_init_mv_probs(VP9_COMMON *cm) { cm->fc.nmvc = default_nmv_context; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.h index d843f5bfee1..e7033e437bd 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.h @@ -12,19 +12,21 @@ #ifndef VP9_COMMON_VP9_ENTROPYMV_H_ #define VP9_COMMON_VP9_ENTROPYMV_H_ -#include "vp9/common/vp9_treecoder.h" #include "./vpx_config.h" #include "vp9/common/vp9_blockd.h" +#ifdef 
__cplusplus +extern "C" { +#endif + struct VP9Common; -void vp9_entropy_mv_init(); void vp9_init_mv_probs(struct VP9Common *cm); void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); int vp9_use_mv_hp(const MV *ref); -#define NMV_UPDATE_PROB 252 +#define MV_UPDATE_PROB 252 /* Symbols for coding which components are zero jointly */ #define MV_JOINTS 4 @@ -62,6 +64,7 @@ typedef enum { #define CLASS0_BITS 1 /* bits at integer precision for class 0 */ #define CLASS0_SIZE (1 << CLASS0_BITS) #define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) +#define MV_FP_SIZE 4 #define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) #define MV_MAX ((1 << MV_MAX_BITS) - 1) @@ -71,25 +74,18 @@ typedef enum { #define MV_UPP ((1 << MV_IN_USE_BITS) - 1) #define MV_LOW (-(1 << MV_IN_USE_BITS)) -extern const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)]; -extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS]; - -extern const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)]; -extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; - -extern const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)]; -extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE]; - -extern const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)]; -extern struct vp9_token vp9_mv_fp_encodings[4]; +extern const vp9_tree_index vp9_mv_joint_tree[]; +extern const vp9_tree_index vp9_mv_class_tree[]; +extern const vp9_tree_index vp9_mv_class0_tree[]; +extern const vp9_tree_index vp9_mv_fp_tree[]; typedef struct { vp9_prob sign; vp9_prob classes[MV_CLASSES - 1]; vp9_prob class0[CLASS0_SIZE - 1]; vp9_prob bits[MV_OFFSET_BITS]; - vp9_prob class0_fp[CLASS0_SIZE][4 - 1]; - vp9_prob fp[4 - 1]; + vp9_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1]; + vp9_prob fp[MV_FP_SIZE - 1]; vp9_prob class0_hp; vp9_prob hp; } nmv_component; @@ -116,8 +112,8 @@ typedef struct { unsigned int classes[MV_CLASSES]; unsigned int class0[CLASS0_SIZE]; unsigned int bits[MV_OFFSET_BITS][2]; - unsigned int class0_fp[CLASS0_SIZE][4]; - unsigned int fp[4]; + unsigned int class0_fp[CLASS0_SIZE][MV_FP_SIZE]; + unsigned int fp[MV_FP_SIZE]; unsigned int class0_hp[2]; unsigned int hp[2]; } nmv_component_counts; @@ -129,4 +125,8 @@ typedef struct { void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h index 1651b905055..068284faa90 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h @@ -13,6 +13,10 @@ #include "./vpx_config.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MI_SIZE_LOG2 3 #define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2) // 64 = 2^6 @@ -21,6 +25,23 @@ #define MI_MASK (MI_BLOCK_SIZE - 1) +// Bitstream profiles indicated by 2 bits in the uncompressed header. +// 00: Profile 0. 4:2:0 only. +// 10: Profile 1. adds 4:4:4, 4:2:2, alpha. +// 01: Profile 2. Supports 10-bit and 12-bit color only. +// 11: Undefined profile. 
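
Returning to the vp9_get_mv_class() cleanup above: a motion-vector component magnitude z is split into a class, roughly its integer log2, and an offset from that class's base. A hedged sketch that replaces the log_in_base_2 lookup table with an equivalent loop (the _sketch names are illustrative):

#define CLASS0_BITS 1
#define CLASS0_SIZE (1 << CLASS0_BITS)
#define MV_CLASS_10 10

/* Smallest magnitude that falls into class c: 0, 16, 32, 64, ... */
static int mv_class_base_sketch(int c) {
  return c ? CLASS0_SIZE << (c + 2) : 0;
}

/* Equivalent of vp9_get_mv_class(): class 0 covers [0, 16), classes
 * 1..9 each cover one octave, class 10 is open-ended from 8192 up. */
static int get_mv_class_sketch(int z, int *offset) {
  int c = 0;
  while (c < MV_CLASS_10 && z >= mv_class_base_sketch(c + 1))
    ++c;
  if (offset)
    *offset = z - mv_class_base_sketch(c);
  return c;
}

Within a class, the integer offset is coded with the class0/bits fields, the quarter-pel fraction with the 4-entry fp tree (hence the new MV_FP_SIZE constant), and the extra eighth-pel bit with hp.
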
+typedef enum BITSTREAM_PROFILE { + PROFILE_0, + PROFILE_1, + PROFILE_2, + MAX_PROFILES +} BITSTREAM_PROFILE; + +typedef enum BIT_DEPTH { + BITS_8, + BITS_10, + BITS_12 +} BIT_DEPTH; typedef enum BLOCK_SIZE { BLOCK_4X4, @@ -52,20 +73,22 @@ typedef enum PARTITION_TYPE { #define PARTITION_PLOFFSET 4 // number of probability models per block size #define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET) +// block transform size typedef enum { - TX_4X4 = 0, // 4x4 dct transform - TX_8X8 = 1, // 8x8 dct transform - TX_16X16 = 2, // 16x16 dct transform - TX_32X32 = 3, // 32x32 dct transform + TX_4X4 = 0, // 4x4 transform + TX_8X8 = 1, // 8x8 transform + TX_16X16 = 2, // 16x16 transform + TX_32X32 = 3, // 32x32 transform TX_SIZES } TX_SIZE; +// frame transform mode typedef enum { - ONLY_4X4 = 0, - ALLOW_8X8 = 1, - ALLOW_16X16 = 2, - ALLOW_32X32 = 3, - TX_MODE_SELECT = 4, + ONLY_4X4 = 0, // only 4x4 transform used + ALLOW_8X8 = 1, // allow block transform size up to 8x8 + ALLOW_16X16 = 2, // allow block transform size up to 16x16 + ALLOW_32X32 = 3, // allow block transform size up to 32x32 + TX_MODE_SELECT = 4, // transform specified for each block TX_MODES = 5, } TX_MODE; @@ -73,7 +96,8 @@ typedef enum { DCT_DCT = 0, // DCT in both horizontal and vertical ADST_DCT = 1, // ADST in vertical, DCT in horizontal DCT_ADST = 2, // DCT in vertical, ADST in horizontal - ADST_ADST = 3 // ADST in both directions + ADST_ADST = 3, // ADST in both directions + TX_TYPES = 4 } TX_TYPE; typedef enum { @@ -87,4 +111,14 @@ typedef enum { SRGB = 7 // RGB } COLOR_SPACE; +typedef enum { + VP9_LAST_FLAG = 1 << 0, + VP9_GOLD_FLAG = 1 << 1, + VP9_ALT_FLAG = 1 << 2, +} VP9_REFFRAME; + +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ENUMS_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_extend.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_extend.c deleted file mode 100644 index 07c68c84a89..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_extend.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "vpx_mem/vpx_mem.h" - -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_extend.h" - -static void copy_and_extend_plane(const uint8_t *src, int src_pitch, - uint8_t *dst, int dst_pitch, - int w, int h, - int extend_top, int extend_left, - int extend_bottom, int extend_right) { - int i, linesize; - - // copy the left and right most columns out - const uint8_t *src_ptr1 = src; - const uint8_t *src_ptr2 = src + w - 1; - uint8_t *dst_ptr1 = dst - extend_left; - uint8_t *dst_ptr2 = dst + w; - - for (i = 0; i < h; i++) { - vpx_memset(dst_ptr1, src_ptr1[0], extend_left); - vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w); - vpx_memset(dst_ptr2, src_ptr2[0], extend_right); - src_ptr1 += src_pitch; - src_ptr2 += src_pitch; - dst_ptr1 += dst_pitch; - dst_ptr2 += dst_pitch; - } - - // Now copy the top and bottom lines into each line of the respective - // borders - src_ptr1 = dst - extend_left; - src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; - dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; - dst_ptr2 = dst + dst_pitch * (h) - extend_left; - linesize = extend_left + extend_right + w; - - for (i = 0; i < extend_top; i++) { - vpx_memcpy(dst_ptr1, src_ptr1, linesize); - dst_ptr1 += dst_pitch; - } - - for (i = 0; i < extend_bottom; i++) { - vpx_memcpy(dst_ptr2, src_ptr2, linesize); - dst_ptr2 += dst_pitch; - } -} - -void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { - // Extend src frame in buffer - // Altref filtering assumes 16 pixel extension - const int et_y = 16; - const int el_y = 16; - // Motion estimation may use src block variance with the block size up - // to 64x64, so the right and bottom need to be extended to 64 mulitple - // or up to 16, whichever is greater. - const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width, - 16); - const int er_y = MAX(ALIGN_POWER_OF_TWO(src->y_height, 6) - src->y_height, - 16); - const int uv_width_subsampling = (src->uv_width != src->y_width); - const int uv_height_subsampling = (src->uv_height != src->y_height); - const int et_uv = et_y >> uv_height_subsampling; - const int el_uv = el_y >> uv_width_subsampling; - const int eb_uv = eb_y >> uv_height_subsampling; - const int er_uv = er_y >> uv_width_subsampling; - -#if CONFIG_ALPHA - const int et_a = dst->border >> (dst->alpha_height != dst->y_height); - const int el_a = dst->border >> (dst->alpha_width != dst->y_width); - const int eb_a = et_a + dst->alpha_height - src->alpha_height; - const int er_a = el_a + dst->alpha_width - src->alpha_width; - - copy_and_extend_plane(src->alpha_buffer, src->alpha_stride, - dst->alpha_buffer, dst->alpha_stride, - src->alpha_width, src->alpha_height, - et_a, el_a, eb_a, er_a); -#endif - - copy_and_extend_plane(src->y_buffer, src->y_stride, - dst->y_buffer, dst->y_stride, - src->y_width, src->y_height, - et_y, el_y, eb_y, er_y); - - copy_and_extend_plane(src->u_buffer, src->uv_stride, - dst->u_buffer, dst->uv_stride, - src->uv_width, src->uv_height, - et_uv, el_uv, eb_uv, er_uv); - - copy_and_extend_plane(src->v_buffer, src->uv_stride, - dst->v_buffer, dst->uv_stride, - src->uv_width, src->uv_height, - et_uv, el_uv, eb_uv, er_uv); -} - -void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - int srcy, int srcx, - int srch, int srcw) { - // If the side is not touching the bounder then don't extend. - const int et_y = srcy ? 0 : dst->border; - const int el_y = srcx ? 0 : dst->border; - const int eb_y = srcy + srch != src->y_height ? 
0 : - dst->border + dst->y_height - src->y_height; - const int er_y = srcx + srcw != src->y_width ? 0 : - dst->border + dst->y_width - src->y_width; - const int src_y_offset = srcy * src->y_stride + srcx; - const int dst_y_offset = srcy * dst->y_stride + srcx; - - const int et_uv = ROUND_POWER_OF_TWO(et_y, 1); - const int el_uv = ROUND_POWER_OF_TWO(el_y, 1); - const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1); - const int er_uv = ROUND_POWER_OF_TWO(er_y, 1); - const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); - const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); - const int srch_uv = ROUND_POWER_OF_TWO(srch, 1); - const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1); - - copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, - dst->y_buffer + dst_y_offset, dst->y_stride, - srcw, srch, - et_y, el_y, eb_y, er_y); - - copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride, - dst->u_buffer + dst_uv_offset, dst->uv_stride, - srcw_uv, srch_uv, - et_uv, el_uv, eb_uv, er_uv); - - copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride, - dst->v_buffer + dst_uv_offset, dst->uv_stride, - srcw_uv, srch_uv, - et_uv, el_uv, eb_uv, er_uv); -} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_extend.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_extend.h deleted file mode 100644 index 7ff79b7b6b3..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_extend.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_COMMON_VP9_EXTEND_H_ -#define VP9_COMMON_VP9_EXTEND_H_ - -#include "vpx_scale/yv12config.h" -#include "vpx/vpx_integer.h" - - -void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst); - -void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - int srcy, int srcx, - int srch, int srcw); -#endif // VP9_COMMON_VP9_EXTEND_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.c index 8f24052c7aa..7474a88bcce 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.c @@ -10,12 +10,9 @@ #include <assert.h> -#include "vpx_ports/mem.h" - #include "vp9/common/vp9_filter.h" -DECLARE_ALIGNED(256, const subpel_kernel, - vp9_bilinear_filters[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, @@ -35,8 +32,7 @@ DECLARE_ALIGNED(256, const subpel_kernel, }; // Lagrangian interpolation filter -DECLARE_ALIGNED(256, const subpel_kernel, - vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0}, { 0, 1, -5, 126, 8, -3, 1, 0}, { -1, 3, -10, 122, 18, -6, 2, 0}, @@ -56,8 +52,7 @@ DECLARE_ALIGNED(256, const subpel_kernel, }; // DCT based filter -DECLARE_ALIGNED(256, const subpel_kernel, - vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS] = { {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, {-2, 5, -13, 125, 17, -6, 3, -1}, @@ -77,8 +72,7 @@ DECLARE_ALIGNED(256, const subpel_kernel, }; // freqmultiplier = 0.5 -DECLARE_ALIGNED(256, const subpel_kernel, - vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, {-2, -2, 29, 63, 41, 2, -3, 0}, @@ -97,19 +91,16 @@ DECLARE_ALIGNED(256, const subpel_kernel, { 0, -3, 1, 38, 64, 32, -1, -3} }; -const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) { - switch (type) { - case EIGHTTAP: - return vp9_sub_pel_filters_8; - case EIGHTTAP_SMOOTH: - return vp9_sub_pel_filters_8lp; - case EIGHTTAP_SHARP: - return vp9_sub_pel_filters_8s; - case BILINEAR: - return vp9_bilinear_filters; - default: - assert(!"Invalid interpolation type."); - return NULL; - } + +static const InterpKernel* vp9_filter_kernels[4] = { + vp9_sub_pel_filters_8, + vp9_sub_pel_filters_8lp, + vp9_sub_pel_filters_8s, + vp9_bilinear_filters +}; + +const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter) { + assert(filter != SWITCHABLE); + return vp9_filter_kernels[filter]; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h index 8652a6e3be5..29d3867c931 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h @@ -13,6 +13,12 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + + +#ifdef __cplusplus +extern "C" { +#endif #define FILTER_BITS 7 @@ -27,26 +33,28 @@ typedef enum { EIGHTTAP_SHARP = 2, BILINEAR = 3, SWITCHABLE = 4 /* should be the last one */ -} INTERPOLATION_TYPE; +} INTERP_FILTER; -typedef int16_t 
subpel_kernel[SUBPEL_TAPS]; +typedef int16_t InterpKernel[SUBPEL_TAPS]; -struct subpix_fn_table { - const subpel_kernel *filter_x; - const subpel_kernel *filter_y; -}; +const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter); -const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type); - -extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS]; -extern const subpel_kernel vp9_sub_pel_filters_6[SUBPEL_SHIFTS]; -extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS]; -extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]; -extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]; +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_bilinear_filters[SUBPEL_SHIFTS]); +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_sub_pel_filters_8[SUBPEL_SHIFTS]); +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]); +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]); // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear // filter kernel as a 2 tap filter. #define BILINEAR_FILTERS_2TAP(x) \ (vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1) +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_FILTER_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_findnearmv.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_findnearmv.c deleted file mode 100644 index b91c501435e..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_findnearmv.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp9/common/vp9_findnearmv.h" -#include "vp9/common/vp9_mvref_common.h" - -static void lower_mv_precision(MV *mv, int allow_hp) { - const int use_hp = allow_hp && vp9_use_mv_hp(mv); - if (!use_hp) { - if (mv->row & 1) - mv->row += (mv->row > 0 ? -1 : 1); - if (mv->col & 1) - mv->col += (mv->col > 0 ? 
-1 : 1); - } -} - - -void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, - int_mv *mvlist, int_mv *nearest, int_mv *near) { - int i; - // Make sure all the candidates are properly clamped etc - for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { - lower_mv_precision(&mvlist[i].as_mv, allow_hp); - clamp_mv2(&mvlist[i].as_mv, xd); - } - *nearest = mvlist[0]; - *near = mvlist[1]; -} - -void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, - const TileInfo *const tile, - int_mv *dst_nearest, - int_mv *dst_near, - int block_idx, int ref_idx, - int mi_row, int mi_col) { - int_mv dst_list[MAX_MV_REF_CANDIDATES]; - int_mv mv_list[MAX_MV_REF_CANDIDATES]; - MODE_INFO *const mi = xd->mi_8x8[0]; - - assert(ref_idx == 0 || ref_idx == 1); - assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier - - vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, - mi->mbmi.ref_frame[ref_idx], - mv_list, block_idx, mi_row, mi_col); - - dst_list[1].as_int = 0; - if (block_idx == 0) { - vpx_memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv)); - } else if (block_idx == 1 || block_idx == 2) { - int dst = 0, n; - b_mode_info *bmi = mi->bmi; - - dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int; - for (n = 0; dst < MAX_MV_REF_CANDIDATES && - n < MAX_MV_REF_CANDIDATES; n++) - if (mv_list[n].as_int != dst_list[0].as_int) - dst_list[dst++].as_int = mv_list[n].as_int; - } else { - int dst = 0, n; - b_mode_info *bmi = mi->bmi; - - assert(block_idx == 3); - dst_list[dst++].as_int = bmi[2].as_mv[ref_idx].as_int; - if (dst_list[0].as_int != bmi[1].as_mv[ref_idx].as_int) - dst_list[dst++].as_int = bmi[1].as_mv[ref_idx].as_int; - if (dst < MAX_MV_REF_CANDIDATES && - dst_list[0].as_int != bmi[0].as_mv[ref_idx].as_int) - dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int; - for (n = 0; dst < MAX_MV_REF_CANDIDATES && - n < MAX_MV_REF_CANDIDATES; n++) - if (mv_list[n].as_int != dst_list[0].as_int) - dst_list[dst++].as_int = mv_list[n].as_int; - } - - dst_nearest->as_int = dst_list[0].as_int; - dst_near->as_int = dst_list[1].as_int; -} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_findnearmv.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_findnearmv.h deleted file mode 100644 index 2362caa417d..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_findnearmv.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#ifndef VP9_COMMON_VP9_FINDNEARMV_H_ -#define VP9_COMMON_VP9_FINDNEARMV_H_ - -#include "vp9/common/vp9_mv.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/common/vp9_treecoder.h" -#include "vp9/common/vp9_onyxc_int.h" - -#define LEFT_TOP_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3) -#define RIGHT_BOTTOM_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3) - -// check a list of motion vectors by sad score using a number rows of pixels -// above and a number cols of pixels in the left to select the one with best -// score to use as ref motion vector -void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, - int_mv *mvlist, int_mv *nearest, int_mv *near); - -// TODO(jingning): this mv clamping function should be block size dependent. -static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { - clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, - xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, - xd->mb_to_top_edge - LEFT_TOP_MARGIN, - xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); -} - -void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, - const TileInfo *const tile, - int_mv *dst_nearest, - int_mv *dst_near, - int block_idx, int ref_idx, - int mi_row, int mi_col); - -static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *left_mi, int b) { - if (b == 0 || b == 2) { - if (!left_mi || is_inter_block(&left_mi->mbmi)) - return DC_PRED; - - return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode - : left_mi->mbmi.mode; - } else { - assert(b == 1 || b == 3); - return cur_mi->bmi[b - 1].as_mode; - } -} - -static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *above_mi, int b) { - if (b == 0 || b == 1) { - if (!above_mi || is_inter_block(&above_mi->mbmi)) - return DC_PRED; - - return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode - : above_mi->mbmi.mode; - } else { - assert(b == 2 || b == 3); - return cur_mi->bmi[b - 2].as_mode; - } -} - -#endif // VP9_COMMON_VP9_FINDNEARMV_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c new file mode 100644 index 00000000000..a0b1e039ca4 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "vp9/common/vp9_frame_buffers.h" +#include "vpx_mem/vpx_mem.h" + +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) { + assert(list != NULL); + vp9_free_internal_frame_buffers(list); + + list->num_internal_frame_buffers = + VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + list->int_fb = + (InternalFrameBuffer *)vpx_calloc(list->num_internal_frame_buffers, + sizeof(*list->int_fb)); + return (list->int_fb == NULL); +} + +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { + int i; + + assert(list != NULL); + + for (i = 0; i < list->num_internal_frame_buffers; ++i) { + vpx_free(list->int_fb[i].data); + list->int_fb[i].data = NULL; + } + vpx_free(list->int_fb); + list->int_fb = NULL; +} + +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + int i; + InternalFrameBufferList *const int_fb_list = + (InternalFrameBufferList *)cb_priv; + if (int_fb_list == NULL) + return -1; + + // Find a free frame buffer. + for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) { + if (!int_fb_list->int_fb[i].in_use) + break; + } + + if (i == int_fb_list->num_internal_frame_buffers) + return -1; + + if (int_fb_list->int_fb[i].size < min_size) { + int_fb_list->int_fb[i].data = + (uint8_t *)vpx_realloc(int_fb_list->int_fb[i].data, min_size); + if (!int_fb_list->int_fb[i].data) + return -1; + + int_fb_list->int_fb[i].size = min_size; + } + + fb->data = int_fb_list->int_fb[i].data; + fb->size = int_fb_list->int_fb[i].size; + int_fb_list->int_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the internal frame buffer. + fb->priv = &int_fb_list->int_fb[i]; + return 0; +} + +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) { + InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; + (void)cb_priv; + int_fb->in_use = 0; + return 0; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.h new file mode 100644 index 00000000000..e2cfe61b662 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#define VP9_COMMON_VP9_FRAME_BUFFERS_H_ + +#include "vpx/vpx_frame_buffer.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct InternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +} InternalFrameBuffer; + +typedef struct InternalFrameBufferList { + int num_internal_frame_buffers; + InternalFrameBuffer *int_fb; +} InternalFrameBufferList; + +// Initializes |list|. Returns 0 on success. +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list); + +// Free any data allocated to the frame buffers. +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list); + +// Callback used by libvpx to request an external frame buffer. |cb_priv| +// Callback private data, which points to an InternalFrameBufferList. 
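
The two callbacks above are the VP9 side of the external frame-buffer interface. A sketch of how a caller might wire the internal list into a decoder; this assumes the public vpx_codec_set_frame_buffer_functions() entry point from vpx/vpx_decoder.h, and it elides teardown:

#include "vpx/vpx_decoder.h"
#include "vp9/common/vp9_frame_buffers.h"

/* Register vp9_get_frame_buffer()/vp9_release_frame_buffer() as the
 * decoder's frame-buffer callbacks, with the buffer list as cb_priv. */
static int use_internal_frame_buffers(vpx_codec_ctx_t *decoder,
                                      InternalFrameBufferList *list) {
  if (vp9_alloc_internal_frame_buffers(list))
    return -1;  /* could not allocate the list itself */
  return vpx_codec_set_frame_buffer_functions(decoder,
                                              vp9_get_frame_buffer,
                                              vp9_release_frame_buffer,
                                              list) != VPX_CODEC_OK;
}

On release, fb->priv comes back to vp9_release_frame_buffer(), which simply clears in_use so the scan loop in vp9_get_frame_buffer() can hand the buffer out again.
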
+// |min_size| is the minimum size in bytes needed to decode the next frame. +// |fb| pointer to the frame buffer. +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb); + +// Callback used by libvpx when there are no references to the frame buffer. +// |cb_priv| is not used. |fb| pointer to the frame buffer. +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_FRAME_BUFFERS_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c index ea8683ea16d..856d41e7001 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c @@ -96,7 +96,7 @@ void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) { } } -static void idct4_1d(const int16_t *input, int16_t *output) { +static void idct4(const int16_t *input, int16_t *output) { int16_t step[4]; int temp1, temp2; // stage 1 @@ -124,7 +124,7 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { // Rows for (i = 0; i < 4; ++i) { - idct4_1d(input, outptr); + idct4(input, outptr); input += 4; outptr += 4; } @@ -133,7 +133,7 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - idct4_1d(temp_in, temp_out); + idct4(temp_in, temp_out); for (j = 0; j < 4; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]); @@ -156,7 +156,7 @@ void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) { } } -static void idct8_1d(const int16_t *input, int16_t *output) { +static void idct8(const int16_t *input, int16_t *output) { int16_t step1[8], step2[8]; int temp1, temp2; // stage 1 @@ -174,7 +174,7 @@ static void idct8_1d(const int16_t *input, int16_t *output) { step1[6] = dct_const_round_shift(temp2); // stage 2 & stage 3 - even half - idct4_1d(step1, step1); + idct4(step1, step1); // stage 2 - odd half step2[4] = step1[4] + step1[5]; @@ -209,7 +209,7 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { // First transform rows for (i = 0; i < 8; ++i) { - idct8_1d(input, outptr); + idct8(input, outptr); input += 8; outptr += 8; } @@ -218,7 +218,7 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - idct8_1d(temp_in, temp_out); + idct8(temp_in, temp_out); for (j = 0; j < 8; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]); @@ -238,7 +238,7 @@ void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -static void iadst4_1d(const int16_t *input, int16_t *output) { +static void iadst4(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0 = input[0]; @@ -283,10 +283,10 @@ static void iadst4_1d(const int16_t *input, int16_t *output) { void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, int tx_type) { const transform_2d IHT_4[] = { - { idct4_1d, idct4_1d }, // DCT_DCT = 0 - { iadst4_1d, idct4_1d }, // ADST_DCT = 1 - { idct4_1d, iadst4_1d }, // DCT_ADST = 2 - { iadst4_1d, iadst4_1d } // ADST_ADST = 3 + { idct4, idct4 }, // DCT_DCT = 0 + { iadst4, idct4 }, // ADST_DCT = 1 + { idct4, iadst4 }, // DCT_ADST = 2 
+ { iadst4, iadst4 } // ADST_ADST = 3 }; int i, j; @@ -311,7 +311,7 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, + dest[j * stride + i]); } } -static void iadst8_1d(const int16_t *input, int16_t *output) { +static void iadst8(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0 = input[7]; @@ -389,10 +389,10 @@ static void iadst8_1d(const int16_t *input, int16_t *output) { } static const transform_2d IHT_8[] = { - { idct8_1d, idct8_1d }, // DCT_DCT = 0 - { iadst8_1d, idct8_1d }, // ADST_DCT = 1 - { idct8_1d, iadst8_1d }, // DCT_ADST = 2 - { iadst8_1d, iadst8_1d } // ADST_ADST = 3 + { idct8, idct8 }, // DCT_DCT = 0 + { iadst8, idct8 }, // ADST_DCT = 1 + { idct8, iadst8 }, // DCT_ADST = 2 + { iadst8, iadst8 } // ADST_ADST = 3 }; void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, @@ -421,7 +421,7 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, } } -void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[8 * 8] = { 0 }; int16_t *outptr = out; int i, j; @@ -430,7 +430,7 @@ void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { // First transform rows // only first 4 row has non-zero coefs for (i = 0; i < 4; ++i) { - idct8_1d(input, outptr); + idct8(input, outptr); input += 8; outptr += 8; } @@ -439,14 +439,14 @@ void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - idct8_1d(temp_in, temp_out); + idct8(temp_in, temp_out); for (j = 0; j < 8; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]); } } -static void idct16_1d(const int16_t *input, int16_t *output) { +static void idct16(const int16_t *input, int16_t *output) { int16_t step1[16], step2[16]; int temp1, temp2; @@ -619,7 +619,7 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { // First transform rows for (i = 0; i < 16; ++i) { - idct16_1d(input, outptr); + idct16(input, outptr); input += 16; outptr += 16; } @@ -628,14 +628,14 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - idct16_1d(temp_in, temp_out); + idct16(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + dest[j * stride + i]); } } -static void iadst16_1d(const int16_t *input, int16_t *output) { +static void iadst16(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; int x0 = input[15]; @@ -807,10 +807,10 @@ static void iadst16_1d(const int16_t *input, int16_t *output) { } static const transform_2d IHT_16[] = { - { idct16_1d, idct16_1d }, // DCT_DCT = 0 - { iadst16_1d, idct16_1d }, // ADST_DCT = 1 - { idct16_1d, iadst16_1d }, // DCT_ADST = 2 - { iadst16_1d, iadst16_1d } // ADST_ADST = 3 + { idct16, idct16 }, // DCT_DCT = 0 + { iadst16, idct16 }, // ADST_DCT = 1 + { idct16, iadst16 }, // DCT_ADST = 2 + { iadst16, iadst16 } // ADST_ADST = 3 }; void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, @@ -835,7 +835,8 @@ void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, ht.cols(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * stride + i] = 
clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * stride + i]); } + + dest[j * stride + i]); + } } void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { @@ -847,7 +848,7 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. for (i = 0; i < 4; ++i) { - idct16_1d(input, outptr); + idct16(input, outptr); input += 16; outptr += 16; } @@ -856,7 +857,7 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j*16 + i]; - idct16_1d(temp_in, temp_out); + idct16(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + dest[j * stride + i]); @@ -876,7 +877,7 @@ void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -static void idct32_1d(const int16_t *input, int16_t *output) { +static void idct32(const int16_t *input, int16_t *output) { int16_t step1[32], step2[32]; int temp1, temp2; @@ -1262,7 +1263,7 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; if (zero_coeff[0] | zero_coeff[1]) - idct32_1d(input, outptr); + idct32(input, outptr); else vpx_memset(outptr, 0, sizeof(int16_t) * 32); input += 32; @@ -1273,10 +1274,10 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - idct32_1d(temp_in, temp_out); + idct32(temp_in, temp_out); for (j = 0; j < 32; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * stride + i]); + + dest[j * stride + i]); } } @@ -1289,7 +1290,7 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { // Rows // only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) { - idct32_1d(input, outptr); + idct32(input, outptr); input += 32; outptr += 32; } @@ -1298,7 +1299,7 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - idct32_1d(temp_in, temp_out); + idct32(temp_in, temp_out); for (j = 0; j < 32; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + dest[j * stride + i]); @@ -1344,43 +1345,37 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { // coefficients. Use eobs to decide what to do. // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. // Combine that with code here. - if (eob) { - if (eob == 1) - // DC only DCT coefficient - vp9_idct8x8_1_add(input, dest, stride); - else if (eob <= 10) - vp9_idct8x8_10_add(input, dest, stride); - else - vp9_idct8x8_64_add(input, dest, stride); - } + if (eob == 1) + // DC only DCT coefficient + vp9_idct8x8_1_add(input, dest, stride); + else if (eob <= 12) + vp9_idct8x8_12_add(input, dest, stride); + else + vp9_idct8x8_64_add(input, dest, stride); } void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int eob) { /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ - if (eob) { - if (eob == 1) - /* DC only DCT coefficient. 
*/ - vp9_idct16x16_1_add(input, dest, stride); - else if (eob <= 10) - vp9_idct16x16_10_add(input, dest, stride); - else - vp9_idct16x16_256_add(input, dest, stride); - } + if (eob == 1) + /* DC only DCT coefficient. */ + vp9_idct16x16_1_add(input, dest, stride); + else if (eob <= 10) + vp9_idct16x16_10_add(input, dest, stride); + else + vp9_idct16x16_256_add(input, dest, stride); } void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, int eob) { - if (eob) { - if (eob == 1) - vp9_idct32x32_1_add(input, dest, stride); - else if (eob <= 34) - // non-zero coeff only in upper-left 8x8 - vp9_idct32x32_34_add(input, dest, stride); - else - vp9_idct32x32_1024_add(input, dest, stride); - } + if (eob == 1) + vp9_idct32x32_1_add(input, dest, stride); + else if (eob <= 34) + // non-zero coeff only in upper-left 8x8 + vp9_idct32x32_34_add(input, dest, stride); + else + vp9_idct32x32_1024_add(input, dest, stride); } // iht @@ -1397,9 +1392,7 @@ void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, if (tx_type == DCT_DCT) { vp9_idct8x8_add(input, dest, stride, eob); } else { - if (eob > 0) { - vp9_iht8x8_64_add(input, dest, stride, tx_type); - } + vp9_iht8x8_64_add(input, dest, stride, tx_type); } } @@ -1408,8 +1401,6 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, if (tx_type == DCT_DCT) { vp9_idct16x16_add(input, dest, stride, eob); } else { - if (eob > 0) { - vp9_iht16x16_256_add(input, dest, stride, tx_type); - } + vp9_iht16x16_256_add(input, dest, stride, tx_type); } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h index 2b3f35f0a3b..d8687762244 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h @@ -18,6 +18,10 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#ifdef __cplusplus +extern "C" { +#endif + // Constants and Macros used by all idct/dct functions #define DCT_CONST_BITS 14 @@ -29,9 +33,6 @@ #define pair_set_epi16(a, b) \ _mm_set_epi16(b, a, b, a, b, a, b, a) -#define pair_set_epi32(a, b) \ - _mm_set_epi32(b, a, b, a) - // Constants: // for (int i = 1; i< 32; ++i) // printf("static const int cospi_%d_64 = %.0f;\n", i, @@ -77,8 +78,7 @@ static const int sinpi_4_9 = 15212; static INLINE int dct_const_round_shift(int input) { int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - assert(INT16_MIN <= rv && rv <= INT16_MAX); - return rv; + return (int16_t)rv; } typedef void (*transform_1d)(const int16_t*, int16_t*); @@ -104,4 +104,8 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, int stride, int eob); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_IDCT_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c index 218e12e62d1..efd0249f423 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c @@ -16,24 +16,6 @@ #include "vp9/common/vp9_seg_common.h" -// This structure holds bit masks for all 8x8 blocks in a 64x64 region. -// Each 1 bit represents a position in which we want to apply the loop filter. -// Left_ entries refer to whether we apply a filter on the border to the -// left of the block. 
Above_ entries refer to whether or not to apply a -// filter on the above border. Int_ entries refer to whether or not to -// apply borders on the 4x4 edges within the 8x8 block that each bit -// represents. -// Since each transform is accompanied by a potentially different type of -// loop filter there is a different entry in the array for each transform size. -typedef struct { - uint64_t left_y[TX_SIZES]; - uint64_t above_y[TX_SIZES]; - uint64_t int_4x4_y; - uint16_t left_uv[TX_SIZES]; - uint16_t above_uv[TX_SIZES]; - uint16_t int_4x4_uv; -} LOOP_FILTER_MASK; - // 64 bit masks for left transform size. Each 1 represents a position where // we should apply a loop filter across the left border of an 8x8 block // boundary. @@ -219,23 +201,10 @@ static const uint16_t size_mask_uv[BLOCK_SIZES] = { static const uint16_t left_border_uv = 0x1111; static const uint16_t above_border_uv = 0x000f; - -static void lf_init_lut(loop_filter_info_n *lfi) { - lfi->mode_lf_lut[DC_PRED] = 0; - lfi->mode_lf_lut[D45_PRED] = 0; - lfi->mode_lf_lut[D135_PRED] = 0; - lfi->mode_lf_lut[D117_PRED] = 0; - lfi->mode_lf_lut[D153_PRED] = 0; - lfi->mode_lf_lut[D207_PRED] = 0; - lfi->mode_lf_lut[D63_PRED] = 0; - lfi->mode_lf_lut[V_PRED] = 0; - lfi->mode_lf_lut[H_PRED] = 0; - lfi->mode_lf_lut[TM_PRED] = 0; - lfi->mode_lf_lut[ZEROMV] = 0; - lfi->mode_lf_lut[NEARESTMV] = 1; - lfi->mode_lf_lut[NEARMV] = 1; - lfi->mode_lf_lut[NEWMV] = 1; -} +static const int mode_lf_lut[MB_MODE_COUNT] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES + 1, 1, 0, 1 // INTER_MODES (ZEROMV == 0) +}; static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { int lvl; @@ -259,6 +228,12 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { } } +static uint8_t get_filter_level(const loop_filter_info_n *lfi_n, + const MB_MODE_INFO *mbmi) { + return lfi_n->lvl[mbmi->segment_id][mbmi->ref_frame[0]] + [mode_lf_lut[mbmi->mode]]; +} + void vp9_loop_filter_init(VP9_COMMON *cm) { loop_filter_info_n *lfi = &cm->lf_info; struct loopfilter *lf = &cm->lf; @@ -268,9 +243,6 @@ void vp9_loop_filter_init(VP9_COMMON *cm) { update_sharpness(lfi, lf->sharpness_level); lf->last_sharpness_level = lf->sharpness_level; - // init LUT for lvl and hev thr picking - lf_init_lut(lfi); - // init hev threshold const vectors for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); @@ -281,10 +253,10 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { // n_shift is the a multiplier for lf_deltas // the multiplier is 1 for when filter_lvl is between 0 and 31; // 2 when filter_lvl is between 32 and 63 - const int n_shift = default_filt_lvl >> 5; + const int scale = 1 << (default_filt_lvl >> 5); loop_filter_info_n *const lfi = &cm->lf_info; struct loopfilter *const lf = &cm->lf; - struct segmentation *const seg = &cm->seg; + const struct segmentation *const seg = &cm->seg; // update limits if sharpness has changed if (lf->last_sharpness_level != lf->sharpness_level) { @@ -293,91 +265,130 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { } for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { - int lvl_seg = default_filt_lvl, ref, mode, intra_lvl; - - // Set the baseline filter values for each segment + int lvl_seg = default_filt_lvl; if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF); - lvl_seg = seg->abs_delta == SEGMENT_ABSDATA - ? 
data - : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER); + lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ? + data : default_filt_lvl + data, + 0, MAX_LOOP_FILTER); } if (!lf->mode_ref_delta_enabled) { // we could get rid of this if we assume that deltas are set to // zero when not in use; encoder always uses deltas vpx_memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id])); - continue; - } - - intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * (1 << n_shift); - lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); - - for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) - for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { - const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * (1 << n_shift) - + lf->mode_deltas[mode] * (1 << n_shift); - lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); + } else { + int ref, mode; + const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; + lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); + + for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) { + for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { + const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + + lf->mode_deltas[mode] * scale; + lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); + } } + } } } -static int build_lfi(const loop_filter_info_n *lfi_n, - const MB_MODE_INFO *mbmi, - const loop_filter_thresh **lfi) { - const int seg = mbmi->segment_id; - const int ref = mbmi->ref_frame[0]; - const int mode = lfi_n->mode_lf_lut[mbmi->mode]; - const int filter_level = lfi_n->lvl[seg][ref][mode]; - - if (filter_level > 0) { - *lfi = &lfi_n->lfthr[filter_level]; - return 1; - } else { - return 0; - } -} - -static void filter_selectively_vert(uint8_t *s, int pitch, - unsigned int mask_16x16, - unsigned int mask_8x8, - unsigned int mask_4x4, - unsigned int mask_4x4_int, - const loop_filter_thresh **p_lfi) { +static void filter_selectively_vert_row2(PLANE_TYPE plane_type, + uint8_t *s, int pitch, + unsigned int mask_16x16_l, + unsigned int mask_8x8_l, + unsigned int mask_4x4_l, + unsigned int mask_4x4_int_l, + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { + const int mask_shift = plane_type ? 4 : 8; + const int mask_cutoff = plane_type ? 0xf : 0xff; + const int lfl_forward = plane_type ? 4 : 8; + + unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff; + unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff; + unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff; + unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff; + unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff; + unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff; + unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff; + unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff; unsigned int mask; - for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; - mask; mask >>= 1) { - const loop_filter_thresh *lfi = *p_lfi; + for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 | + mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1; + mask; mask >>= 1) { + const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); + // TODO(yunqingwang): count in loopfilter functions should be removed. 
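// Both rows of 8x8 blocks arrive packed into each mask argument: the low
// mask_shift bits select the first row and the next mask_shift bits the
// second (8 bits per row for Y, 4 for UV). A minimal sketch of the
// unpacking performed above:
//
//   row0 = mask & mask_cutoff;                  /* e.g. 0xff for Y */
//   row1 = (mask >> mask_shift) & mask_cutoff;  /* 8 pixel rows below */
//
// which is why the single-row fallbacks below operate on s + 8 * pitch.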
if (mask & 1) { - if (mask_16x16 & 1) { - vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); - assert(!(mask_8x8 & 1)); - assert(!(mask_4x4 & 1)); - assert(!(mask_4x4_int & 1)); - } else if (mask_8x8 & 1) { - vp9_mbloop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - assert(!(mask_16x16 & 1)); - assert(!(mask_4x4 & 1)); - } else if (mask_4x4 & 1) { - vp9_loop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - assert(!(mask_16x16 & 1)); - assert(!(mask_8x8 & 1)); + if ((mask_16x16_0 | mask_16x16_1) & 1) { + if ((mask_16x16_0 & mask_16x16_1) & 1) { + vp9_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr); + } else if (mask_16x16_0 & 1) { + vp9_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr); + } else { + vp9_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim, + lfi1->lim, lfi1->hev_thr); + } + } + + if ((mask_8x8_0 | mask_8x8_1) & 1) { + if ((mask_8x8_0 & mask_8x8_1) & 1) { + vp9_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_8x8_0 & 1) { + vp9_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + 1); + } else { + vp9_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, 1); + } + } + + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + vp9_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_4x4_0 & 1) { + vp9_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + 1); + } else { + vp9_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, 1); + } + } + + if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) { + if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) { + vp9_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_4x4_int_0 & 1) { + vp9_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, 1); + } else { + vp9_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, 1); + } } } - if (mask_4x4_int & 1) - vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + s += 8; - p_lfi++; - mask_16x16 >>= 1; - mask_8x8 >>= 1; - mask_4x4 >>= 1; - mask_4x4_int >>= 1; + lfl += 1; + mask_16x16_0 >>= 1; + mask_8x8_0 >>= 1; + mask_4x4_0 >>= 1; + mask_4x4_int_0 >>= 1; + mask_16x16_1 >>= 1; + mask_8x8_1 >>= 1; + mask_4x4_1 >>= 1; + mask_4x4_int_1 >>= 1; } } @@ -386,49 +397,90 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, unsigned int mask_8x8, unsigned int mask_4x4, unsigned int mask_4x4_int, - int only_4x4_1, - const loop_filter_thresh **p_lfi) { + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { unsigned int mask; int count; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= count) { - const loop_filter_thresh *lfi = *p_lfi; + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; count = 1; if (mask & 1) { - if (!only_4x4_1) { - if (mask_16x16 & 1) { - if ((mask_16x16 & 3) == 3) { - vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); - count = 2; + if (mask_16x16 & 1) { + if ((mask_16x16 & 3) == 3) { + vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + count = 2; + } else { + vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + } + } else if (mask_8x8 & 1) { + if ((mask_8x8 & 3) 
== 3) { + // Next block's thresholds + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); + + vp9_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + + if ((mask_4x4_int & 3) == 3) { + vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr); } else { - vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + if (mask_4x4_int & 1) + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + else if (mask_4x4_int & 2) + vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr, 1); } - assert(!(mask_8x8 & 1)); - assert(!(mask_4x4 & 1)); - assert(!(mask_4x4_int & 1)); - } else if (mask_8x8 & 1) { - vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - assert(!(mask_16x16 & 1)); - assert(!(mask_4x4 & 1)); - } else if (mask_4x4 & 1) { - vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - assert(!(mask_16x16 & 1)); - assert(!(mask_8x8 & 1)); + count = 2; + } else { + vp9_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (mask_4x4_int & 1) + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } - } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & 3) == 3) { + // Next block's thresholds + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); + + vp9_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + if ((mask_4x4_int & 3) == 3) { + vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr); + } else { + if (mask_4x4_int & 1) + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + else if (mask_4x4_int & 2) + vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr, 1); + } + count = 2; + } else { + vp9_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); - if (mask_4x4_int & 1) - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); + if (mask_4x4_int & 1) + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + } + } else if (mask_4x4_int & 1) { + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + } } s += 8 * count; - p_lfi += count; + lfl += count; mask_16x16 >>= count; mask_8x8 >>= count; mask_4x4 >>= count; @@ -447,24 +499,31 @@ static void build_masks(const loop_filter_info_n *const lfi_n, const MODE_INFO *mi, const int shift_y, const int shift_uv, LOOP_FILTER_MASK *lfm) { - const BLOCK_SIZE block_size = mi->mbmi.sb_type; - const TX_SIZE tx_size_y = mi->mbmi.tx_size; - const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi); - const int skip = mi->mbmi.skip_coeff; - const int seg = mi->mbmi.segment_id; - const int ref = mi->mbmi.ref_frame[0]; - const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode]; - const int filter_level = lfi_n->lvl[seg][ref][mode]; - uint64_t *left_y = &lfm->left_y[tx_size_y]; - uint64_t *above_y = &lfm->above_y[tx_size_y]; - uint64_t *int_4x4_y = &lfm->int_4x4_y; - uint16_t *left_uv = &lfm->left_uv[tx_size_uv]; - uint16_t *above_uv = &lfm->above_uv[tx_size_uv]; - uint16_t *int_4x4_uv = &lfm->int_4x4_uv; + const MB_MODE_INFO *mbmi = &mi->mbmi; + const BLOCK_SIZE block_size = mbmi->sb_type; + const TX_SIZE tx_size_y = 
mbmi->tx_size; + const TX_SIZE tx_size_uv = get_uv_tx_size(mbmi); + const int filter_level = get_filter_level(lfi_n, mbmi); + uint64_t *const left_y = &lfm->left_y[tx_size_y]; + uint64_t *const above_y = &lfm->above_y[tx_size_y]; + uint64_t *const int_4x4_y = &lfm->int_4x4_y; + uint16_t *const left_uv = &lfm->left_uv[tx_size_uv]; + uint16_t *const above_uv = &lfm->above_uv[tx_size_uv]; + uint16_t *const int_4x4_uv = &lfm->int_4x4_uv; + int i; // If filter level is 0 we don't loop filter. - if (!filter_level) + if (!filter_level) { return; + } else { + const int w = num_8x8_blocks_wide_lookup[block_size]; + const int h = num_8x8_blocks_high_lookup[block_size]; + int index = shift_y; + for (i = 0; i < h; i++) { + vpx_memset(&lfm->lfl_y[index], filter_level, w); + index += 8; + } + } // These set 1 in the current block size for the block size edges. // For instance if the block size is 32x16, we'll set : @@ -485,7 +544,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n, // If the block has no coefficients and is not intra we skip applying // the loop filter on block edges. - if (skip && ref > INTRA_FRAME) + if (mbmi->skip && is_inter_block(mbmi)) return; // Here we are adding a mask for the transform size. The transform @@ -506,12 +565,11 @@ static void build_masks(const loop_filter_info_n *const lfi_n, // boundaries. These differ from the 4x4 boundaries on the outside edge of // an 8x8 in that the internal ones can be skipped and don't depend on // the prediction block size. - if (tx_size_y == TX_4X4) { + if (tx_size_y == TX_4X4) *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y; - } - if (tx_size_uv == TX_4X4) { + + if (tx_size_uv == TX_4X4) *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv; - } } // This function does the same thing as the one above with the exception that @@ -520,24 +578,31 @@ static void build_masks(const loop_filter_info_n *const lfi_n, static void build_y_mask(const loop_filter_info_n *const lfi_n, const MODE_INFO *mi, const int shift_y, LOOP_FILTER_MASK *lfm) { - const BLOCK_SIZE block_size = mi->mbmi.sb_type; - const TX_SIZE tx_size_y = mi->mbmi.tx_size; - const int skip = mi->mbmi.skip_coeff; - const int seg = mi->mbmi.segment_id; - const int ref = mi->mbmi.ref_frame[0]; - const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode]; - const int filter_level = lfi_n->lvl[seg][ref][mode]; - uint64_t *left_y = &lfm->left_y[tx_size_y]; - uint64_t *above_y = &lfm->above_y[tx_size_y]; - uint64_t *int_4x4_y = &lfm->int_4x4_y; - - if (!filter_level) + const MB_MODE_INFO *mbmi = &mi->mbmi; + const BLOCK_SIZE block_size = mbmi->sb_type; + const TX_SIZE tx_size_y = mbmi->tx_size; + const int filter_level = get_filter_level(lfi_n, mbmi); + uint64_t *const left_y = &lfm->left_y[tx_size_y]; + uint64_t *const above_y = &lfm->above_y[tx_size_y]; + uint64_t *const int_4x4_y = &lfm->int_4x4_y; + int i; + + if (!filter_level) { return; + } else { + const int w = num_8x8_blocks_wide_lookup[block_size]; + const int h = num_8x8_blocks_high_lookup[block_size]; + int index = shift_y; + for (i = 0; i < h; i++) { + vpx_memset(&lfm->lfl_y[index], filter_level, w); + index += 8; + } + } *above_y |= above_prediction_mask[block_size] << shift_y; *left_y |= left_prediction_mask[block_size] << shift_y; - if (skip && ref > INTRA_FRAME) + if (mbmi->skip && is_inter_block(mbmi)) return; *above_y |= (size_mask[block_size] & @@ -546,21 +611,20 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, *left_y |= (size_mask[block_size] & 
left_64x64_txform_mask[tx_size_y]) << shift_y; - if (tx_size_y == TX_4X4) { + if (tx_size_y == TX_4X4) *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y; - } } // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. // TODO(JBB): This function only works for yv12. -static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, - MODE_INFO **mi_8x8, const int mode_info_stride, - LOOP_FILTER_MASK *lfm) { +void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, + MODE_INFO **mi, const int mode_info_stride, + LOOP_FILTER_MASK *lfm) { int idx_32, idx_16, idx_8; const loop_filter_info_n *const lfi_n = &cm->lf_info; - MODE_INFO **mip = mi_8x8; - MODE_INFO **mip2 = mi_8x8; + MODE_INFO **mip = mi; + MODE_INFO **mip2 = mi; // These are offsets to the next mi in the 64x64 block. It is what gets // added to the mi ptr as we go through each loop. It helps us to avoids @@ -784,8 +848,59 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, lfm->left_uv[i] &= 0xeeee; } } + + // Assert if we try to apply 2 different loop filters at the same position. + assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8])); + assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4])); + assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4])); + assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16])); + assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8])); + assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4])); + assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4])); + assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16])); + assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8])); + assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4])); + assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4])); + assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16])); + assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8])); + assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4])); + assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4])); + assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16])); } -#if CONFIG_NON420 + +static void filter_selectively_vert(uint8_t *s, int pitch, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int mask_4x4, + unsigned int mask_4x4_int, + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { + unsigned int mask; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; + mask; mask >>= 1) { + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + + if (mask & 1) { + if (mask_16x16 & 1) { + vp9_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } else if (mask_8x8 & 1) { + vp9_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + } else if (mask_4x4 & 1) { + vp9_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + } + } + if (mask_4x4_int & 1) + vp9_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + s += 8; + lfl += 1; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } +} + static void filter_block_plane_non420(VP9_COMMON *cm, struct macroblockd_plane *plane, MODE_INFO **mi_8x8, @@ -794,14 +909,14 @@ static void filter_block_plane_non420(VP9_COMMON *cm, const int ss_y = plane->subsampling_y; const int row_step = 1 << ss_x; const int col_step = 1 << ss_y; - const int row_step_stride = cm->mode_info_stride * row_step; + const int row_step_stride = cm->mi_stride * row_step; struct buf_2d *const dst = &plane->dst; uint8_t* const dst0 = dst->buf; unsigned int 
mask_16x16[MI_BLOCK_SIZE] = {0}; unsigned int mask_8x8[MI_BLOCK_SIZE] = {0}; unsigned int mask_4x4[MI_BLOCK_SIZE] = {0}; unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; - const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; + uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE]; int r, c; for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { @@ -813,15 +928,15 @@ static void filter_block_plane_non420(VP9_COMMON *cm, // Determine the vertical edges that need filtering for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { const MODE_INFO *mi = mi_8x8[c]; - const int skip_this = mi[0].mbmi.skip_coeff - && is_inter_block(&mi[0].mbmi); + const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type; + const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi); // left edge of current unit is block/partition edge -> no skip - const int block_edge_left = b_width_log2(mi[0].mbmi.sb_type) ? - !(c & ((1 << (b_width_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1; + const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ? + !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1; const int skip_this_c = skip_this && !block_edge_left; // top edge of current unit is block/partition edge -> no skip - const int block_edge_above = b_height_log2(mi[0].mbmi.sb_type) ? - !(r & ((1 << (b_height_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1; + const int block_edge_above = (num_4x4_blocks_high_lookup[sb_type] > 1) ? + !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1; const int skip_this_r = skip_this && !block_edge_above; const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV) ? get_uv_tx_size(&mi[0].mbmi) @@ -830,7 +945,8 @@ static void filter_block_plane_non420(VP9_COMMON *cm, const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; // Filter level can vary per MI - if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x])) + if (!(lfl[(r << 3) + (c >> ss_x)] = + get_filter_level(&cm->lf_info, &mi[0].mbmi))) continue; // Build masks based on the transform size of each block @@ -887,7 +1003,8 @@ static void filter_block_plane_non420(VP9_COMMON *cm, mask_16x16_c & border_mask, mask_8x8_c & border_mask, mask_4x4_c & border_mask, - mask_4x4_int[r], lfi[r]); + mask_4x4_int[r], + &cm->lf_info, &lfl[r << 3]); dst->buf += 8 * dst->stride; mi_8x8 += row_step_stride; } @@ -898,152 +1015,232 @@ static void filter_block_plane_non420(VP9_COMMON *cm, const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 
0 : mask_4x4_int[r]; + unsigned int mask_16x16_r; + unsigned int mask_8x8_r; + unsigned int mask_4x4_r; + + if (mi_row + r == 0) { + mask_16x16_r = 0; + mask_8x8_r = 0; + mask_4x4_r = 0; + } else { + mask_16x16_r = mask_16x16[r]; + mask_8x8_r = mask_8x8[r]; + mask_4x4_r = mask_4x4[r]; + } + filter_selectively_horiz(dst->buf, dst->stride, - mask_16x16[r], - mask_8x8[r], - mask_4x4[r], - mask_4x4_int_r, mi_row + r == 0, lfi[r]); + mask_16x16_r, + mask_8x8_r, + mask_4x4_r, + mask_4x4_int_r, + &cm->lf_info, &lfl[r << 3]); dst->buf += 8 * dst->stride; } } -#endif -static void filter_block_plane(VP9_COMMON *const cm, - struct macroblockd_plane *const plane, - MODE_INFO **mi_8x8, - int mi_row, int mi_col, - LOOP_FILTER_MASK *lfm) { - const int ss_x = plane->subsampling_x; - const int ss_y = plane->subsampling_y; - const int row_step = 1 << ss_x; - const int col_step = 1 << ss_y; - const int row_step_stride = cm->mode_info_stride * row_step; +void vp9_filter_block_plane(VP9_COMMON *const cm, + struct macroblockd_plane *const plane, + int mi_row, + LOOP_FILTER_MASK *lfm) { struct buf_2d *const dst = &plane->dst; uint8_t* const dst0 = dst->buf; - unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; - const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; int r, c; - int row_shift = 3 - ss_x; - int row_mask = 0xff >> (ss_x << 2); -#define MASK_ROW(value) ((value >> (r_sampled << row_shift)) & row_mask) + if (!plane->plane_type) { + uint64_t mask_16x16 = lfm->left_y[TX_16X16]; + uint64_t mask_8x8 = lfm->left_y[TX_8X8]; + uint64_t mask_4x4 = lfm->left_y[TX_4X4]; + uint64_t mask_4x4_int = lfm->int_4x4_y; - for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { - int r_sampled = r >> ss_x; - - // Determine the vertical edges that need filtering - for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { - const MODE_INFO *mi = mi_8x8[c]; + // Vertical pass: do 2 rows at one time + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { + unsigned int mask_16x16_l = mask_16x16 & 0xffff; + unsigned int mask_8x8_l = mask_8x8 & 0xffff; + unsigned int mask_4x4_l = mask_4x4 & 0xffff; + unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff; - build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]); - } - if (!plane->plane_type) { - mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y); - // Disable filtering on the leftmost column - filter_selectively_vert(dst->buf, dst->stride, - MASK_ROW(lfm->left_y[TX_16X16]), - MASK_ROW(lfm->left_y[TX_8X8]), - MASK_ROW(lfm->left_y[TX_4X4]), - MASK_ROW(lfm->int_4x4_y), - lfi[r]); - } else { - mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_uv); // Disable filtering on the leftmost column - filter_selectively_vert(dst->buf, dst->stride, - MASK_ROW(lfm->left_uv[TX_16X16]), - MASK_ROW(lfm->left_uv[TX_8X8]), - MASK_ROW(lfm->left_uv[TX_4X4]), - MASK_ROW(lfm->int_4x4_uv), - lfi[r]); + filter_selectively_vert_row2(plane->plane_type, + dst->buf, dst->stride, + mask_16x16_l, + mask_8x8_l, + mask_4x4_l, + mask_4x4_int_l, + &cm->lf_info, &lfm->lfl_y[r << 3]); + + dst->buf += 16 * dst->stride; + mask_16x16 >>= 16; + mask_8x8 >>= 16; + mask_4x4 >>= 16; + mask_4x4_int >>= 16; } - dst->buf += 8 * dst->stride; - mi_8x8 += row_step_stride; - } - // Now do horizontal pass - dst->buf = dst0; - for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { - const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; - const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 
0 : mask_4x4_int[r]; - int r_sampled = r >> ss_x; + // Horizontal pass + dst->buf = dst0; + mask_16x16 = lfm->above_y[TX_16X16]; + mask_8x8 = lfm->above_y[TX_8X8]; + mask_4x4 = lfm->above_y[TX_4X4]; + mask_4x4_int = lfm->int_4x4_y; + + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) { + unsigned int mask_16x16_r; + unsigned int mask_8x8_r; + unsigned int mask_4x4_r; + + if (mi_row + r == 0) { + mask_16x16_r = 0; + mask_8x8_r = 0; + mask_4x4_r = 0; + } else { + mask_16x16_r = mask_16x16 & 0xff; + mask_8x8_r = mask_8x8 & 0xff; + mask_4x4_r = mask_4x4 & 0xff; + } - if (!plane->plane_type) { filter_selectively_horiz(dst->buf, dst->stride, - MASK_ROW(lfm->above_y[TX_16X16]), - MASK_ROW(lfm->above_y[TX_8X8]), - MASK_ROW(lfm->above_y[TX_4X4]), - MASK_ROW(lfm->int_4x4_y), - mi_row + r == 0, lfi[r]); - } else { + mask_16x16_r, + mask_8x8_r, + mask_4x4_r, + mask_4x4_int & 0xff, + &cm->lf_info, &lfm->lfl_y[r << 3]); + + dst->buf += 8 * dst->stride; + mask_16x16 >>= 8; + mask_8x8 >>= 8; + mask_4x4 >>= 8; + mask_4x4_int >>= 8; + } + } else { + uint16_t mask_16x16 = lfm->left_uv[TX_16X16]; + uint16_t mask_8x8 = lfm->left_uv[TX_8X8]; + uint16_t mask_4x4 = lfm->left_uv[TX_4X4]; + uint16_t mask_4x4_int = lfm->int_4x4_uv; + + // Vertical pass: do 2 rows at one time + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) { + if (plane->plane_type == 1) { + for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) { + lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)]; + lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + + (c << 1)]; + } + } + + { + unsigned int mask_16x16_l = mask_16x16 & 0xff; + unsigned int mask_8x8_l = mask_8x8 & 0xff; + unsigned int mask_4x4_l = mask_4x4 & 0xff; + unsigned int mask_4x4_int_l = mask_4x4_int & 0xff; + + // Disable filtering on the leftmost column + filter_selectively_vert_row2(plane->plane_type, + dst->buf, dst->stride, + mask_16x16_l, + mask_8x8_l, + mask_4x4_l, + mask_4x4_int_l, + &cm->lf_info, &lfm->lfl_uv[r << 1]); + + dst->buf += 16 * dst->stride; + mask_16x16 >>= 8; + mask_8x8 >>= 8; + mask_4x4 >>= 8; + mask_4x4_int >>= 8; + } + } + + // Horizontal pass + dst->buf = dst0; + mask_16x16 = lfm->above_uv[TX_16X16]; + mask_8x8 = lfm->above_uv[TX_8X8]; + mask_4x4 = lfm->above_uv[TX_4X4]; + mask_4x4_int = lfm->int_4x4_uv; + + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { + const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1; + const unsigned int mask_4x4_int_r = skip_border_4x4_r ? + 0 : (mask_4x4_int & 0xf); + unsigned int mask_16x16_r; + unsigned int mask_8x8_r; + unsigned int mask_4x4_r; + + if (mi_row + r == 0) { + mask_16x16_r = 0; + mask_8x8_r = 0; + mask_4x4_r = 0; + } else { + mask_16x16_r = mask_16x16 & 0xf; + mask_8x8_r = mask_8x8 & 0xf; + mask_4x4_r = mask_4x4 & 0xf; + } + filter_selectively_horiz(dst->buf, dst->stride, - MASK_ROW(lfm->above_uv[TX_16X16]), - MASK_ROW(lfm->above_uv[TX_8X8]), - MASK_ROW(lfm->above_uv[TX_4X4]), + mask_16x16_r, + mask_8x8_r, + mask_4x4_r, mask_4x4_int_r, - mi_row + r == 0, lfi[r]); + &cm->lf_info, &lfm->lfl_uv[r << 1]); + + dst->buf += 8 * dst->stride; + mask_16x16 >>= 4; + mask_8x8 >>= 4; + mask_4x4 >>= 4; + mask_4x4_int >>= 4; } - dst->buf += 8 * dst->stride; } -#undef MASK_ROW } void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, - VP9_COMMON *cm, MACROBLOCKD *xd, + VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only) { const int num_planes = y_only ? 
1 : MAX_MB_PLANE; - int mi_row, mi_col; + const int use_420 = y_only || (planes[1].subsampling_y == 1 && + planes[1].subsampling_x == 1); LOOP_FILTER_MASK lfm; -#if CONFIG_NON420 - int use_420 = y_only || (xd->plane[1].subsampling_y == 1 && - xd->plane[1].subsampling_x == 1); -#endif + int mi_row, mi_col; for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { - MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride; + MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { int plane; - setup_dst_planes(xd, frame_buffer, mi_row, mi_col); + vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); // TODO(JBB): Make setup_mask work for non 420. -#if CONFIG_NON420 if (use_420) -#endif - setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride, - &lfm); + vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, + &lfm); for (plane = 0; plane < num_planes; ++plane) { -#if CONFIG_NON420 if (use_420) -#endif - filter_block_plane(cm, &xd->plane[plane], mi_8x8 + mi_col, mi_row, - mi_col, &lfm); -#if CONFIG_NON420 + vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); else - filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col, + filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row, mi_col); -#endif } } } } -void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, +void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, + VP9_COMMON *cm, MACROBLOCKD *xd, int frame_filter_level, - int y_only, int partial) { + int y_only, int partial_frame) { int start_mi_row, end_mi_row, mi_rows_to_filter; if (!frame_filter_level) return; start_mi_row = 0; mi_rows_to_filter = cm->mi_rows; - if (partial && cm->mi_rows > 8) { + if (partial_frame && cm->mi_rows > 8) { start_mi_row = cm->mi_rows >> 1; start_mi_row &= 0xfffffff8; mi_rows_to_filter = MAX(cm->mi_rows / 8, 8); } end_mi_row = start_mi_row + mi_rows_to_filter; vp9_loop_filter_frame_init(cm, frame_filter_level); - vp9_loop_filter_rows(cm->frame_to_show, cm, xd, + vp9_loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, y_only); } @@ -1051,7 +1248,7 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, int vp9_loop_filter_worker(void *arg1, void *arg2) { LFWorkerData *const lf_data = (LFWorkerData*)arg1; (void)arg2; - vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, &lf_data->xd, + vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only); return 1; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h index 62389ea5e33..6fa2773e594 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h @@ -17,6 +17,10 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_seg_common.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MAX_LOOP_FILTER 63 #define MAX_SHARPNESS 7 @@ -54,12 +58,44 @@ typedef struct { typedef struct { loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; - uint8_t mode_lf_lut[MB_MODE_COUNT]; } loop_filter_info_n; +// This structure holds bit masks for all 8x8 blocks in a 64x64 region. +// Each 1 bit represents a position in which we want to apply the loop filter. 
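// Alongside these bit masks, per-block filter levels live in the lfl_y and
// lfl_uv arrays declared below. A minimal sketch of the threshold lookup
// the filtering passes perform, assuming row-major 8x8 indexing for Y:
//
//   const loop_filter_thresh *lfi = cm->lf_info.lfthr + lfm->lfl_y[r * 8 + c];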
+// Left_ entries refer to whether we apply a filter on the border to the +// left of the block. Above_ entries refer to whether or not to apply a +// filter on the above border. Int_ entries refer to whether or not to +// apply borders on the 4x4 edges within the 8x8 block that each bit +// represents. +// Since each transform is accompanied by a potentially different type of +// loop filter there is a different entry in the array for each transform size. +typedef struct { + uint64_t left_y[TX_SIZES]; + uint64_t above_y[TX_SIZES]; + uint64_t int_4x4_y; + uint16_t left_uv[TX_SIZES]; + uint16_t above_uv[TX_SIZES]; + uint16_t int_4x4_uv; + uint8_t lfl_y[64]; + uint8_t lfl_uv[16]; +} LOOP_FILTER_MASK; + /* assorted loopfilter functions which get used elsewhere */ struct VP9Common; struct macroblockd; +struct VP9LfSyncData; + +// This function sets up the bit masks for the entire 64x64 region represented +// by mi_row, mi_col. +void vp9_setup_mask(struct VP9Common *const cm, + const int mi_row, const int mi_col, + MODE_INFO **mi_8x8, const int mode_info_stride, + LOOP_FILTER_MASK *lfm); + +void vp9_filter_block_plane(struct VP9Common *const cm, + struct macroblockd_plane *const plane, + int mi_row, + LOOP_FILTER_MASK *lfm); void vp9_loop_filter_init(struct VP9Common *cm); @@ -68,27 +104,35 @@ void vp9_loop_filter_init(struct VP9Common *cm); // calls this function directly. void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); -void vp9_loop_filter_frame(struct VP9Common *cm, +void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, + struct VP9Common *cm, struct macroblockd *mbd, int filter_level, - int y_only, int partial); + int y_only, int partial_frame); // Apply the loop filter to [start, stop) macro block rows in frame_buffer. void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, - struct VP9Common *cm, struct macroblockd *xd, + struct VP9Common *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only); typedef struct LoopFilterWorkerData { const YV12_BUFFER_CONFIG *frame_buffer; struct VP9Common *cm; - struct macroblockd xd; // TODO(jzern): most of this is unnecessary to the - // loopfilter. the planes are necessary as their state - // is changed during decode. + struct macroblockd_plane planes[MAX_MB_PLANE]; + int start; int stop; int y_only; + + struct VP9LfSyncData *lf_sync; + int num_lf_workers; } LFWorkerData; // Operates on the rows described by LFWorkerData passed as 'arg1'. 
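// A minimal single-threaded usage sketch, assuming the caller already owns
// frame, cm and xd (lf_sync/num_lf_workers matter only to threaded callers):
//
//   LFWorkerData lfd;
//   lfd.frame_buffer = frame;
//   lfd.cm = cm;
//   memcpy(lfd.planes, xd->plane, sizeof(lfd.planes));
//   lfd.start = 0;
//   lfd.stop = cm->mi_rows;
//   lfd.y_only = 0;
//   vp9_loop_filter_worker(&lfd, NULL);  /* returns 1 when the rows are done */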
int vp9_loop_filter_worker(void *arg1, void *arg2); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter_filters.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter_filters.c index 2c4bf6cb237..25d3311b6f6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter_filters.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter_filters.c @@ -70,7 +70,7 @@ static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, return hev; } -static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1, +static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { int8_t filter1, filter2; @@ -78,6 +78,7 @@ static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1, const int8_t ps0 = (int8_t) *op0 ^ 0x80; const int8_t qs0 = (int8_t) *oq0 ^ 0x80; const int8_t qs1 = (int8_t) *oq1 ^ 0x80; + const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); // add outer taps if we have high edge variance int8_t filter = signed_char_clamp(ps1 - qs1) & hev; @@ -101,11 +102,9 @@ static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1, *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; } -void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -115,17 +114,22 @@ void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */, const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); - filter4(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); + filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); ++s; } } -void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -135,13 +139,21 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); - filter4(mask, hev, s - 2, s - 1, s, s + 1); + filter4(mask, *thresh, s - 2, s - 1, s, s + 1); s += pitch; } } -static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat, +void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_4_c(s, pitch, 
blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, + thresh1, 1); +} + +static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, @@ -158,15 +170,13 @@ static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat, *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); } else { - filter4(mask, hev, op1, op0, oq0, oq1); + filter4(mask, thresh, op1, op0, oq0, oq1); } } -void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -177,19 +187,24 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p, const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, hev, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p); + filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p); ++s; } } -void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { int i; for (i = 0; i < 8 * count; ++i) { @@ -197,15 +212,23 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(thresh[0], p1, p0, q0, q1); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, hev, flat, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3); + filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3); s += pitch; } } -static INLINE void filter16(int8_t mask, uint8_t hev, +void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, + thresh1, 1); +} + +static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, uint8_t flat2, uint8_t *op7, uint8_t *op6, uint8_t *op5, uint8_t *op4, @@ -252,15 +275,13 @@ static INLINE void filter16(int8_t mask, uint8_t hev, *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); } else { - filter8(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); } } -void 
vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -270,13 +291,12 @@ void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p, const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat2 = flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]); - filter16(mask, hev, flat, flat2, + filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, @@ -285,25 +305,35 @@ void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p, } } -void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { +static void mb_lpf_vertical_edge_w(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { int i; - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], s[7]); - filter16(mask, hev, flat, flat2, + filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); s += p; } } + +void vp9_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); +} + +void vp9_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mv.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mv.h index 31a79b98402..3eb7f9d612e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mv.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mv.h @@ -15,7 +15,11 @@ #include "vp9/common/vp9_common.h" -typedef struct { +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct mv { int16_t row; int16_t col; } MV; @@ -25,15 +29,19 @@ typedef union int_mv { MV as_mv; } int_mv; /* facilitates faster equality tests and copies */ -typedef struct { +typedef struct mv32 { int32_t row; int32_t col; } MV32; -static void clamp_mv(MV *mv, int min_col, int max_col, - int min_row, int max_row) { +static INLINE void clamp_mv(MV *mv, int min_col, int max_col, + int min_row, int max_row) { mv->col = clamp(mv->col, min_col, max_col); mv->row = clamp(mv->row, min_row, max_row); } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_MV_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c 
b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c index 8df8aec8484..61682c42d90 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c @@ -13,6 +13,11 @@ #define MVREF_NEIGHBOURS 8 +typedef struct position { + int row; + int col; +} POSITION; + typedef enum { BOTH_ZERO = 0, ZERO_PLUS_PREDICTED = 1, @@ -71,7 +76,7 @@ static const int counter_to_context[19] = { BOTH_INTRA // 18 }; -static const MV mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { +static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { // 4X4 {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, // 4X8 @@ -143,28 +148,30 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, // This macro is used to add a motion vector mv_ref list if it isn't // already in the list. If it's the second motion vector it will also // skip all additional processing and jump to done! -#define ADD_MV_REF_LIST(MV) \ +#define ADD_MV_REF_LIST(mv) \ do { \ if (refmv_count) { \ - if ((MV).as_int != mv_ref_list[0].as_int) { \ - mv_ref_list[refmv_count] = (MV); \ + if ((mv).as_int != mv_ref_list[0].as_int) { \ + mv_ref_list[refmv_count] = (mv); \ goto Done; \ } \ } else { \ - mv_ref_list[refmv_count++] = (MV); \ + mv_ref_list[refmv_count++] = (mv); \ } \ } while (0) // If either reference frame is different, not INTRA, and they // are different from each other scale and add the mv to our list. -#define IF_DIFF_REF_FRAME_ADD_MV(CANDIDATE) \ +#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \ do { \ - if ((CANDIDATE)->ref_frame[0] != ref_frame) \ - ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \ - if ((CANDIDATE)->ref_frame[1] != ref_frame && \ - has_second_ref(CANDIDATE) && \ - (CANDIDATE)->mv[1].as_int != (CANDIDATE)->mv[0].as_int) \ - ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \ + if (is_inter_block(mbmi)) { \ + if ((mbmi)->ref_frame[0] != ref_frame) \ + ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \ + if (has_second_ref(mbmi) && \ + (mbmi)->ref_frame[1] != ref_frame && \ + (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \ + ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \ + } \ } while (0) @@ -172,26 +179,30 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, // are inside the borders of the tile. static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, int mi_rows, - const MV *mv) { - return !(mi_row + mv->row < 0 || - mi_col + mv->col < tile->mi_col_start || - mi_row + mv->row >= mi_rows || - mi_col + mv->col >= tile->mi_col_end); + const POSITION *mi_pos) { + return !(mi_row + mi_pos->row < 0 || + mi_col + mi_pos->col < tile->mi_col_start || + mi_row + mi_pos->row >= mi_rows || + mi_col + mi_pos->col >= tile->mi_col_end); } // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. 
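// The list holds at most MAX_MV_REF_CANDIDATES (2) entries, and
// ADD_MV_REF_LIST above jumps to Done as soon as a second distinct vector
// is found. A minimal sketch of that dedup rule:
//
//   if (count) {
//     if (mv.as_int != list[0].as_int) {
//       list[count] = mv;  /* second distinct candidate */
//       goto done;         /* list is full, stop searching */
//     }
//   } else {
//     list[count++] = mv;
//   }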
-void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int block_idx, - int mi_row, int mi_col) { +static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int block, int mi_row, int mi_col) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; - const MV *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; + const MODE_INFO *prev_mi = cm->coding_use_prev_mi && cm->prev_mi + ? cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col] + : NULL; const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL; + + + const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; + int different_ref_found = 0; int context_counter = 0; @@ -202,26 +213,19 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // if the size < 8x8 we get the mv from the bmi substructure, // and we also need to keep a mode count. for (i = 0; i < 2; ++i) { - const MV *const mv_ref = &mv_ref_search[i]; + const POSITION *const mv_ref = &mv_ref_search[i]; if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { - const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row - * xd->mode_info_stride]; + const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row * + xd->mi_stride]; const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; // Keep counts for entropy encoding. context_counter += mode_2_counter[candidate->mode]; + different_ref_found = 1; - // Check if the candidate comes from the same reference frame. - if (candidate->ref_frame[0] == ref_frame) { - ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, - mv_ref->col, block_idx)); - different_ref_found = candidate->ref_frame[1] != ref_frame; - } else { - if (candidate->ref_frame[1] == ref_frame) - // Add second motion vector if it has the same ref_frame. - ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, - mv_ref->col, block_idx)); - different_ref_found = 1; - } + if (candidate->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block)); + else if (candidate->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block)); } } @@ -229,20 +233,16 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // as before except we don't need to keep track of sub blocks or // mode counts. for (; i < MVREF_NEIGHBOURS; ++i) { - const MV *const mv_ref = &mv_ref_search[i]; + const POSITION *const mv_ref = &mv_ref_search[i]; if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { - const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + - mv_ref->row - * xd->mode_info_stride]->mbmi; + const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row * + xd->mi_stride]->mbmi; + different_ref_found = 1; - if (candidate->ref_frame[0] == ref_frame) { + if (candidate->ref_frame[0] == ref_frame) ADD_MV_REF_LIST(candidate->mv[0]); - different_ref_found = candidate->ref_frame[1] != ref_frame; - } else { - if (candidate->ref_frame[1] == ref_frame) - ADD_MV_REF_LIST(candidate->mv[1]); - different_ref_found = 1; - } + else if (candidate->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(candidate->mv[1]); } } @@ -259,21 +259,19 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // different reference frames. 
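// A neighbour's vector is still usable after compensating for prediction
// direction; a minimal sketch of what scale_mv() is assumed to do when the
// candidate's reference and the target ref_frame have opposite entries in
// ref_frame_sign_bias:
//
//   if (ref_sign_bias[candidate_ref] != ref_sign_bias[ref_frame]) {
//     mv.as_mv.row *= -1;  /* flip to match the target direction */
//     mv.as_mv.col *= -1;
//   }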
if (different_ref_found) { for (i = 0; i < MVREF_NEIGHBOURS; ++i) { - const MV *mv_ref = &mv_ref_search[i]; + const POSITION *mv_ref = &mv_ref_search[i]; if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { - const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + - mv_ref->row - * xd->mode_info_stride]->mbmi; + const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row + * xd->mi_stride]->mbmi; // If the candidate is INTRA we don't want to consider its mv. - if (is_inter_block(candidate)) - IF_DIFF_REF_FRAME_ADD_MV(candidate); + IF_DIFF_REF_FRAME_ADD_MV(candidate); } } } // Since we still don't have a candidate we'll try the last frame. - if (prev_mbmi && is_inter_block(prev_mbmi)) + if (prev_mbmi) IF_DIFF_REF_FRAME_ADD_MV(prev_mbmi); Done: @@ -284,3 +282,84 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) clamp_mv_ref(&mv_ref_list[i].as_mv, xd); } + +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int mi_row, int mi_col) { + find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1, + mi_row, mi_col); +} + +static void lower_mv_precision(MV *mv, int allow_hp) { + const int use_hp = allow_hp && vp9_use_mv_hp(mv); + if (!use_hp) { + if (mv->row & 1) + mv->row += (mv->row > 0 ? -1 : 1); + if (mv->col & 1) + mv->col += (mv->col > 0 ? -1 : 1); + } +} + + +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, + int_mv *mvlist, int_mv *nearest, int_mv *near) { + int i; + // Make sure all the candidates are properly clamped etc + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + lower_mv_precision(&mvlist[i].as_mv, allow_hp); + clamp_mv2(&mvlist[i].as_mv, xd); + } + *nearest = mvlist[0]; + *near = mvlist[1]; +} + +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, + int block, int ref, int mi_row, int mi_col, + int_mv *nearest, int_mv *near) { + int_mv mv_list[MAX_MV_REF_CANDIDATES]; + MODE_INFO *const mi = xd->mi[0]; + b_mode_info *bmi = mi->bmi; + int n; + + assert(MAX_MV_REF_CANDIDATES == 2); + + find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block, + mi_row, mi_col); + + near->as_int = 0; + switch (block) { + case 0: + nearest->as_int = mv_list[0].as_int; + near->as_int = mv_list[1].as_int; + break; + case 1: + case 2: + nearest->as_int = bmi[0].as_mv[ref].as_int; + for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n) + if (nearest->as_int != mv_list[n].as_int) { + near->as_int = mv_list[n].as_int; + break; + } + break; + case 3: { + int_mv candidates[2 + MAX_MV_REF_CANDIDATES]; + candidates[0] = bmi[1].as_mv[ref]; + candidates[1] = bmi[0].as_mv[ref]; + candidates[2] = mv_list[0]; + candidates[3] = mv_list[1]; + + nearest->as_int = bmi[2].as_mv[ref].as_int; + for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n) + if (nearest->as_int != candidates[n].as_int) { + near->as_int = candidates[n].as_int; + break; + } + break; + } + default: + assert("Invalid block index."); + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h index ce4c55983be..903ac02bb65 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h @@ -7,29 +7,46 @@ * in the file PATENTS. 
All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ +#define VP9_COMMON_VP9_MVREF_COMMON_H_ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_blockd.h" -#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ -#define VP9_COMMON_VP9_MVREF_COMMON_H_ +#ifdef __cplusplus +extern "C" { +#endif + +#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\ + VP9_INTERP_EXTEND) << 3) -void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int block_idx, - int mi_row, int mi_col); - -static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int mi_row, int mi_col) { - vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame, - mv_ref_list, -1, mi_row, mi_col); +// TODO(jingning): this mv clamping function should be block size dependent. +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, int mi_row, int mi_col); + +// check a list of motion vectors by sad score using a number rows of pixels +// above and a number cols of pixels in the left to select the one with best +// score to use as ref motion vector +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, + int_mv *mvlist, int_mv *nearest, int_mv *near); + +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, + int block, int ref, int mi_row, int mi_col, + int_mv *nearest, int_mv *near); + +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyx.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyx.h deleted file mode 100644 index acb4724e5f7..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyx.h +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_COMMON_VP9_ONYX_H_ -#define VP9_COMMON_VP9_ONYX_H_ - -#ifdef __cplusplus -extern "C" -{ // NOLINT -#endif - -#include "./vpx_config.h" -#include "vpx/internal/vpx_codec_internal.h" -#include "vpx/vp8cx.h" -#include "vpx_scale/yv12config.h" -#include "vp9/common/vp9_ppflags.h" - -#define MAX_SEGMENTS 8 - - typedef int *VP9_PTR; - - /* Create/destroy static data structures. 
*/ - - typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 - } VPX_SCALING; - - typedef enum { - VP9_LAST_FLAG = 1, - VP9_GOLD_FLAG = 2, - VP9_ALT_FLAG = 4 - } VP9_REFFRAME; - - - typedef enum { - USAGE_STREAM_FROM_SERVER = 0x0, - USAGE_LOCAL_FILE_PLAYBACK = 0x1, - USAGE_CONSTRAINED_QUALITY = 0x2, - USAGE_CONSTANT_QUALITY = 0x3, - } END_USAGE; - - - typedef enum { - MODE_GOODQUALITY = 0x1, - MODE_BESTQUALITY = 0x2, - MODE_FIRSTPASS = 0x3, - MODE_SECONDPASS = 0x4, - MODE_SECONDPASS_BEST = 0x5, - } MODE; - - typedef enum { - FRAMEFLAGS_KEY = 1, - FRAMEFLAGS_GOLDEN = 2, - FRAMEFLAGS_ALTREF = 4, - } FRAMETYPE_FLAGS; - - typedef struct { - int version; // 4 versions of bitstream defined: - // 0 - best quality/slowest decode, - // 3 - lowest quality/fastest decode - int width; // width of data passed to the compressor - int height; // height of data passed to the compressor - double framerate; // set to passed in framerate - int64_t target_bandwidth; // bandwidth to be used in kilobits per second - - int noise_sensitivity; // pre processing blur: recommendation 0 - int Sharpness; // sharpening output: recommendation 0: - int cpu_used; - unsigned int rc_max_intra_bitrate_pct; - - // mode -> - // (0)=Realtime/Live Encoding. This mode is optimized for realtime - // encoding (for example, capturing a television signal or feed from - // a live camera). ( speed setting controls how fast ) - // (1)=Good Quality Fast Encoding. The encoder balances quality with the - // amount of time it takes to encode the output. ( speed setting - // controls how fast ) - // (2)=One Pass - Best Quality. The encoder places priority on the - // quality of the output over encoding speed. The output is compressed - // at the highest possible quality. This option takes the longest - // amount of time to encode. ( speed setting ignored ) - // (3)=Two Pass - First Pass. The encoder generates a file of statistics - // for use in the second encoding pass. ( speed setting controls how - // fast ) - // (4)=Two Pass - Second Pass. The encoder uses the statistics that were - // generated in the first encoding pass to create the compressed - // output. ( speed setting controls how fast ) - // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that - // were generated in the first encoding pass to create the compressed - // output using the highest possible quality, and taking a - // longer amount of time to encode.. ( speed setting ignored ) - int Mode; - - // Key Framing Operations - int auto_key; // autodetect cut scenes and set the keyframes - int key_freq; // maximum distance to key frame. 
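The removed VP9_REFFRAME and FRAMETYPE_FLAGS enums are bit flags rather than plain enumerations, so one word can name several reference frames or frame properties at once. A minimal illustration, with values copied from the deleted header:

enum { VP9_LAST_FLAG = 1, VP9_GOLD_FLAG = 2, VP9_ALT_FLAG = 4 };

/* e.g. "refresh golden and altref, leave last alone": */
static const int refresh_flags = VP9_GOLD_FLAG | VP9_ALT_FLAG;

static int refreshes_golden(int flags) {
  return (flags & VP9_GOLD_FLAG) != 0;
}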
- - int allow_lag; // allow lagged compression (if 0 lagin frames is ignored) - int lag_in_frames; // how many frames lag before we start encoding - - // ---------------------------------------------------------------- - // DATARATE CONTROL OPTIONS - - int end_usage; // vbr or cbr - - // buffer targeting aggressiveness - int under_shoot_pct; - int over_shoot_pct; - - // buffering parameters - int64_t starting_buffer_level; // in seconds - int64_t optimal_buffer_level; - int64_t maximum_buffer_size; - - // controlling quality - int fixed_q; - int worst_allowed_q; - int best_allowed_q; - int cq_level; - int lossless; - - // two pass datarate control - int two_pass_vbrbias; // two pass datarate control tweaks - int two_pass_vbrmin_section; - int two_pass_vbrmax_section; - // END DATARATE CONTROL OPTIONS - // ---------------------------------------------------------------- - - // Spatial scalability - int ss_number_layers; - - // these parameters aren't to be used in final build don't use!!! - int play_alternate; - int alt_freq; - - int encode_breakout; // early breakout : for video conf recommend 800 - - /* Bitfield defining the error resiliency features to enable. - * Can provide decodable frames after losses in previous - * frames and decodable partitions after losses in the same frame. - */ - unsigned int error_resilient_mode; - - /* Bitfield defining the parallel decoding mode where the - * decoding in successive frames may be conducted in parallel - * just by decoding the frame headers. - */ - unsigned int frame_parallel_decoding_mode; - - int arnr_max_frames; - int arnr_strength; - int arnr_type; - - int tile_columns; - int tile_rows; - - struct vpx_fixed_buf two_pass_stats_in; - struct vpx_codec_pkt_list *output_pkt_list; - - vp8e_tuning tuning; - } VP9_CONFIG; - - - void vp9_initialize_enc(); - - VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf); - void vp9_remove_compressor(VP9_PTR *comp); - - void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf); - - // receive a frames worth of data. caller can assume that a copy of this - // frame is made and not just a copy of the pointer.. 
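For context, a hypothetical caller of the interface this deleted header declared (an equivalent compressor API lives on in the encoder side of the tree); an untested sketch built only from the declarations shown here, error handling omitted:

/* Sketch only: VP9_CONFIG fields as in the struct above. */
VP9_CONFIG oxcf = { 0 };          /* version/width/height/framerate etc. */
VP9_PTR enc;

void encode_setup(void) {
  vp9_initialize_enc();
  enc = vp9_create_compressor(&oxcf);
}

void encode_teardown(void) {
  vp9_remove_compressor(&enc);    /* takes a VP9_PTR * so it can clear it */
}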
- int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags, - YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); - - int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags, - unsigned long *size, unsigned char *dest, - int64_t *time_stamp, int64_t *time_end, - int flush); - - int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags); - - int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags); - - int vp9_update_reference(VP9_PTR comp, int ref_frame_flags); - - int vp9_copy_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd); - - int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb); - - int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd); - - int vp9_update_entropy(VP9_PTR comp, int update); - - int vp9_set_roimap(VP9_PTR comp, unsigned char *map, - unsigned int rows, unsigned int cols, - int delta_q[MAX_SEGMENTS], - int delta_lf[MAX_SEGMENTS], - unsigned int threshold[MAX_SEGMENTS]); - - int vp9_set_active_map(VP9_PTR comp, unsigned char *map, - unsigned int rows, unsigned int cols); - - int vp9_set_internal_size(VP9_PTR comp, - VPX_SCALING horiz_mode, VPX_SCALING vert_mode); - - int vp9_set_size_literal(VP9_PTR comp, unsigned int width, - unsigned int height); - - int vp9_switch_layer(VP9_PTR comp, int layer); - - void vp9_set_svc(VP9_PTR comp, int use_svc); - - int vp9_get_quantizer(VP9_PTR c); - -#ifdef __cplusplus -} -#endif - -#endif // VP9_COMMON_VP9_ONYX_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h index ba2e9d87da9..20de434148e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h @@ -18,6 +18,7 @@ #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_frame_buffers.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_tile_common.h" @@ -25,62 +26,42 @@ #include "vp9/common/vp9_postproc.h" #endif -#define ALLOWED_REFS_PER_FRAME 3 +#ifdef __cplusplus +extern "C" { +#endif + +#define REFS_PER_FRAME 3 -#define NUM_REF_FRAMES_LOG2 3 -#define NUM_REF_FRAMES (1 << NUM_REF_FRAMES_LOG2) +#define REF_FRAMES_LOG2 3 +#define REF_FRAMES (1 << REF_FRAMES_LOG2) // 1 scratch frame for the new frame, 3 for scaled references on the encoder // TODO(jkoleszar): These 3 extra references could probably come from the // normal reference pool. 
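Pool arithmetic with the renamed constants: REF_FRAMES is the number of reference slots the bitstream can address, and the FRAME_BUFFERS definition that follows adds the four extra buffers this TODO refers to.

#define REF_FRAMES_LOG2 3
#define REF_FRAMES (1 << REF_FRAMES_LOG2)   /* 8 addressable reference slots  */
#define FRAME_BUFFERS (REF_FRAMES + 4)      /* + 1 scratch + 3 scaled == 12   */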
-#define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 4) - -#define NUM_FRAME_CONTEXTS_LOG2 2 -#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LOG2) - -typedef struct frame_contexts { - vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; - vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; - vp9_prob partition_prob[FRAME_TYPES][PARTITION_CONTEXTS][PARTITION_TYPES - 1]; - vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] - [SWITCHABLE_FILTERS - 1]; - vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; - vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; - vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; - vp9_prob single_ref_prob[REF_CONTEXTS][2]; - vp9_prob comp_ref_prob[REF_CONTEXTS]; - struct tx_probs tx_probs; - vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; - nmv_context nmvc; -} FRAME_CONTEXT; +#define FRAME_BUFFERS (REF_FRAMES + 4) -typedef struct { - unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; - unsigned int uv_mode[INTRA_MODES][INTRA_MODES]; - unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES]; - vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES]; - unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES] - [COEF_BANDS][PREV_COEF_CONTEXTS]; - unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] - [SWITCHABLE_FILTERS]; - unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; - unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; - unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; - unsigned int single_ref[REF_CONTEXTS][2][2]; - unsigned int comp_ref[REF_CONTEXTS][2]; - struct tx_counts tx; - unsigned int mbskip[MBSKIP_CONTEXTS][2]; - nmv_context_counts mv; -} FRAME_COUNTS; +#define FRAME_CONTEXTS_LOG2 2 +#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) + +extern const struct { + PARTITION_CONTEXT above; + PARTITION_CONTEXT left; +} partition_context_lookup[BLOCK_SIZES]; typedef enum { - SINGLE_PREDICTION_ONLY = 0, - COMP_PREDICTION_ONLY = 1, - HYBRID_PREDICTION = 2, - NB_PREDICTION_TYPES = 3, -} COMPPREDMODE_TYPE; + SINGLE_REFERENCE = 0, + COMPOUND_REFERENCE = 1, + REFERENCE_MODE_SELECT = 2, + REFERENCE_MODES = 3, +} REFERENCE_MODE; + + +typedef struct { + int ref_count; + vpx_codec_frame_buffer_t raw_frame_buffer; + YV12_BUFFER_CONFIG buf; +} RefCntBuffer; typedef struct VP9Common { struct vpx_internal_error_info error; @@ -108,17 +89,16 @@ typedef struct VP9Common { YV12_BUFFER_CONFIG *frame_to_show; - YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS]; - int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; /* reference counts */ - int ref_frame_map[NUM_REF_FRAMES]; /* maps fb_idx to reference slot */ + RefCntBuffer frame_bufs[FRAME_BUFFERS]; + + int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */ // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and // roll new_fb_idx into it. - // Each frame can reference ALLOWED_REFS_PER_FRAME buffers - int active_ref_idx[ALLOWED_REFS_PER_FRAME]; - struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME]; - struct scale_factors_common active_ref_scale_comm[ALLOWED_REFS_PER_FRAME]; + // Each frame can reference REFS_PER_FRAME buffers + RefBuffer frame_refs[REFS_PER_FRAME]; + int new_fb_idx; YV12_BUFFER_CONFIG post_proc_buffer; @@ -128,6 +108,7 @@ typedef struct VP9Common { int show_frame; int last_show_frame; + int show_existing_frame; // Flag signaling that the frame is encoded using only INTRA modes. int intra_only; @@ -139,13 +120,12 @@ typedef struct VP9Common { // frame header, 3 reset all contexts. 
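The RefCntBuffer refactor above replaces the parallel yv12_fb[] and fb_idx_ref_cnt[] arrays with one struct per slot, so the reference count, the application-owned raw buffer, and the codec's YV12 view of it can no longer drift apart. A hypothetical release helper built on the new layout (the real release path goes through release_fb_cb):

static void release_slot(RefCntBuffer *bufs, int idx) {
  if (idx >= 0 && bufs[idx].ref_count > 0) {
    if (--bufs[idx].ref_count == 0) {
      /* last owner gone: bufs[idx].raw_frame_buffer may now be handed
       * back to the application */
    }
  }
}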
int reset_frame_context; - int frame_flags; // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in // MODE_INFO (8-pixel) units. int MBs; int mb_rows, mi_rows; int mb_cols, mi_cols; - int mode_info_stride; + int mi_stride; /* profile settings */ TX_MODE tx_mode; @@ -175,7 +155,7 @@ typedef struct VP9Common { // Persistent mb segment id map used in prediction. unsigned char *last_frame_seg_map; - INTERPOLATION_TYPE mcomp_filter_type; + INTERP_FILTER interp_filter; loop_filter_info_n lf_info; @@ -190,15 +170,18 @@ typedef struct VP9Common { int allow_comp_inter_inter; MV_REFERENCE_FRAME comp_fixed_ref; MV_REFERENCE_FRAME comp_var_ref[2]; - COMPPREDMODE_TYPE comp_pred_mode; + REFERENCE_MODE reference_mode; FRAME_CONTEXT fc; /* this frame entropy */ - FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS]; + FRAME_CONTEXT frame_contexts[FRAME_CONTEXTS]; unsigned int frame_context_idx; /* Context to use/update */ FRAME_COUNTS counts; unsigned int current_video_frame; - int version; + BITSTREAM_PROFILE profile; + + // BITS_8 in versions 0 and 1, BITS_10 or BITS_12 in version 2 + BIT_DEPTH bit_depth; #if CONFIG_VP9_POSTPROC struct postproc_state postproc_state; @@ -207,63 +190,89 @@ typedef struct VP9Common { int error_resilient_mode; int frame_parallel_decoding_mode; + // Flag indicates if prev_mi can be used in coding: + // 0: encoder assumes decoder does not have prev_mi + // 1: encoder assumes decoder has and uses prev_mi + unsigned int coding_use_prev_mi; + int log2_tile_cols, log2_tile_rows; -} VP9_COMMON; -// ref == 0 => LAST_FRAME -// ref == 1 => GOLDEN_FRAME -// ref == 2 => ALTREF_FRAME -static YV12_BUFFER_CONFIG *get_frame_ref_buffer(VP9_COMMON *cm, int ref) { - return &cm->yv12_fb[cm->active_ref_idx[ref]]; -} + // Private data associated with the frame buffer callbacks. + void *cb_priv; + vpx_get_frame_buffer_cb_fn_t get_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_fb_cb; + + // Handles memory for the codec. 
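The new cb_priv / get_fb_cb / release_fb_cb members let the application own frame-buffer memory. A minimal pair of callbacks, assuming the vpx_codec_frame_buffer_t layout (data, size, priv) from vpx/vpx_frame_buffer.h; a sketch, not the canonical implementation:

#include <stdlib.h>
#include "vpx/vpx_frame_buffer.h"

static int app_get_fb(void *priv, size_t min_size,
                      vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  fb->data = (uint8_t *)calloc(1, min_size);  /* zeroed defensively */
  if (fb->data == NULL) return -1;
  fb->size = min_size;
  fb->priv = NULL;
  return 0;
}

static int app_release_fb(void *priv, vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  free(fb->data);
  fb->data = NULL;
  return 0;
}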
+ InternalFrameBufferList int_frame_buffers; + + PARTITION_CONTEXT *above_seg_context; + ENTROPY_CONTEXT *above_context; +} VP9_COMMON; -static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { - return &cm->yv12_fb[cm->new_fb_idx]; +static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { + return &cm->frame_bufs[cm->new_fb_idx].buf; } -static int get_free_fb(VP9_COMMON *cm) { +static INLINE int get_free_fb(VP9_COMMON *cm) { int i; - for (i = 0; i < NUM_YV12_BUFFERS; i++) - if (cm->fb_idx_ref_cnt[i] == 0) + for (i = 0; i < FRAME_BUFFERS; i++) + if (cm->frame_bufs[i].ref_count == 0) break; - assert(i < NUM_YV12_BUFFERS); - cm->fb_idx_ref_cnt[i] = 1; + assert(i < FRAME_BUFFERS); + cm->frame_bufs[i].ref_count = 1; return i; } -static void ref_cnt_fb(int *buf, int *idx, int new_idx) { - if (buf[*idx] > 0) - buf[*idx]--; +static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { + const int ref_index = *idx; + + if (ref_index >= 0 && bufs[ref_index].ref_count > 0) + bufs[ref_index].ref_count--; *idx = new_idx; - buf[new_idx]++; + bufs[new_idx].ref_count++; } -static int mi_cols_aligned_to_sb(int n_mis) { +static INLINE int mi_cols_aligned_to_sb(int n_mis) { return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); } -static INLINE void set_skip_context( - MACROBLOCKD *xd, - ENTROPY_CONTEXT *above_context[MAX_MB_PLANE], - ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16], - int mi_row, int mi_col) { +static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) { + int i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + xd->plane[i].dqcoeff = xd->dqcoeff[i]; + xd->above_context[i] = cm->above_context + + i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols); + } + + xd->above_seg_context = cm->above_seg_context; + xd->mi_stride = cm->mi_stride; +} + +static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm, + int ctx) { + return cm->frame_type == KEY_FRAME ? 
vp9_kf_partition_probs[ctx] + : cm->fc.partition_prob[ctx]; +} + +static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) { const int above_idx = mi_col * 2; const int left_idx = (mi_row * 2) & 15; int i; - for (i = 0; i < MAX_MB_PLANE; i++) { + for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblockd_plane *const pd = &xd->plane[i]; - pd->above_context = above_context[i] + (above_idx >> pd->subsampling_x); - pd->left_context = left_context[i] + (left_idx >> pd->subsampling_y); + pd->above_context = &xd->above_context[i][above_idx >> pd->subsampling_x]; + pd->left_context = &xd->left_context[i][left_idx >> pd->subsampling_y]; } } -static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, - int mi_row, int bh, - int mi_col, int bw, - int mi_rows, int mi_cols) { +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, + int mi_col, int bw, + int mi_rows, int mi_cols) { xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); @@ -274,73 +283,63 @@ static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, xd->left_available = (mi_col > tile->mi_col_start); } -static void set_prev_mi(VP9_COMMON *cm) { +static INLINE void set_prev_mi(VP9_COMMON *cm) { const int use_prev_in_find_mv_refs = cm->width == cm->last_width && cm->height == cm->last_height && - !cm->error_resilient_mode && !cm->intra_only && cm->last_show_frame; // Special case: set prev_mi to NULL when the previous mode info // context cannot be used. cm->prev_mi = use_prev_in_find_mv_refs ? - cm->prev_mip + cm->mode_info_stride + 1 : NULL; + cm->prev_mip + cm->mi_stride + 1 : NULL; } static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { return cm->frame_type == KEY_FRAME || cm->intra_only; } -static INLINE void update_partition_context( - PARTITION_CONTEXT *above_seg_context, - PARTITION_CONTEXT left_seg_context[8], - int mi_row, int mi_col, - BLOCK_SIZE sb_type, - BLOCK_SIZE sb_size) { - PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col; - PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK); - - const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; - const int bwl = b_width_log2(sb_type); - const int bhl = b_height_log2(sb_type); - const int boffset = b_width_log2(BLOCK_64X64) - bsl; - const char pcval0 = ~(0xe << boffset); - const char pcval1 = ~(0xf << boffset); - const char pcvalue[2] = {pcval0, pcval1}; - - assert(MAX(bwl, bhl) <= bsl); +static INLINE void update_partition_context(MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE subsize, + BLOCK_SIZE bsize) { + PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col; + PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK); + + // num_4x4_blocks_wide_lookup[bsize] / 2 + const int bs = num_8x8_blocks_wide_lookup[bsize]; // update the partition context at the end notes. set partition bits // of block sizes larger than the current one to be one, and partition // bits of smaller block sizes to be zero. 
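set_mi_row_col() above keeps the distances from the block to the frame edges in 1/8-pel units so they compare directly against motion vectors: MI_SIZE is 8 pixels, and the extra * 8 converts pixels to eighth-pels. A worked example:

/* Block at mi_row == 4 with height bh == 8 MI units, frame mi_rows == 64: */
int mb_to_top_edge    = -((4 * 8) * 8);          /* -256: 32 px of room above  */
int mb_to_bottom_edge = ((64 - 8 - 4) * 8) * 8;  /* 3328: 416 px of room below */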
- vpx_memset(above_ctx, pcvalue[bwl == bsl], bs); - vpx_memset(left_ctx, pcvalue[bhl == bsl], bs); + vpx_memset(above_ctx, partition_context_lookup[subsize].above, bs); + vpx_memset(left_ctx, partition_context_lookup[subsize].left, bs); } -static INLINE int partition_plane_context( - const PARTITION_CONTEXT *above_seg_context, - const PARTITION_CONTEXT left_seg_context[8], - int mi_row, int mi_col, - BLOCK_SIZE sb_type) { - const PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col; - const PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK); +static INLINE int partition_plane_context(const MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col; + const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK); - int bsl = mi_width_log2(sb_type), bs = 1 << bsl; + const int bsl = mi_width_log2(bsize); + const int bs = 1 << bsl; int above = 0, left = 0, i; - int boffset = mi_width_log2(BLOCK_64X64) - bsl; - assert(mi_width_log2(sb_type) == mi_height_log2(sb_type)); + assert(b_width_log2(bsize) == b_height_log2(bsize)); assert(bsl >= 0); - assert(boffset >= 0); - - for (i = 0; i < bs; i++) - above |= (above_ctx[i] & (1 << boffset)); - for (i = 0; i < bs; i++) - left |= (left_ctx[i] & (1 << boffset)); - above = (above > 0); - left = (left > 0); + for (i = 0; i < bs; i++) { + above |= above_ctx[i]; + left |= left_ctx[i]; + } + above = (above & bs) > 0; + left = (left & bs) > 0; return (left * 2 + above) + bsl * PARTITION_PLOFFSET; } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ONYXC_INT_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c index 212a28ab976..9f3210479ea 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c @@ -13,69 +13,18 @@ #include <stdio.h> #include "./vpx_config.h" -#include "vpx_scale/yv12config.h" -#include "vp9/common/vp9_postproc.h" -#include "vp9/common/vp9_textblit.h" -#include "vpx_scale/vpx_scale.h" -#include "vp9/common/vp9_systemdependent.h" -#include "./vp9_rtcd.h" #include "./vpx_scale_rtcd.h" +#include "./vp9_rtcd.h" -#define RGB_TO_YUV(t) \ - ( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \ - (0.098*(float)(t & 0xff)) + 16), \ - (-(0.148*(float)(t >> 16)) - (0.291*(float)(t >> 8 & 0xff)) + \ - (0.439*(float)(t & 0xff)) + 128), \ - ( (0.439*(float)(t >> 16)) - (0.368*(float)(t >> 8 & 0xff)) - \ - (0.071*(float)(t & 0xff)) + 128) - -/* global constants */ -#if 0 && CONFIG_POSTPROC_VISUALIZER -static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = { - { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */ - { RGB_TO_YUV(0x00FF00) }, /* Green */ - { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */ - { RGB_TO_YUV(0x8F0000) }, /* Dark Red */ - { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */ - { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */ - { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */ - { RGB_TO_YUV(0x8F0000) }, /* Dark Red */ - { RGB_TO_YUV(0x8F0000) }, /* Dark Red */ - { RGB_TO_YUV(0x228B22) }, /* ForestGreen */ - { RGB_TO_YUV(0x006400) }, /* DarkGreen */ - { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */ - { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */ - { RGB_TO_YUV(0x00008B) }, /* Dark blue */ - { RGB_TO_YUV(0x551A8B) }, /* Purple */ - { RGB_TO_YUV(0xFF0000) } /* Red */ - { RGB_TO_YUV(0xCC33FF) }, /* Magenta */ -}; - -static const 
unsigned char B_PREDICTION_MODE_colors[INTRA_MODES][3] = { - { RGB_TO_YUV(0x6633ff) }, /* Purple */ - { RGB_TO_YUV(0xcc33ff) }, /* Magenta */ - { RGB_TO_YUV(0xff33cc) }, /* Pink */ - { RGB_TO_YUV(0xff3366) }, /* Coral */ - { RGB_TO_YUV(0x3366ff) }, /* Blue */ - { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */ - { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */ - { RGB_TO_YUV(0xff6633) }, /* Orange */ - { RGB_TO_YUV(0x33ccff) }, /* Light Blue */ - { RGB_TO_YUV(0x8ab800) }, /* Green */ - { RGB_TO_YUV(0xffcc33) }, /* Light Orange */ - { RGB_TO_YUV(0x33ffcc) }, /* Aqua */ - { RGB_TO_YUV(0x66ff33) }, /* Light Green */ - { RGB_TO_YUV(0xccff33) }, /* Yellow */ -}; +#include "vpx_scale/vpx_scale.h" +#include "vpx_scale/yv12config.h" -static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = { - { RGB_TO_YUV(0x00ff00) }, /* Blue */ - { RGB_TO_YUV(0x0000ff) }, /* Green */ - { RGB_TO_YUV(0xffff00) }, /* Yellow */ - { RGB_TO_YUV(0xff0000) }, /* Red */ -}; -#endif +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_postproc.h" +#include "vp9/common/vp9_systemdependent.h" +#include "vp9/common/vp9_textblit.h" +#if CONFIG_VP9_POSTPROC static const short kernel5[] = { 1, 1, 4, 1, 1 }; @@ -127,9 +76,6 @@ const short vp9_rv[] = { 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, }; - -/**************************************************************************** - */ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, @@ -371,7 +317,7 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, } } -double vp9_gaussian(double sigma, double mu, double x) { +static double gaussian(double sigma, double mu, double x) { return 1 / (sigma * sqrt(2.0 * 3.14159265)) * (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); } @@ -396,7 +342,7 @@ static void fillrd(struct postproc_state *state, int q, int a) { next = 0; for (i = -32; i < 32; i++) { - int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i)); + int a = (int)(0.5 + 256 * gaussian(sigma, 0, i)); if (a) { for (j = 0; j < a; j++) { @@ -425,27 +371,6 @@ static void fillrd(struct postproc_state *state, int q, int a) { state->last_noise = a; } -/**************************************************************************** - * - * ROUTINE : plane_add_noise_c - * - * INPUTS : unsigned char *Start starting address of buffer to - * add gaussian noise to - * unsigned int width width of plane - * unsigned int height height of plane - * int pitch distance between subsequent lines of frame - * int q quantizer used to determine amount of noise - * to add - * - * OUTPUTS : None. - * - * RETURNS : void. - * - * FUNCTION : adds gaussian noise to a plane of pixels - * - * SPECIAL NOTES : None. - * - ****************************************************************************/ void vp9_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16], char whiteclamp[16], @@ -469,540 +394,45 @@ void vp9_plane_add_noise_c(uint8_t *start, char *noise, } } -/* Blend the macro block with a solid colored square. Leave the - * edges unblended to give distinction to macro blocks in areas - * filled with the same color block. 
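The noise path above works by precomputing a lookup table: fillrd() fills state->noise so that each value appears in proportion to the Gaussian density at that value, and vp9_plane_add_noise() then only has to index it. A condensed, self-contained version of that table fill (the real fillrd() derives sigma from the quantizer and clamps it):

#include <math.h>

static double gaussian(double sigma, double mu, double x) {
  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
}

/* Fill 'noise' so value i in [-32, 32) occurs about 256*density(i) times. */
static void fill_noise_table(char *noise, int size, double sigma) {
  int next = 0, i, j;
  for (i = -32; i < 32 && next < size; ++i) {
    const int count = (int)(0.5 + 256 * gaussian(sigma, 0, i));
    for (j = 0; j < count && next < size; ++j)
      noise[next++] = (char)i;
  }
  while (next < size)  /* pad any remainder with zeros */
    noise[next++] = 0;
}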
- */ -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, - int y1, int u1, int v1, int alpha, int stride) { - int i, j; - int y1_const = y1 * ((1 << 16) - alpha); - int u1_const = u1 * ((1 << 16) - alpha); - int v1_const = v1 * ((1 << 16) - alpha); - - y += 2 * stride + 2; - for (i = 0; i < 12; i++) { - for (j = 0; j < 12; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - u += stride + 1; - v += stride + 1; - - for (i = 0; i < 6; i++) { - for (j = 0; j < 6; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - -/* Blend only the edge of the macro block. Leave center - * unblended to allow for other visualizations to be layered. - */ -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, - int y1, int u1, int v1, int alpha, int stride) { - int i, j; - int y1_const = y1 * ((1 << 16) - alpha); - int u1_const = u1 * ((1 << 16) - alpha); - int v1_const = v1 * ((1 << 16) - alpha); - - for (i = 0; i < 2; i++) { - for (j = 0; j < 16; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - for (i = 0; i < 12; i++) { - y[0] = (y[0] * alpha + y1_const) >> 16; - y[1] = (y[1] * alpha + y1_const) >> 16; - y[14] = (y[14] * alpha + y1_const) >> 16; - y[15] = (y[15] * alpha + y1_const) >> 16; - y += stride; - } - - for (i = 0; i < 2; i++) { - for (j = 0; j < 16; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (j = 0; j < 8; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - - for (i = 0; i < 6; i++) { - u[0] = (u[0] * alpha + u1_const) >> 16; - v[0] = (v[0] * alpha + v1_const) >> 16; - - u[7] = (u[7] * alpha + u1_const) >> 16; - v[7] = (v[7] * alpha + v1_const) >> 16; - - u += stride; - v += stride; - } - - for (j = 0; j < 8; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } -} - -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, - int y1, int u1, int v1, int alpha, int stride) { - int i, j; - int y1_const = y1 * ((1 << 16) - alpha); - int u1_const = u1 * ((1 << 16) - alpha); - int v1_const = v1 * ((1 << 16) - alpha); - - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (i = 0; i < 2; i++) { - for (j = 0; j < 2; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - -static void constrain_line(int x0, int *x1, int y0, int *y1, - int width, int height) { - int dx; - int dy; - - if (*x1 > width) { - dx = *x1 - x0; - dy = *y1 - y0; - - *x1 = width; - if (dx) - *y1 = ((width - x0) * dy) / dx + y0; - } - if (*x1 < 0) { - dx = *x1 - x0; - dy = *y1 - y0; - - *x1 = 0; - if (dx) - *y1 = ((0 - x0) * dy) / dx + y0; - } - if (*y1 > height) { - dx = *x1 - x0; - dy = *y1 - y0; - - *y1 = height; - if (dy) - *x1 = ((height - y0) * dx) / dy + x0; - } - if (*y1 < 0) { - dx = *x1 - x0; - dy = *y1 - y0; - - *y1 = 0; - if (dy) - *x1 = ((0 - y0) * dx) / dy + x0; - } -} - int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) { - int q = cm->lf.filter_level * 10 / 6; - int flags = ppflags->post_proc_flag; - int deblock_level = ppflags->deblocking_level; - int noise_level = ppflags->noise_level; + const int q = MIN(63, cm->lf.filter_level * 10 / 6); + const int flags = 
ppflags->post_proc_flag; + YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer; + struct postproc_state *const ppstate = &cm->postproc_state; if (!cm->frame_to_show) return -1; - if (q > 63) - q = 63; - if (!flags) { *dest = *cm->frame_to_show; return 0; } -#if ARCH_X86||ARCH_X86_64 - vpx_reset_mmx_state(); -#endif + vp9_clear_system_state(); if (flags & VP9D_DEMACROBLOCK) { - deblock_and_de_macro_block(cm->frame_to_show, &cm->post_proc_buffer, - q + (deblock_level - 5) * 10, 1, 0); + deblock_and_de_macro_block(cm->frame_to_show, ppbuf, + q + (ppflags->deblocking_level - 5) * 10, 1, 0); } else if (flags & VP9D_DEBLOCK) { - vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, q); + vp9_deblock(cm->frame_to_show, ppbuf, q); } else { - vp8_yv12_copy_frame(cm->frame_to_show, &cm->post_proc_buffer); + vp8_yv12_copy_frame(cm->frame_to_show, ppbuf); } if (flags & VP9D_ADDNOISE) { - if (cm->postproc_state.last_q != q - || cm->postproc_state.last_noise != noise_level) { - fillrd(&cm->postproc_state, 63 - q, noise_level); - } - - vp9_plane_add_noise(cm->post_proc_buffer.y_buffer, - cm->postproc_state.noise, - cm->postproc_state.blackclamp, - cm->postproc_state.whiteclamp, - cm->postproc_state.bothclamp, - cm->post_proc_buffer.y_width, - cm->post_proc_buffer.y_height, - cm->post_proc_buffer.y_stride); - } - -#if 0 && CONFIG_POSTPROC_VISUALIZER - if (flags & VP9D_DEBUG_TXT_FRAME_INFO) { - char message[512]; - snprintf(message, sizeof(message) -1, - "F%1dG%1dQ%3dF%3dP%d_s%dx%d", - (cm->frame_type == KEY_FRAME), - cm->refresh_golden_frame, - cm->base_qindex, - cm->filter_level, - flags, - cm->mb_cols, cm->mb_rows); - vp9_blit_text(message, cm->post_proc_buffer.y_buffer, - cm->post_proc_buffer.y_stride); - } - - if (flags & VP9D_DEBUG_TXT_MBLK_MODES) { - int i, j; - uint8_t *y_ptr; - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int mb_rows = post->y_height >> 4; - int mb_cols = post->y_width >> 4; - int mb_index = 0; - MODE_INFO *mi = cm->mi; - - y_ptr = post->y_buffer + 4 * post->y_stride + 4; - - /* vp9_filter each macro block */ - for (i = 0; i < mb_rows; i++) { - for (j = 0; j < mb_cols; j++) { - char zz[4]; - - snprintf(zz, sizeof(zz) - 1, "%c", mi[mb_index].mbmi.mode + 'a'); - - vp9_blit_text(zz, y_ptr, post->y_stride); - mb_index++; - y_ptr += 16; - } - - mb_index++; /* border */ - y_ptr += post->y_stride * 16 - post->y_width; - } - } - - if (flags & VP9D_DEBUG_TXT_DC_DIFF) { - int i, j; - uint8_t *y_ptr; - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int mb_rows = post->y_height >> 4; - int mb_cols = post->y_width >> 4; - int mb_index = 0; - MODE_INFO *mi = cm->mi; - - y_ptr = post->y_buffer + 4 * post->y_stride + 4; - - /* vp9_filter each macro block */ - for (i = 0; i < mb_rows; i++) { - for (j = 0; j < mb_cols; j++) { - char zz[4]; - int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED && - mi[mb_index].mbmi.mode != SPLITMV && - mi[mb_index].mbmi.skip_coeff); - - if (cm->frame_type == KEY_FRAME) - snprintf(zz, sizeof(zz) - 1, "a"); - else - snprintf(zz, sizeof(zz) - 1, "%c", dc_diff + '0'); - - vp9_blit_text(zz, y_ptr, post->y_stride); - mb_index++; - y_ptr += 16; - } - - mb_index++; /* border */ - y_ptr += post->y_stride * 16 - post->y_width; + const int noise_level = ppflags->noise_level; + if (ppstate->last_q != q || + ppstate->last_noise != noise_level) { + fillrd(ppstate, 63 - q, noise_level); } - } - if (flags & VP9D_DEBUG_TXT_RATE_INFO) { - char message[512]; - snprintf(message, sizeof(message), - "Bitrate: %10.2f framerate: %10.2f ", - cm->bitrate, cm->framerate); - 
vp9_blit_text(message, cm->post_proc_buffer.y_buffer, - cm->post_proc_buffer.y_stride); + vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp, + ppstate->whiteclamp, ppstate->bothclamp, + ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride); } - /* Draw motion vectors */ - if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) { - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - uint8_t *y_buffer = cm->post_proc_buffer.y_buffer; - int y_stride = cm->post_proc_buffer.y_stride; - MODE_INFO *mi = cm->mi; - int x0, y0; - - for (y0 = 0; y0 < height; y0 += 16) { - for (x0 = 0; x0 < width; x0 += 16) { - int x1, y1; - - if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) { - mi++; - continue; - } - - if (mi->mbmi.mode == SPLITMV) { - switch (mi->mbmi.partitioning) { - case PARTITIONING_16X8 : { /* mv_top_bottom */ - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 8 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height); - vp9_blit_line(x0 + 8, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[8]; - - x1 = x0 + 8 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height); - vp9_blit_line(x0 + 8, x1, y0 + 12, y1, y_buffer, y_stride); - - break; - } - case PARTITIONING_8X16 : { /* mv_left_right */ - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 8 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height); - vp9_blit_line(x0 + 4, x1, y0 + 8, y1, y_buffer, y_stride); - - bmi = &mi->bmi[2]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 8 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height); - vp9_blit_line(x0 + 12, x1, y0 + 8, y1, y_buffer, y_stride); - - break; - } - case PARTITIONING_8X8 : { /* mv_quarters */ - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height); - vp9_blit_line(x0 + 4, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[2]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height); - vp9_blit_line(x0 + 12, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[8]; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height); - vp9_blit_line(x0 + 4, x1, y0 + 12, y1, y_buffer, y_stride); - - bmi = &mi->bmi[10]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height); - vp9_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride); - break; - } - case PARTITIONING_4X4: - default : { - union b_mode_info *bmi = mi->bmi; - int bx0, by0; - - for (by0 = y0; by0 < (y0 + 16); by0 += 4) { - for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) { - MV *mv = &bmi->mv.as_mv; - - x1 = bx0 + 2 + (mv->col >> 3); - y1 = by0 + 2 + (mv->row >> 3); - - constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height); - vp9_blit_line(bx0 + 2, x1, by0 + 2, y1, y_buffer, y_stride); - - bmi++; - } - } - } - } - } else if (is_inter_mode(mi->mbmi.mode)) { - MV *mv = &mi->mbmi.mv.as_mv; - const int lx0 = x0 + 8; - const int ly0 = y0 + 8; - - x1 = lx0 + (mv->col >> 3); - y1 = ly0 + (mv->row >> 3); - - if (x1 != lx0 && y1 != ly0) { - 
constrain_line(lx0, &x1, ly0 - 1, &y1, width, height); - vp9_blit_line(lx0, x1, ly0 - 1, y1, y_buffer, y_stride); - - constrain_line(lx0, &x1, ly0 + 1, &y1, width, height); - vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride); - } else { - vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride); - } - } - - mi++; - } - mi++; - } - } - - /* Color in block modes */ - if ((flags & VP9D_DEBUG_CLR_BLK_MODES) - && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) { - int y, x; - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - uint8_t *y_ptr = cm->post_proc_buffer.y_buffer; - uint8_t *u_ptr = cm->post_proc_buffer.u_buffer; - uint8_t *v_ptr = cm->post_proc_buffer.v_buffer; - int y_stride = cm->post_proc_buffer.y_stride; - MODE_INFO *mi = cm->mi; - - for (y = 0; y < height; y += 16) { - for (x = 0; x < width; x += 16) { - int Y = 0, U = 0, V = 0; - - if (mi->mbmi.mode == I4X4_PRED && - ((ppflags->display_mb_modes_flag & I4X4_PRED) || - ppflags->display_b_modes_flag)) { - int by, bx; - uint8_t *yl, *ul, *vl; - union b_mode_info *bmi = mi->bmi; - - yl = y_ptr + x; - ul = u_ptr + (x >> 1); - vl = v_ptr + (x >> 1); - - for (by = 0; by < 16; by += 4) { - for (bx = 0; bx < 16; bx += 4) { - if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode)) - || (ppflags->display_mb_modes_flag & I4X4_PRED)) { - Y = B_PREDICTION_MODE_colors[bmi->as_mode][0]; - U = B_PREDICTION_MODE_colors[bmi->as_mode][1]; - V = B_PREDICTION_MODE_colors[bmi->as_mode][2]; - - vp9_blend_b(yl + bx, ul + (bx >> 1), vl + (bx >> 1), Y, U, V, - 0xc000, y_stride); - } - bmi++; - } - - yl += y_stride * 4; - ul += y_stride * 1; - vl += y_stride * 1; - } - } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) { - Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0]; - U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1]; - V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2]; - - vp9_blend_mb_inner(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), - Y, U, V, 0xc000, y_stride); - } - - mi++; - } - y_ptr += y_stride * 16; - u_ptr += y_stride * 4; - v_ptr += y_stride * 4; - - mi++; - } - } - - /* Color in frame reference blocks */ - if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) && - ppflags->display_ref_frame_flag) { - int y, x; - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - uint8_t *y_ptr = cm->post_proc_buffer.y_buffer; - uint8_t *u_ptr = cm->post_proc_buffer.u_buffer; - uint8_t *v_ptr = cm->post_proc_buffer.v_buffer; - int y_stride = cm->post_proc_buffer.y_stride; - MODE_INFO *mi = cm->mi; - - for (y = 0; y < height; y += 16) { - for (x = 0; x < width; x += 16) { - int Y = 0, U = 0, V = 0; - - if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) { - Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0]; - U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1]; - V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2]; - - vp9_blend_mb_outer(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), - Y, U, V, 0xc000, y_stride); - } - - mi++; - } - y_ptr += y_stride * 16; - u_ptr += y_stride * 4; - v_ptr += y_stride * 4; - - mi++; - } - } -#endif - - *dest = cm->post_proc_buffer; + *dest = *ppbuf; /* handle problem with extending borders */ dest->y_width = cm->width; @@ -1012,3 +442,4 @@ int vp9_post_proc_frame(struct VP9Common *cm, return 0; } +#endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h index 
c63beae9dba..ebebc1ae346 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h @@ -13,6 +13,12 @@ #define VP9_COMMON_VP9_POSTPROC_H_ #include "vpx_ports/mem.h" +#include "vpx_scale/yv12config.h" +#include "vp9/common/vp9_ppflags.h" + +#ifdef __cplusplus +extern "C" { +#endif struct postproc_state { int last_q; @@ -23,8 +29,7 @@ struct postproc_state { DECLARE_ALIGNED(16, char, bothclamp[16]); }; -#include "vp9/common/vp9_onyxc_int.h" -#include "vp9/common/vp9_ppflags.h" +struct VP9Common; int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags); @@ -33,4 +38,8 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q); void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_POSTPROC_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h index 561c93028a4..1644a1bbbe8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h @@ -11,6 +11,10 @@ #ifndef VP9_COMMON_VP9_PPFLAGS_H_ #define VP9_COMMON_VP9_PPFLAGS_H_ +#ifdef __cplusplus +extern "C" { +#endif + enum { VP9D_NOFILTERING = 0, VP9D_DEBLOCK = 1 << 0, @@ -29,10 +33,10 @@ typedef struct { int post_proc_flag; int deblocking_level; int noise_level; - int display_ref_frame_flag; - int display_mb_modes_flag; - int display_b_modes_flag; - int display_mv_flag; } vp9_ppflags_t; +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_PPFLAGS_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pragmas.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pragmas.h index f079161d6b4..0efc713caaf 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pragmas.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pragmas.h @@ -11,6 +11,10 @@ #ifndef VP9_COMMON_VP9_PRAGMAS_H_ #define VP9_COMMON_VP9_PRAGMAS_H_ +#ifdef __cplusplus +extern "C" { +#endif + #ifdef __INTEL_COMPILER #pragma warning(disable:997 1011 170) #endif @@ -19,4 +23,8 @@ #pragma warning(disable:4799) #endif +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_PRAGMAS_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c index 57ca5c5da32..bc9d6ef5e84 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c @@ -14,134 +14,110 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_seg_common.h" -#include "vp9/common/vp9_treecoder.h" -static INLINE const MB_MODE_INFO *get_above_mbmi(const MODE_INFO *const above) { - return (above != NULL) ? &above->mbmi : NULL; -} - -static INLINE const MB_MODE_INFO *get_left_mbmi(const MODE_INFO *const left) { - return (left != NULL) ? &left->mbmi : NULL; +static INLINE const MB_MODE_INFO *get_mbmi(const MODE_INFO *const mi) { + return (mi != NULL) ? 
&mi->mbmi : NULL; } // Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; +int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. - // left - const int left_mv_pred = left_in_image ? is_inter_block(&left_mi->mbmi) - : 0; - const int left_interp = left_in_image && left_mv_pred - ? left_mi->mbmi.interp_filter - : SWITCHABLE_FILTERS; - - // above - const int above_mv_pred = above_in_image ? is_inter_block(&above_mi->mbmi) - : 0; - const int above_interp = above_in_image && above_mv_pred - ? above_mi->mbmi.interp_filter - : SWITCHABLE_FILTERS; - - if (left_interp == above_interp) - return left_interp; - else if (left_interp == SWITCHABLE_FILTERS && - above_interp != SWITCHABLE_FILTERS) - return above_interp; - else if (left_interp != SWITCHABLE_FILTERS && - above_interp == SWITCHABLE_FILTERS) - return left_interp; + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int left_type = left_mbmi != NULL && is_inter_block(left_mbmi) ? + left_mbmi->interp_filter : SWITCHABLE_FILTERS; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const int above_type = above_mbmi != NULL && is_inter_block(above_mbmi) ? + above_mbmi->interp_filter : SWITCHABLE_FILTERS; + + if (left_type == above_type) + return left_type; + else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS) + return above_type; + else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS) + return left_type; else return SWITCHABLE_FILTERS; } -// Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; - // The mode info data structure has a one element border above and to the - // left of the entries corresponding to real macroblocks. - // The prediction flags in these dummy entries are initialized to 0. - // 0 - inter/inter, inter/--, --/inter, --/-- - // 1 - intra/inter, inter/intra - // 2 - intra/--, --/intra - // 3 - intra/intra - if (above_in_image && left_in_image) // both edges available +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real macroblocks. +// The prediction flags in these dummy entries are initialized to 0. 
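The rewritten switchable-filter context above reduces to a small agreement rule, with SWITCHABLE_FILTERS doubling as the "no usable neighbour" sentinel. A compact equivalent for reference (the enum stub mirrors the three switchable eighttap filters, so SWITCHABLE_FILTERS == 3 as elsewhere in this tree):

enum { SWITCHABLE_FILTERS = 3 };  /* illustrative stub */

static int interp_filter_ctx(int left_type, int above_type) {
  if (left_type == above_type)
    return left_type;                   /* neighbours agree       */
  if (left_type == SWITCHABLE_FILTERS)
    return above_type;                  /* only above is usable   */
  if (above_type == SWITCHABLE_FILTERS)
    return left_type;                   /* only left is usable    */
  return SWITCHABLE_FILTERS;            /* genuine disagreement   */
}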
+// 0 - inter/inter, inter/--, --/inter, --/-- +// 1 - intra/inter, inter/intra +// 2 - intra/--, --/intra +// 3 - intra/intra +int vp9_get_intra_inter_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; + + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); return left_intra && above_intra ? 3 : left_intra || above_intra; - else if (above_in_image || left_in_image) // one edge available - return 2 * (above_in_image ? above_intra : left_intra); - else + } else if (has_above || has_left) { // one edge available + return 2 * !is_inter_block(has_above ? above_mbmi : left_mbmi); + } else { return 0; + } } -// Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { - int pred_context; - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; + +int vp9_get_reference_mode_context(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + int ctx; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. - if (above_in_image && left_in_image) { // both edges available + if (has_above && has_left) { // both edges available if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) // neither edge uses comp pred (0/1) - pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^ - (left_mbmi->ref_frame[0] == cm->comp_fixed_ref); + ctx = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^ + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref); else if (!has_second_ref(above_mbmi)) // one of two edges uses comp pred (2/3) - pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref || - !is_inter_block(above_mbmi)); + ctx = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref || + !is_inter_block(above_mbmi)); else if (!has_second_ref(left_mbmi)) // one of two edges uses comp pred (2/3) - pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref || - !is_inter_block(left_mbmi)); + ctx = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref || + !is_inter_block(left_mbmi)); else // both edges use comp pred (4) - pred_context = 4; - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + ctx = 4; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? 
above_mbmi : left_mbmi; if (!has_second_ref(edge_mbmi)) // edge does not use comp pred (0/1) - pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref; + ctx = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref; else // edge uses comp pred (3) - pred_context = 3; + ctx = 3; } else { // no edges available (1) - pred_context = 1; + ctx = 1; } - assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS); - return pred_context; + assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS); + return ctx; } // Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { +int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int above_in_image = above_mbmi != NULL; + const int left_in_image = left_mbmi != NULL; + // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. @@ -150,6 +126,9 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, const int var_ref_idx = !fix_ref_idx; if (above_in_image && left_in_image) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + if (above_intra && left_intra) { // intra/intra (2) pred_context = 2; } else if (above_intra || left_intra) { // intra/inter @@ -163,10 +142,10 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, } else { // inter/inter const int l_sg = !has_second_ref(left_mbmi); const int a_sg = !has_second_ref(above_mbmi); - MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0] - : above_mbmi->ref_frame[var_ref_idx]; - MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0] - : left_mbmi->ref_frame[var_ref_idx]; + const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0] + : above_mbmi->ref_frame[var_ref_idx]; + const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0] + : left_mbmi->ref_frame[var_ref_idx]; if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) { pred_context = 0; @@ -179,8 +158,8 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, else pred_context = 1; } else if (l_sg || a_sg) { // single/comp - MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; - MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl; + const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; + const MV_REFERENCE_FRAME rfs = a_sg ? 
vrfa : vrfl; if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) pred_context = 1; else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1]) @@ -212,21 +191,21 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, return pred_context; } -unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { + +int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. - if (above_in_image && left_in_image) { // both edges available + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + if (above_intra && left_intra) { // intra/intra pred_context = 2; } else if (above_intra || left_intra) { // intra/inter or inter/intra @@ -237,30 +216,31 @@ unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME || edge_mbmi->ref_frame[1] == LAST_FRAME); } else { // inter/inter - if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) { - pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) + - 2 * (left_mbmi->ref_frame[0] == LAST_FRAME); - } else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) { - pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME || - above_mbmi->ref_frame[1] == LAST_FRAME || - left_mbmi->ref_frame[0] == LAST_FRAME || - left_mbmi->ref_frame[1] == LAST_FRAME); - } else { - const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + const int above_has_second = has_second_ref(above_mbmi); + const int left_has_second = has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; + + if (above_has_second && left_has_second) { + pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME || + left0 == LAST_FRAME || left1 == LAST_FRAME); + } else if (above_has_second || left_has_second) { + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? 
above1 : left1; if (rfs == LAST_FRAME) pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); else - pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME; + pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME); + } else { + pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME); } } - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; if (!is_inter_block(edge_mbmi)) { // intra pred_context = 2; } else { // inter @@ -278,22 +258,21 @@ unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { return pred_context; } -unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { +int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. - if (above_in_image && left_in_image) { // both edges available + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + if (above_intra && left_intra) { // intra/intra pred_context = 2; } else if (above_intra || left_intra) { // intra/inter or inter/intra @@ -308,36 +287,25 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { edge_mbmi->ref_frame[1] == GOLDEN_FRAME); } } else { // inter/inter - if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) { - if (above_mbmi->ref_frame[0] == LAST_FRAME && - left_mbmi->ref_frame[0] == LAST_FRAME) { - pred_context = 3; - } else if (above_mbmi->ref_frame[0] == LAST_FRAME || - left_mbmi->ref_frame[0] == LAST_FRAME) { - const MB_MODE_INFO *edge_mbmi = - above_mbmi->ref_frame[0] == LAST_FRAME ? 
left_mbmi : above_mbmi; - - pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); - } else { - pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) + - 2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME); - } - } else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) { - if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] && - above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1]) - pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME || - above_mbmi->ref_frame[1] == GOLDEN_FRAME || - left_mbmi->ref_frame[0] == GOLDEN_FRAME || - left_mbmi->ref_frame[1] == GOLDEN_FRAME); + const int above_has_second = has_second_ref(above_mbmi); + const int left_has_second = has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; + + if (above_has_second && left_has_second) { + if (above0 == left0 && above1 == left1) + pred_context = 3 * (above0 == GOLDEN_FRAME || + above1 == GOLDEN_FRAME || + left0 == GOLDEN_FRAME || + left1 == GOLDEN_FRAME); else pred_context = 2; - } else { - const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + } else if (above_has_second || left_has_second) { + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; if (rfs == GOLDEN_FRAME) pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); @@ -345,10 +313,21 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME; else pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); + } else { + if (above0 == LAST_FRAME && left0 == LAST_FRAME) { + pred_context = 3; + } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) { + const MV_REFERENCE_FRAME edge0 = (above0 == LAST_FRAME) ? left0 + : above0; + pred_context = 4 * (edge0 == GOLDEN_FRAME); + } else { + pred_context = 2 * (above0 == GOLDEN_FRAME) + + 2 * (left0 == GOLDEN_FRAME); + } } } - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; if (!is_inter_block(edge_mbmi) || (edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi))) @@ -368,43 +347,30 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { // The mode info data structure has a one element border above and to the // left of the entries corresponding to real blocks. // The prediction flags in these dummy entries are initialized to 0. 
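The single-reference context functions above and the tx-size context rewritten below share one shape: fetch the above and left MB_MODE_INFO (NULL once off-frame), substitute a neutral value for a missing neighbour, and fold the two observations into a small integer index into a probability table. A minimal sketch of that shape, with a hypothetical helper name standing in for the real accessors (a distillation, not a libvpx function):

/* Sketch only: a NULL flag pointer models the zero-initialized
 * dummy border entries described in the note above, so a missing
 * neighbor contributes 0 to the context. */
static int two_neighbor_context(const int *above_flag,
                                const int *left_flag) {
  const int above = above_flag ? *above_flag : 0;
  const int left = left_flag ? *left_flag : 0;
  return above + left;  /* a context in {0, 1, 2} */
}

vp9_get_skip_context() in the header diff further down is exactly this instance; the reference-frame variants differ only in how much structure each per-neighbour observation carries.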
-unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int max_tx_size = max_txsize_lookup[xd->mi_8x8[0]->mbmi.sb_type]; - int above_context = max_tx_size; - int left_context = max_tx_size; - - if (above_in_image) - above_context = above_mbmi->skip_coeff ? max_tx_size - : above_mbmi->tx_size; - - if (left_in_image) - left_context = left_mbmi->skip_coeff ? max_tx_size - : left_mbmi->tx_size; - - if (!left_in_image) - left_context = above_context; - - if (!above_in_image) - above_context = left_context; - - return above_context + left_context > max_tx_size; -} - -void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) { - xd->mi_8x8[0]->mbmi.seg_id_predicted = pred_flag; +int vp9_get_tx_size_context(const MACROBLOCKD *xd) { + const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type]; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; + int above_ctx = (has_above && !above_mbmi->skip) ? above_mbmi->tx_size + : max_tx_size; + int left_ctx = (has_left && !left_mbmi->skip) ? left_mbmi->tx_size + : max_tx_size; + if (!has_left) + left_ctx = above_ctx; + + if (!has_above) + above_ctx = left_ctx; + + return (above_ctx + left_ctx) > max_tx_size; } int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int mi_offset = mi_row * cm->mi_cols + mi_col; - const int bw = 1 << mi_width_log2(bsize); - const int bh = 1 << mi_height_log2(bsize); + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; const int xmis = MIN(cm->mi_cols - mi_col, bw); const int ymis = MIN(cm->mi_rows - mi_row, bh); int x, y, segment_id = INT_MAX; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h index 19032bf628f..1a7ba86e47c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h @@ -14,12 +14,16 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + static INLINE const MODE_INFO *get_above_mi(const MACROBLOCKD *const xd) { - return xd->up_available ? xd->mi_8x8[-xd->mode_info_stride] : NULL; + return xd->up_available ? xd->mi[-xd->mi_stride] : NULL; } static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) { - return xd->left_available ? xd->mi_8x8[-1] : NULL; + return xd->left_available ? 
xd->mi[-1] : NULL; } int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, @@ -35,55 +39,42 @@ static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { return above_sip + left_sip; } -static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg, +static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg, const MACROBLOCKD *xd) { return seg->pred_probs[vp9_get_pred_context_seg_id(xd)]; } -void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag); - -static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) { +static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) { const MODE_INFO *const above_mi = get_above_mi(xd); const MODE_INFO *const left_mi = get_left_mi(xd); - const int above_skip_coeff = (above_mi != NULL) ? - above_mi->mbmi.skip_coeff : 0; - const int left_skip_coeff = (left_mi != NULL) ? left_mi->mbmi.skip_coeff : 0; - - return above_skip_coeff + left_skip_coeff; -} - -static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { - return cm->fc.mbskip_probs[vp9_get_pred_context_mbskip(xd)]; + const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0; + const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip : 0; + return above_skip + left_skip; } -static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) { - return xd->mi_8x8[0]->mbmi.skip_coeff; +static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc.skip_probs[vp9_get_skip_context(xd)]; } -unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd); +int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd); -unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd); +int vp9_get_intra_inter_context(const MACROBLOCKD *xd); -static INLINE vp9_prob vp9_get_pred_prob_intra_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { - const int pred_context = vp9_get_pred_context_intra_inter(xd); - return cm->fc.intra_inter_prob[pred_context]; +static INLINE vp9_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc.intra_inter_prob[vp9_get_intra_inter_context(xd)]; } -unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd); - +int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd); -static INLINE -vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { - const int pred_context = vp9_get_pred_context_comp_inter_inter(cm, xd); - return cm->fc.comp_inter_prob[pred_context]; +static INLINE vp9_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc.comp_inter_prob[vp9_get_reference_mode_context(cm, xd)]; } -unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, - const MACROBLOCKD *xd); +int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm, const MACROBLOCKD *xd) { @@ -91,50 +82,60 @@ static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm, return cm->fc.comp_ref_prob[pred_context]; } -unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); +int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm, const MACROBLOCKD *xd) { - const int pred_context = vp9_get_pred_context_single_ref_p1(xd); - 
return cm->fc.single_ref_prob[pred_context][0]; + return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0]; } -unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); +int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, const MACROBLOCKD *xd) { - const int pred_context = vp9_get_pred_context_single_ref_p2(xd); - return cm->fc.single_ref_prob[pred_context][1]; + return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1]; } -unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd); - -static const vp9_prob *get_tx_probs(BLOCK_SIZE bsize, uint8_t context, - const struct tx_probs *tx_probs) { - if (bsize < BLOCK_16X16) - return tx_probs->p8x8[context]; - else if (bsize < BLOCK_32X32) - return tx_probs->p16x16[context]; - else - return tx_probs->p32x32[context]; +int vp9_get_tx_size_context(const MACROBLOCKD *xd); + +static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, + const struct tx_probs *tx_probs) { + switch (max_tx_size) { + case TX_8X8: + return tx_probs->p8x8[ctx]; + case TX_16X16: + return tx_probs->p16x16[ctx]; + case TX_32X32: + return tx_probs->p32x32[ctx]; + default: + assert(0 && "Invalid max_tx_size."); + return NULL; + } } -static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd, - const struct tx_probs *tx_probs, - const MODE_INFO *m) { - const BLOCK_SIZE bsize = m->mbmi.sb_type; - const int context = vp9_get_pred_context_tx_size(xd); - return get_tx_probs(bsize, context, tx_probs); +static INLINE const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, + const MACROBLOCKD *xd, + const struct tx_probs *tx_probs) { + return get_tx_probs(max_tx_size, vp9_get_tx_size_context(xd), tx_probs); } -static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context, - struct tx_counts *tx_counts) { - if (bsize < BLOCK_16X16) - return tx_counts->p8x8[context]; - else if (bsize < BLOCK_32X32) - return tx_counts->p16x16[context]; - else - return tx_counts->p32x32[context]; +static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, + struct tx_counts *tx_counts) { + switch (max_tx_size) { + case TX_8X8: + return tx_counts->p8x8[ctx]; + case TX_16X16: + return tx_counts->p16x16[ctx]; + case TX_32X32: + return tx_counts->p32x32[ctx]; + default: + assert(0 && "Invalid max_tx_size."); + return NULL; + } } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c new file mode 100644 index 00000000000..a1befc63e88 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vp9/common/vp9_prob.h" + +const uint8_t vp9_norm[256] = { + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + + +static unsigned int tree_merge_probs_impl(unsigned int i, + const vp9_tree_index *tree, + const vp9_prob *pre_probs, + const unsigned int *counts, + unsigned int count_sat, + unsigned int max_update, + vp9_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = (l <= 0) + ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, + count_sat, max_update, probs); + const int r = tree[i + 1]; + const unsigned int right_count = (r <= 0) + ? counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, + count_sat, max_update, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, + count_sat, max_update); + return left_count + right_count; +} + +void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat, + max_update_factor, probs); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_treecoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.h index 4ba171f4664..f3614803505 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_treecoder.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,15 +8,24 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_TREECODER_H_ -#define VP9_COMMON_VP9_TREECODER_H_ +#ifndef VP9_COMMON_VP9_PROB_H_ +#define VP9_COMMON_VP9_PROB_H_ #include "./vpx_config.h" + +#include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" + #include "vp9/common/vp9_common.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef uint8_t vp9_prob; +#define MAX_PROB 255 + #define vp9_prob_half ((vp9_prob) 128) typedef int8_t vp9_tree_index; @@ -34,33 +43,12 @@ typedef int8_t vp9_tree_index; typedef const vp9_tree_index vp9_tree[]; -struct vp9_token { - int value; - int len; -}; - -/* Construct encoding array from tree. */ - -void vp9_tokens_from_tree(struct vp9_token*, vp9_tree); -void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset); - -/* Convert array of token occurrence counts into a table of probabilities - for the associated binary encoding tree. 
Also writes count of branches - taken for each node on the tree; this facilitiates decisions as to - probability updates. */ - -void vp9_tree_probs_from_distribution(vp9_tree tree, - vp9_prob probs[ /* n - 1 */ ], - unsigned int branch_ct[ /* n - 1 */ ][2], - const unsigned int num_events[ /* n */ ], - unsigned int tok0_offset); - static INLINE vp9_prob clip_prob(int p) { return (p > 255) ? 255u : (p < 1) ? 1u : p; } // int64 is not needed for normal frame level calculations. -// However when outputing entropy stats accumulated over many frames +// However when outputting entropy stats accumulated over many frames // or even clips we can overflow int math. #ifdef ENTROPY_STATS static INLINE vp9_prob get_prob(int num, int den) { @@ -76,27 +64,30 @@ static INLINE vp9_prob get_binary_prob(int n0, int n1) { return get_prob(n0, n0 + n1); } -/* this function assumes prob1 and prob2 are already within [1,255] range */ +/* This function assumes prob1 and prob2 are already within [1,255] range. */ static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) { return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); } -static INLINE vp9_prob merge_probs(vp9_prob pre_prob, vp9_prob prob, +static INLINE vp9_prob merge_probs(vp9_prob pre_prob, const unsigned int ct[2], unsigned int count_sat, unsigned int max_update_factor) { + const vp9_prob prob = get_binary_prob(ct[0], ct[1]); const unsigned int count = MIN(ct[0] + ct[1], count_sat); const unsigned int factor = max_update_factor * count / count_sat; return weighted_prob(pre_prob, prob, factor); } -static INLINE vp9_prob merge_probs2(vp9_prob pre_prob, - const unsigned int ct[2], - unsigned int count_sat, - unsigned int max_update_factor) { - return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat, - max_update_factor); -} +void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs); + +DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]); + +#ifdef __cplusplus +} // extern "C" +#endif -#endif // VP9_COMMON_VP9_TREECODER_H_ +#endif // VP9_COMMON_VP9_PROB_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_quant_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_quant_common.c index 6dbdb421623..def12554dfc 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_quant_common.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_quant_common.c @@ -130,12 +130,13 @@ int16_t vp9_ac_quant(int qindex, int delta) { } -int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex) { +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex) { if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q); - return seg->abs_delta == SEGMENT_ABSDATA ? - data : // Abs value - clamp(base_qindex + data, 0, MAXQ); // Delta value + const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ? 
+ data : base_qindex + data; + return clamp(seg_qindex, 0, MAXQ); } else { return base_qindex; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_quant_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_quant_common.h index 83f2fb65530..58110400645 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_quant_common.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_quant_common.h @@ -13,6 +13,10 @@ #include "vp9/common/vp9_blockd.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MINQ 0 #define MAXQ 255 #define QINDEX_RANGE (MAXQ - MINQ + 1) @@ -23,6 +27,11 @@ void vp9_init_quant_tables(); int16_t vp9_dc_quant(int qindex, int delta); int16_t vp9_ac_quant(int qindex, int delta); -int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex); +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_COMMON_VP9_QUANT_COMMON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c index 1c96788dbb0..edc36d7805e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c @@ -20,59 +20,81 @@ #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" -void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATION_TYPE mcomp_filter_type, - VP9_COMMON *cm) { - if (xd->mi_8x8 && xd->mi_8x8[0]) { - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - - set_scale_factors(xd, mbmi->ref_frame[0] - LAST_FRAME, - mbmi->ref_frame[1] - LAST_FRAME, - cm->active_ref_scale); - } else { - set_scale_factors(xd, -1, -1, cm->active_ref_scale); - } +static void build_mc_border(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int x, int y, int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint8_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? -x : 0; + + if (left > b_w) + left = b_w; + + if (x + b_w > w) + right = x + b_w - w; + + if (right > b_w) + right = b_w; + + copy = b_w - left - right; + + if (left) + memset(dst, ref_row[0], left); + + if (copy) + memcpy(dst + left, ref_row + x + left, copy); - xd->subpix.filter_x = xd->subpix.filter_y = - vp9_get_filter_kernel(mcomp_filter_type == SWITCHABLE ? 
- EIGHTTAP : mcomp_filter_type); + if (right) + memset(dst + left + copy, ref_row[w - 1], right); - assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); + dst += dst_stride; + ++y; + + if (y > 0 && y < h) + ref_row += src_stride; + } while (--b_h); } static void inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const MV32 *mv, - const struct scale_factors *scale, + const int subpel_x, + const int subpel_y, + const struct scale_factors *sf, int w, int h, int ref, - const struct subpix_fn_table *subpix, + const InterpKernel *kernel, int xs, int ys) { - const int subpel_x = mv->col & SUBPEL_MASK; - const int subpel_y = mv->row & SUBPEL_MASK; - - src += (mv->row >> SUBPEL_BITS) * src_stride + (mv->col >> SUBPEL_BITS); - scale->sfc->predict[subpel_x != 0][subpel_y != 0][ref]( + sf->predict[subpel_x != 0][subpel_y != 0][ref]( src, src_stride, dst, dst_stride, - subpix->filter_x[subpel_x], xs, - subpix->filter_y[subpel_y], ys, - w, h); + kernel[subpel_x], xs, kernel[subpel_y], ys, w, h); } void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *src_mv, - const struct scale_factors *scale, + const struct scale_factors *sf, int w, int h, int ref, - const struct subpix_fn_table *subpix, - enum mv_precision precision) { + const InterpKernel *kernel, + enum mv_precision precision, + int x, int y) { const int is_q4 = precision == MV_PRECISION_Q4; const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, is_q4 ? src_mv->col : src_mv->col * 2 }; - const struct scale_factors_common *sfc = scale->sfc; - const MV32 mv = sfc->scale_mv(&mv_q4, scale); + MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf); + const int subpel_x = mv.col & SUBPEL_MASK; + const int subpel_y = mv.row & SUBPEL_MASK; + + src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS); - inter_predictor(src, src_stride, dst, dst_stride, &mv, scale, - w, h, ref, subpix, sfc->x_step_q4, sfc->y_step_q4); + inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, + sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4); } static INLINE int round_mv_comp_q4(int value) { @@ -117,33 +139,18 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, return clamped_mv; } -struct build_inter_predictors_args { - MACROBLOCKD *xd; - int x, y; -}; - -static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, - int pred_w, int pred_h, - void *argv) { - const struct build_inter_predictors_args* const arg = argv; - MACROBLOCKD *const xd = arg->xd; +static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, + int bw, int bh, + int x, int y, int w, int h, + int mi_x, int mi_y) { struct macroblockd_plane *const pd = &xd->plane[plane]; - const int bwl = b_width_log2(bsize) - pd->subsampling_x; - const int bw = 4 << bwl; - const int bh = plane_block_height(bsize, pd); - const int x = 4 * (block & ((1 << bwl) - 1)); - const int y = 4 * (block >> bwl); - const MODE_INFO *mi = xd->mi_8x8[0]; + const MODE_INFO *mi = xd->mi[0]; const int is_compound = has_second_ref(&mi->mbmi); + const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter); int ref; - assert(x < bw); - assert(y < bh); - assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw); - assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh); - for (ref = 0; ref < 1 + is_compound; ++ref) { - struct scale_factors *const scale = &xd->scale_factor[ref]; + const struct scale_factors *const sf = &xd->block_refs[ref]->sf; struct buf_2d *const pre_buf = 
&pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; @@ -168,49 +175,26 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, uint8_t *pre; MV32 scaled_mv; - int xs, ys; - - if (vp9_is_scaled(scale->sfc)) { - pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale); - scale->sfc->set_scaled_offsets(scale, arg->y + y, arg->x + x); - scaled_mv = scale->sfc->scale_mv(&mv_q4, scale); - xs = scale->sfc->x_step_q4; - ys = scale->sfc->y_step_q4; + int xs, ys, subpel_x, subpel_y; + + if (vp9_is_scaled(sf)) { + pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf); + scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); + xs = sf->x_step_q4; + ys = sf->y_step_q4; } else { pre = pre_buf->buf + (y * pre_buf->stride + x); scaled_mv.row = mv_q4.row; scaled_mv.col = mv_q4.col; xs = ys = 16; } + subpel_x = scaled_mv.col & SUBPEL_MASK; + subpel_y = scaled_mv.row & SUBPEL_MASK; + pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride + + (scaled_mv.col >> SUBPEL_BITS); inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, - &scaled_mv, scale, - 4 << pred_w, 4 << pred_h, ref, - &xd->subpix, xs, ys); - } -} - -// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could -// calculate the subsampled BLOCK_SIZE, but that type isn't defined for -// sizes smaller than 16x16 yet. -typedef void (*foreach_predicted_block_visitor)(int plane, int block, - BLOCK_SIZE bsize, - int pred_w, int pred_h, - void *arg); -static INLINE void foreach_predicted_block_in_plane( - const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane, - foreach_predicted_block_visitor visit, void *arg) { - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; - - if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) { - int i = 0, x, y; - assert(bsize == BLOCK_8X8); - for (y = 0; y < 1 << bhl; ++y) - for (x = 0; x < 1 << bwl; ++x) - visit(plane, i++, bsize, 0, 0, arg); - } else { - visit(plane, 0, bsize, bwl, bhl, arg); + subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys); } } @@ -218,12 +202,27 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize, int mi_row, int mi_col, int plane_from, int plane_to) { int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; for (plane = plane_from; plane <= plane_to; ++plane) { - struct build_inter_predictors_args args = { - xd, mi_col * MI_SIZE, mi_row * MI_SIZE, - }; - foreach_predicted_block_in_plane(xd, bsize, plane, build_inter_predictors, - &args); + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, + &xd->plane[plane]); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + + if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) { + int i = 0, x, y; + assert(bsize == BLOCK_8X8); + for (y = 0; y < num_4x4_h; ++y) + for (x = 0; x < num_4x4_w; ++x) + build_inter_predictors(xd, plane, i++, bw, bh, + 4 * x, 4 * y, 4, 4, mi_x, mi_y); + } else { + build_inter_predictors(xd, plane, 0, bw, bh, + 0, 0, bw, bh, mi_x, mi_y); + } } } @@ -242,22 +241,205 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, MAX_MB_PLANE - 1); } -// TODO(dkovalev: find better place for this function) -void vp9_setup_scale_factors(VP9_COMMON *cm, int i) { - const int ref = cm->active_ref_idx[i]; - 
struct scale_factors *const sf = &cm->active_ref_scale[i]; - struct scale_factors_common *const sfc = &cm->active_ref_scale_comm[i]; - if (ref >= NUM_YV12_BUFFERS) { - vp9_zero(*sf); - vp9_zero(*sfc); - } else { - YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref]; - vp9_setup_scale_factors_for_frame(sf, sfc, - fb->y_crop_width, fb->y_crop_height, - cm->width, cm->height); - - if (vp9_is_scaled(sfc)) - vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y); +// TODO(jingning): This function serves as a placeholder for decoder prediction +// using on demand border extension. It should be moved to /decoder/ directory. +static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, + int bw, int bh, + int x, int y, int w, int h, + int mi_x, int mi_y) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const MODE_INFO *mi = xd->mi[0]; + const int is_compound = has_second_ref(&mi->mbmi); + const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter); + int ref; + + for (ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = &xd->block_refs[ref]->sf; + struct buf_2d *const pre_buf = &pd->pre[ref]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + + // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the + // same MV (the average of the 4 luma MVs) but we could do something + // smarter for non-4:2:0. Just punt for now, pending the changes to get + // rid of SPLITMV mode entirely. + const MV mv = mi->mbmi.sb_type < BLOCK_8X8 + ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv + : mi_mv_pred_q4(mi, ref)) + : mi->mbmi.mv[ref].as_mv; + + // TODO(jkoleszar): This clamping is done in the incorrect place for the + // scaling case. It needs to be done on the scaled MV, not the pre-scaling + // MV. Note however that it performs the subsampling aware scaling so + // that the result is always q4. + // mv_precision precision is MV_PRECISION_Q4. + const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, + pd->subsampling_x, + pd->subsampling_y); + + MV32 scaled_mv; + int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride, + subpel_x, subpel_y; + uint8_t *ref_frame, *buf_ptr; + const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; + + // Get reference frame pointer, width and height. + if (plane == 0) { + frame_width = ref_buf->y_crop_width; + frame_height = ref_buf->y_crop_height; + ref_frame = ref_buf->y_buffer; + } else { + frame_width = ref_buf->uv_crop_width; + frame_height = ref_buf->uv_crop_height; + ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer; + } + + if (vp9_is_scaled(sf)) { + // Co-ordinate of containing block to pixel precision. + int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); + int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + + // Co-ordinate of the block to 1/16th pixel precision. + x0_16 = (x_start + x) << SUBPEL_BITS; + y0_16 = (y_start + y) << SUBPEL_BITS; + + // Co-ordinate of current block in reference frame + // to 1/16th pixel precision. + x0_16 = sf->scale_value_x(x0_16, sf); + y0_16 = sf->scale_value_y(y0_16, sf); + + // Map the top left corner of the block into the reference frame. + x0 = sf->scale_value_x(x_start + x, sf); + y0 = sf->scale_value_y(y_start + y, sf); + + // Scale the MV and incorporate the sub-pixel offset of the block + // in the reference frame. 
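/* vp9_scale_mv() scales the q4 vector by the reference-to-current
 * frame size ratio and folds in the fractional offset produced by
 * scaling the block position itself; this is why the pixel position
 * (mi_x + x, mi_y + y) is passed along. xs/ys become the per-output-
 * pixel step in 1/16-pel units, where 16 means unscaled. The q4
 * value then splits, a few lines below, into whole pixels
 * (>> SUBPEL_BITS) and a 1/16-pel phase (& SUBPEL_MASK): e.g. a
 * component of 37 yields 2 whole pixels and phase 5. */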
+ scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); + xs = sf->x_step_q4; + ys = sf->y_step_q4; + } else { + // Co-ordinate of containing block to pixel precision. + x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + // Co-ordinate of the block to 1/16th pixel precision. + x0_16 = x0 << SUBPEL_BITS; + y0_16 = y0 << SUBPEL_BITS; + + scaled_mv.row = mv_q4.row; + scaled_mv.col = mv_q4.col; + xs = ys = 16; + } + subpel_x = scaled_mv.col & SUBPEL_MASK; + subpel_y = scaled_mv.row & SUBPEL_MASK; + + // Calculate the top left corner of the best matching block in the reference frame. + x0 += scaled_mv.col >> SUBPEL_BITS; + y0 += scaled_mv.row >> SUBPEL_BITS; + x0_16 += scaled_mv.col; + y0_16 += scaled_mv.row; + + // Get reference block pointer. + buf_ptr = ref_frame + y0 * pre_buf->stride + x0; + buf_stride = pre_buf->stride; + + // Do border extension if there is motion or the + // width/height is not a multiple of 8 pixels. + if (scaled_mv.col || scaled_mv.row || + (frame_width & 0x7) || (frame_height & 0x7)) { + // Get reference block bottom right coordinate. + int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; + int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; + int x_pad = 0, y_pad = 0; + + if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) { + x0 -= VP9_INTERP_EXTEND - 1; + x1 += VP9_INTERP_EXTEND; + x_pad = 1; + } + + if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) { + y0 -= VP9_INTERP_EXTEND - 1; + y1 += VP9_INTERP_EXTEND; + y_pad = 1; + } + + // Skip border extension if block is inside the frame. + if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width || + y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { + uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0; + // Extend the border. 
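/* build_mc_border() (defined at the top of this file) copies the
 * (x1 - x0 + 1) x (y1 - y0 + 1) window into the scratch buffer
 * xd->mc_buf, replicating the nearest edge pixel wherever the window
 * falls outside the frame. The x_pad / y_pad offsets applied to
 * buf_ptr afterwards step back over the filter padding added above,
 * VP9_INTERP_EXTEND - 1 rows and columns (hence the literal 3,
 * assuming VP9_INTERP_EXTEND == 4 as in vp9_reconinter.h), so
 * prediction still starts at the block's own top-left sample. Note
 * that the branch condition a few lines up fires when the window
 * leaves the frame, despite its "Skip border extension" wording. */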
+ build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0 + 1, + x0, y0, x1 - x0 + 1, y1 - y0 + 1, frame_width, + frame_height); + buf_stride = x1 - x0 + 1; + buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3; + } + } + + inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); + } +} + +void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, + &xd->plane[plane]); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + + if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) { + int i = 0, x, y; + assert(bsize == BLOCK_8X8); + for (y = 0; y < num_4x4_h; ++y) + for (x = 0; x < num_4x4_w; ++x) + dec_build_inter_predictors(xd, plane, i++, bw, bh, + 4 * x, 4 * y, 4, 4, mi_x, mi_y); + } else { + dec_build_inter_predictors(xd, plane, 0, bw, bh, + 0, 0, bw, bh, mi_x, mi_y); + } } } +void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col) { + uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + int i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *const pd = &planes[i]; + setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL, + pd->subsampling_x, pd->subsampling_y); + } +} + +void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col, + const struct scale_factors *sf) { + if (src != NULL) { + int i; + uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col, + sf, pd->subsampling_x, pd->subsampling_y); + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h index 2c8a6e4d92d..58c596ee87f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h @@ -14,7 +14,10 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_onyxc_int.h" -struct subpix_fn_table; +#ifdef __cplusplus +extern "C" { +#endif + void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); @@ -24,80 +27,46 @@ void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col, void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); -void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATION_TYPE filter, - VP9_COMMON *cm); +void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize); void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *mv_q3, - const struct scale_factors *scale, + const struct scale_factors *sf, int 
w, int h, int do_avg, - const struct subpix_fn_table *subpix, - enum mv_precision precision); - -static int scaled_buffer_offset(int x_offset, int y_offset, int stride, - const struct scale_factors *scale) { - const int x = scale ? scale->sfc->scale_value_x(x_offset, scale->sfc) : - x_offset; - const int y = scale ? scale->sfc->scale_value_y(y_offset, scale->sfc) : - y_offset; + const InterpKernel *kernel, + enum mv_precision precision, + int x, int y); + +static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, + const struct scale_factors *sf) { + const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset; + const int y = sf ? sf->scale_value_y(y_offset, sf) : y_offset; return y * stride + x; } -static void setup_pred_plane(struct buf_2d *dst, - uint8_t *src, int stride, - int mi_row, int mi_col, - const struct scale_factors *scale, - int subsampling_x, int subsampling_y) { +static INLINE void setup_pred_plane(struct buf_2d *dst, + uint8_t *src, int stride, + int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { const int x = (MI_SIZE * mi_col) >> subsampling_x; const int y = (MI_SIZE * mi_row) >> subsampling_y; dst->buf = src + scaled_buffer_offset(x, y, stride, scale); dst->stride = stride; } -// TODO(jkoleszar): audit all uses of this that don't set mb_row, mb_col -static void setup_dst_planes(MACROBLOCKD *xd, - const YV12_BUFFER_CONFIG *src, - int mi_row, int mi_col) { - uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - int i; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - struct macroblockd_plane *pd = &xd->plane[i]; - setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL, - pd->subsampling_x, pd->subsampling_y); - } -} - -static void setup_pre_planes(MACROBLOCKD *xd, int i, - const YV12_BUFFER_CONFIG *src, - int mi_row, int mi_col, - const struct scale_factors *sf) { - if (src) { - int j; - uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - - for (j = 0; j < MAX_MB_PLANE; ++j) { - struct macroblockd_plane *pd = &xd->plane[j]; - setup_pred_plane(&pd->pre[i], buffers[j], strides[j], - mi_row, mi_col, sf, pd->subsampling_x, pd->subsampling_y); - } - } -} +void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col); -static void set_scale_factors(MACROBLOCKD *xd, int ref0, int ref1, - struct scale_factors sf[MAX_REF_FRAMES]) { - xd->scale_factor[0] = sf[ref0 >= 0 ? ref0 : 0]; - xd->scale_factor[1] = sf[ref1 >= 0 ? 
ref1 : 0]; -} +void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf); -void vp9_setup_scale_factors(VP9_COMMON *cm, int i); +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_COMMON_VP9_RECONINTER_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c index bd609dcf0d4..403e105908e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c @@ -18,23 +18,22 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_onyxc_int.h" -const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = { - DCT_DCT, // DC - ADST_DCT, // V - DCT_ADST, // H - DCT_DCT, // D45 - ADST_ADST, // D135 - ADST_DCT, // D117 - DCT_ADST, // D153 - DCT_ADST, // D207 - ADST_DCT, // D63 - ADST_ADST, // TM - DCT_DCT, // NEARESTMV - DCT_DCT, // NEARMV - DCT_DCT, // ZEROMV - DCT_DCT // NEWMV +const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = { + DCT_DCT, // DC + ADST_DCT, // V + DCT_ADST, // H + DCT_DCT, // D45 + ADST_ADST, // D135 + ADST_DCT, // D117 + DCT_ADST, // D153 + DCT_ADST, // D207 + ADST_DCT, // D63 + ADST_ADST, // TM }; +// This serves as a wrapper function, so that all the prediction functions +// can be unified and accessed as a pointer array. Note that the boundary +// above and left are not necessarily used all the time. #define intra_pred_sized(type, size) \ void vp9_##type##_predictor_##size##x##size##_c(uint8_t *dst, \ ptrdiff_t stride, \ @@ -52,7 +51,7 @@ const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = { static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - + (void) above; // first column for (r = 0; r < bs - 1; ++r) dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1); @@ -81,6 +80,7 @@ intra_pred_allsizes(d207) static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; + (void) left; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] + @@ -96,6 +96,7 @@ intra_pred_allsizes(d63) static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; + (void) left; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) dst[c] = r + c + 2 < bs * 2 ? 
ROUND_POWER_OF_TWO(above[r + c] + @@ -188,6 +189,7 @@ intra_pred_allsizes(d153) static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r; + (void) left; for (r = 0; r < bs; r++) { vpx_memcpy(dst, above, bs); @@ -199,6 +201,7 @@ intra_pred_allsizes(v) static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r; + (void) above; for (r = 0; r < bs; r++) { vpx_memset(dst, left[r], bs); @@ -223,6 +226,8 @@ intra_pred_allsizes(tm) static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r; + (void) above; + (void) left; for (r = 0; r < bs; r++) { vpx_memset(dst, 128, bs); @@ -235,6 +240,7 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; + (void) above; for (i = 0; i < bs; i++) sum += left[i]; @@ -250,6 +256,7 @@ intra_pred_allsizes(dc_left) static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; + (void) left; for (i = 0; i < bs; i++) sum += above[i]; @@ -313,17 +320,21 @@ static void init_intra_pred_fn_ptrs(void) { #undef intra_pred_allsizes } -static void build_intra_predictors(const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride, - MB_PREDICTION_MODE mode, TX_SIZE tx_size, +static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, + int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, TX_SIZE tx_size, int up_available, int left_available, - int right_available) { + int right_available, int x, int y, + int plane) { int i; DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64); DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16); uint8_t *above_row = above_data + 16; const uint8_t *const_above_row = above_row; const int bs = 4 << tx_size; + int frame_width, frame_height; + int x0, y0; + const struct macroblockd_plane *const pd = &xd->plane[plane]; // 127 127 127 .. 127 127 127 127 127 127 // 129 A B .. Y Z @@ -334,26 +345,90 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride, once(init_intra_pred_fn_ptrs); + // Get current frame pointer, width and height. + if (plane == 0) { + frame_width = xd->cur_buf->y_width; + frame_height = xd->cur_buf->y_height; + } else { + frame_width = xd->cur_buf->uv_width; + frame_height = xd->cur_buf->uv_height; + } + + // Get block position in current frame. + x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + vpx_memset(left_col, 129, 64); + // left if (left_available) { - for (i = 0; i < bs; i++) - left_col[i] = ref[i * ref_stride - 1]; - } else { - vpx_memset(left_col, 129, bs); + if (xd->mb_to_bottom_edge < 0) { + /* slower path if the block needs border extension */ + if (y0 + bs <= frame_height) { + for (i = 0; i < bs; ++i) + left_col[i] = ref[i * ref_stride - 1]; + } else { + const int extend_bottom = frame_height - y0; + for (i = 0; i < extend_bottom; ++i) + left_col[i] = ref[i * ref_stride - 1]; + for (; i < bs; ++i) + left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1]; + } + } else { + /* faster path if the block does not need extension */ + for (i = 0; i < bs; ++i) + left_col[i] = ref[i * ref_stride - 1]; + } } + // TODO(hkuang) do not extend 2*bs pixels for all modes. 
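/* The 2 * bs extension the TODO refers to exists because the
 * directional modes can read past the block width: d45 and d63, for
 * example, index above[] up to 2 * bs - 1 for a bs x bs block, while
 * modes that need only bs samples still get the replicated tail.
 * The constants in the diagram near the top of this function are the
 * fill values for missing neighbours: a missing above row reads as
 * 127, a missing left column as 129. */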
// above if (up_available) { const uint8_t *above_ref = ref - ref_stride; - if (bs == 4 && right_available && left_available) { - const_above_row = above_ref; - } else { - vpx_memcpy(above_row, above_ref, bs); - if (bs == 4 && right_available) - vpx_memcpy(above_row + bs, above_ref + bs, bs); - else - vpx_memset(above_row + bs, above_row[bs - 1], bs); + if (xd->mb_to_right_edge < 0) { + /* slower path if the block needs border extension */ + if (x0 + 2 * bs <= frame_width) { + if (right_available && bs == 4) { + vpx_memcpy(above_row, above_ref, 2 * bs); + } else { + vpx_memcpy(above_row, above_ref, bs); + vpx_memset(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 + bs <= frame_width) { + const int r = frame_width - x0; + if (right_available && bs == 4) { + vpx_memcpy(above_row, above_ref, r); + vpx_memset(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } else { + vpx_memcpy(above_row, above_ref, bs); + vpx_memset(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 <= frame_width) { + const int r = frame_width - x0; + if (right_available && bs == 4) { + vpx_memcpy(above_row, above_ref, r); + vpx_memset(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } else { + vpx_memcpy(above_row, above_ref, r); + vpx_memset(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } + } above_row[-1] = left_available ? above_ref[-1] : 129; + } else { + /* faster path if the block does not need extension */ + if (bs == 4 && right_available && left_available) { + const_above_row = above_ref; + } else { + vpx_memcpy(above_row, above_ref, bs); + if (bs == 4 && right_available) + vpx_memcpy(above_row + bs, above_ref + bs, bs); + else + vpx_memset(above_row + bs, above_row[bs - 1], bs); + above_row[-1] = left_available ? 
above_ref[-1] : 129; + } } } else { vpx_memset(above_row, 127, bs * 2); @@ -369,17 +444,20 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride, } } -void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, - TX_SIZE tx_size, int mode, - const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride) { +void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, + TX_SIZE tx_size, PREDICTION_MODE mode, + const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride, + int aoff, int loff, int plane) { const int bwl = bwl_in - tx_size; const int wmask = (1 << bwl) - 1; const int have_top = (block_idx >> bwl) || xd->up_available; const int have_left = (block_idx & wmask) || xd->left_available; const int have_right = ((block_idx & wmask) != wmask); + const int x = aoff * 4; + const int y = loff * 4; assert(bwl >= 0); - build_intra_predictors(ref, ref_stride, dst, dst_stride, mode, tx_size, - have_top, have_left, have_right); + build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size, + have_top, have_left, have_right, x, y, plane); } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.h index e9d0dbf04cf..d09d2a129c0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.h @@ -14,8 +14,17 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" -void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, - TX_SIZE tx_size, int mode, - const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride); +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, + TX_SIZE tx_size, PREDICTION_MODE mode, + const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride, + int aoff, int loff, int plane); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl new file mode 100644 index 00000000000..1037bfbc360 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -0,0 +1,780 @@ +sub vp9_common_forward_decls() { +print <<EOF +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_enums.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct vp9_variance_vtable; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; +EOF +} +forward_decls qw/vp9_common_forward_decls/; + +# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly. +if (vpx_config("CONFIG_USE_X86INC") eq "yes") { + $mmx_x86inc = 'mmx'; + $sse_x86inc = 'sse'; + $sse2_x86inc = 'sse2'; + $ssse3_x86inc = 'ssse3'; + $avx_x86inc = 'avx'; + $avx2_x86inc = 'avx2'; +} else { + $mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = + $avx_x86inc = $avx2_x86inc = ''; +} + +# this variable is for functions that are 64 bit only. 
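For readers new to the RTCD scripts: each add_proto/specialize pair in this file expands at build time into per-ISA declarations plus a run-time-dispatched entry point in the generated vp9_rtcd.h. A hand-written approximation of that output for one predictor (illustrative only; the exact text comes from the rtcd generator):

/* Rough shape of the generated header for
 *   add_proto  qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ...";
 *   specialize qw/vp9_v_predictor_4x4 neon_asm/, "$sse_x86inc";
 */
void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride,
                           const uint8_t *above, const uint8_t *left);
void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left);
/* Resolved once in vp9_rtcd(), based on detected CPU features. */
RTCD_EXTERN void (*vp9_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride,
                                        const uint8_t *above,
                                        const uint8_t *left);

The $..._x86inc variables set above simply blank out those specializations when x86inc.asm cannot be used; the preceding comment and the stanza that follows do the same gating for variants that exist only on 64-bit x86.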
+if ($opts{arch} eq "x86_64") { + $mmx_x86_64 = 'mmx'; + $sse2_x86_64 = 'sse2'; + $ssse3_x86_64 = 'ssse3'; + $avx_x86_64 = 'avx'; + $avx2_x86_64 = 'avx2'; +} else { + $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = + $avx_x86_64 = $avx2_x86_64 = ''; +} + +# +# RECON +# +add_proto qw/void vp9_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d45_predictor_4x4/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc"; + +add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_h_predictor_4x4 neon_asm dspr2/, "$ssse3_x86inc"; +$vp9_h_predictor_4x4_neon_asm=vp9_h_predictor_4x4_neon; + +add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d117_predictor_4x4/; + +add_proto qw/void vp9_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d135_predictor_4x4/; + +add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc"; + +add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_v_predictor_4x4 neon_asm/, "$sse_x86inc"; +$vp9_v_predictor_4x4_neon_asm=vp9_v_predictor_4x4_neon; + +add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_tm_predictor_4x4 neon_asm dspr2/, "$sse_x86inc"; +$vp9_tm_predictor_4x4_neon_asm=vp9_tm_predictor_4x4_neon; + +add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc"; + +add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_top_predictor_4x4/; + +add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_left_predictor_4x4/; + +add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_128_predictor_4x4/; + +add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d45_predictor_8x8/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc"; + +add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_h_predictor_8x8 neon_asm dspr2/, "$ssse3_x86inc"; +$vp9_h_predictor_8x8_neon_asm=vp9_h_predictor_8x8_neon; + +add_proto 
qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d117_predictor_8x8/; + +add_proto qw/void vp9_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d135_predictor_8x8/; + +add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc"; + +add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_v_predictor_8x8 neon_asm/, "$sse_x86inc"; +$vp9_v_predictor_8x8_neon_asm=vp9_v_predictor_8x8_neon; + +add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_tm_predictor_8x8 neon_asm dspr2/, "$sse2_x86inc"; +$vp9_tm_predictor_8x8_neon_asm=vp9_tm_predictor_8x8_neon; + +add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc"; + +add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_top_predictor_8x8/; + +add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_left_predictor_8x8/; + +add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_128_predictor_8x8/; + +add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d45_predictor_16x16/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc"; + +add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_h_predictor_16x16 neon_asm dspr2/, "$ssse3_x86inc"; +$vp9_h_predictor_16x16_neon_asm=vp9_h_predictor_16x16_neon; + +add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d117_predictor_16x16/; + +add_proto qw/void vp9_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d135_predictor_16x16/; + +add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc"; + +add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_v_predictor_16x16 neon_asm/, "$sse2_x86inc"; +$vp9_v_predictor_16x16_neon_asm=vp9_v_predictor_16x16_neon; + +add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_tm_predictor_16x16 neon_asm/, "$sse2_x86inc"; +$vp9_tm_predictor_16x16_neon_asm=vp9_tm_predictor_16x16_neon; + +add_proto qw/void 
vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc"; + +add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_top_predictor_16x16/; + +add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_left_predictor_16x16/; + +add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_128_predictor_16x16/; + +add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d45_predictor_32x32/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc"; + +add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_h_predictor_32x32 neon_asm/, "$ssse3_x86inc"; +$vp9_h_predictor_32x32_neon_asm=vp9_h_predictor_32x32_neon; + +add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d117_predictor_32x32/; + +add_proto qw/void vp9_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d135_predictor_32x32/; + +add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d153_predictor_32x32/; + +add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_v_predictor_32x32 neon_asm/, "$sse2_x86inc"; +$vp9_v_predictor_32x32_neon_asm=vp9_v_predictor_32x32_neon; + +add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_tm_predictor_32x32 neon_asm/, "$sse2_x86_64"; +$vp9_tm_predictor_32x32_neon_asm=vp9_tm_predictor_32x32_neon; + +add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc"; + +add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_top_predictor_32x32/; + +add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_left_predictor_32x32/; + +add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_128_predictor_32x32/; + +# +# Loopfilter +# +add_proto qw/void vp9_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/vp9_lpf_vertical_16 sse2 neon_asm dspr2/; +$vp9_lpf_vertical_16_neon_asm=vp9_lpf_vertical_16_neon; + +add_proto qw/void vp9_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const 
uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2/; +$vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon; + +add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2/; +$vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon; + +add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/; +$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon; + +add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +specialize qw/vp9_lpf_vertical_4 mmx neon_asm dspr2/; +$vp9_lpf_vertical_4_neon_asm=vp9_lpf_vertical_4_neon; + +add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/vp9_lpf_vertical_4_dual sse2 neon_asm dspr2/; +$vp9_lpf_vertical_4_dual_neon_asm=vp9_lpf_vertical_4_dual_neon; + +add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2/; +$vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon; + +add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2/; +$vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon; + +add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/; +$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon; + +add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +specialize qw/vp9_lpf_horizontal_4 mmx neon_asm dspr2/; +$vp9_lpf_horizontal_4_neon_asm=vp9_lpf_horizontal_4_neon; + +add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/vp9_lpf_horizontal_4_dual sse2 neon_asm dspr2/; +$vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon; + +# +# post proc +# +if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { +add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit"; +specialize qw/vp9_mbpost_proc_down mmx sse2/; +$vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm; + +add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit"; +specialize qw/vp9_mbpost_proc_across_ip sse2/; +$vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm; + +add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int 
flimit"; +specialize qw/vp9_post_proc_down_and_across mmx sse2/; +$vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm; + +add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"; +specialize qw/vp9_plane_add_noise mmx sse2/; +$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt; +} + +add_proto qw/void vp9_blend_mb_inner/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"; +specialize qw/vp9_blend_mb_inner/; + +add_proto qw/void vp9_blend_mb_outer/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"; +specialize qw/vp9_blend_mb_outer/; + +add_proto qw/void vp9_blend_b/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"; +specialize qw/vp9_blend_b/; + +# +# Sub Pixel Filters +# +add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve_copy neon_asm dspr2/, "$sse2_x86inc"; +$vp9_convolve_copy_neon_asm=vp9_convolve_copy_neon; + +add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc"; +$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon; + +add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve8 sse2 ssse3 avx2 neon_asm dspr2/; +$vp9_convolve8_neon_asm=vp9_convolve8_neon; + +add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve8_horiz sse2 ssse3 avx2 neon_asm dspr2/; +$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon; + +add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve8_vert sse2 ssse3 avx2 neon_asm dspr2/; +$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon; + +add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve8_avg sse2 ssse3 neon_asm dspr2/; +$vp9_convolve8_avg_neon_asm=vp9_convolve8_avg_neon; + +add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon_asm dspr2/; +$vp9_convolve8_avg_horiz_neon_asm=vp9_convolve8_avg_horiz_neon; + +add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon_asm 
dspr2/; +$vp9_convolve8_avg_vert_neon_asm=vp9_convolve8_avg_vert_neon; + +# +# dct +# +add_proto qw/void vp9_idct4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/; +$vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon; + +add_proto qw/void vp9_idct4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/; +$vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon; + +add_proto qw/void vp9_idct8x8_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/; +$vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon; + +add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; +$vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon; + +add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; +$vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon; + +add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/; +$vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon; + +add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct16x16_256_add sse2 neon_asm dspr2/; +$vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon; + +add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct16x16_10_add sse2 neon_asm dspr2/; +$vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon; + +add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/; +$vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon; + +add_proto qw/void vp9_idct32x32_34_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/; +$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon; + +add_proto qw/void vp9_idct32x32_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/; +$vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon; + +add_proto qw/void vp9_iht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"; +specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/; +$vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon; + +add_proto qw/void vp9_iht8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"; +specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/; +$vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon; + +add_proto qw/void vp9_iht16x16_256_add/, "const int16_t *input, uint8_t *output, int pitch, int tx_type"; +specialize qw/vp9_iht16x16_256_add sse2 dspr2/; + +# dct and add + +add_proto qw/void vp9_iwht4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_iwht4x4_1_add/; + +add_proto qw/void vp9_iwht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_iwht4x4_16_add/; + +# +# Encoder functions below this point. 
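Each add_proto/specialize pair above is consumed by libvpx's rtcd generator, which emits a vp9_rtcd.h header exposing one entry point per prototype: a function pointer that starts at the portable _c implementation and is rebound at runtime to the best specialization the CPU supports. Assignments such as $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon simply tell the generator that the neon_asm flavor lives under the plain _neon symbol. Below is a minimal C sketch of the resulting dispatch for vp9_idct32x32_1_add; the HAS_* flag values and the setup function name are illustrative stand-ins (the generated header derives these from the build configuration, and a real build only emits the specializations for its own target architecture):

#include <stdint.h>

/* One symbol per implementation named in the specialize line
   (dspr2 omitted for brevity); "_c" is the portable fallback. */
void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct32x32_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);

/* The public entry point is a function pointer, initially the C fallback. */
void (*vp9_idct32x32_1_add)(const int16_t *input, uint8_t *dest,
                            int dest_stride) = vp9_idct32x32_1_add_c;

#define HAS_SSE2 (1 << 0) /* stand-in values; real flags come from the ports layer */
#define HAS_NEON (1 << 1)

/* Later assignments win, so the best supported specialization ends up bound. */
static void setup_idct_dispatch_sketch(int cpu_flags) {
  vp9_idct32x32_1_add = vp9_idct32x32_1_add_c;
  if (cpu_flags & HAS_SSE2) vp9_idct32x32_1_add = vp9_idct32x32_1_add_sse2;
  if (cpu_flags & HAS_NEON) vp9_idct32x32_1_add = vp9_idct32x32_1_add_neon;
}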
+# +if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { + + +# variance +add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance32x16/, "$sse2_x86inc", "$avx2_x86inc"; + +add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance16x32/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance64x32/, "$sse2_x86inc", "$avx2_x86inc"; + +add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance32x64/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance32x32/, "$sse2_x86inc", "$avx2_x86inc"; + +add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance64x64/, "$sse2_x86inc", "$avx2_x86inc"; + +add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc"; + +add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance8x8 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance8x4/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance4x8/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int 
vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; + +# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form +add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance4x8/, "$sse_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; +#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad64x64/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
max_sad"; +specialize qw/vp9_sad32x64/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad64x32/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad32x16/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad16x32/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad32x32/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad16x16 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad16x8 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad8x8 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad8x4/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad4x8/, "$sse_x86inc"; + +add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad4x4 mmx/, "$sse_x86inc"; + +add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad32x64_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad64x32_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad32x16_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned 
int max_sad"; +specialize qw/vp9_sad32x32_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad16x8_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad8x16_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad8x8_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad8x4_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad4x8_avg/, "$sse_x86inc"; + +add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad4x4_avg/, "$sse_x86inc"; + +add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad64x64x3/; + +add_proto qw/void vp9_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad32x32x3/; + +add_proto qw/void vp9_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad16x16x3 sse3 ssse3/; + +add_proto qw/void vp9_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad16x8x3 sse3 ssse3/; + +add_proto qw/void vp9_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad8x16x3 sse3/; + +add_proto qw/void vp9_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad8x8x3 sse3/; + +add_proto qw/void vp9_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad4x4x3 sse3/; + +add_proto qw/void vp9_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad64x64x8/; + +add_proto qw/void vp9_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad32x32x8/; + +add_proto qw/void vp9_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad16x16x8 sse4/; + +add_proto qw/void vp9_sad16x8x8/, "const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad16x8x8 sse4/; + +add_proto qw/void vp9_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad8x16x8 sse4/; + +add_proto qw/void vp9_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad8x8x8 sse4/; + +add_proto qw/void vp9_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad8x4x8/; + +add_proto qw/void vp9_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad4x8x8/; + +add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad4x4x8 sse4/; + +add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad64x64x4d sse2 avx2/; + +add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad32x64x4d sse2/; + +add_proto qw/void vp9_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad64x32x4d sse2/; + +add_proto qw/void vp9_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad32x16x4d sse2/; + +add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad16x32x4d sse2/; + +add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad32x32x4d sse2 avx2/; + +add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad16x16x4d sse2/; + +add_proto qw/void vp9_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad16x8x4d sse2/; + +add_proto qw/void vp9_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad8x16x4d sse2/; + +add_proto qw/void vp9_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad8x8x4d sse2/; + +# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form +add_proto qw/void vp9_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad8x4x4d sse2/; + +add_proto qw/void vp9_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad4x8x4d sse/; + +add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad4x4x4d sse/; + +add_proto qw/unsigned 
int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; +specialize qw/vp9_mse16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc"; + +add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; +specialize qw/vp9_mse8x16/; + +add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; +specialize qw/vp9_mse16x8/; + +add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; +specialize qw/vp9_mse8x8/; + +add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *"; +specialize qw/vp9_get_mb_ss mmx sse2/; +# ENCODEMB INVOKE + +add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"; +specialize qw/vp9_block_error avx2/, "$sse2_x86inc"; + +add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; +specialize qw/vp9_subtract_block/, "$sse2_x86inc"; + +add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_b/, "$ssse3_x86_64"; + +add_proto qw/void vp9_quantize_b_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64"; + +# +# Structured Similarity (SSIM) +# +if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") { + add_proto qw/void vp9_ssim_parms_8x8/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"; + specialize qw/vp9_ssim_parms_8x8/, "$sse2_x86_64"; + + add_proto qw/void vp9_ssim_parms_16x16/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"; + specialize qw/vp9_ssim_parms_16x16/, "$sse2_x86_64"; +} + +# fdct functions +add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type"; +specialize qw/vp9_fht4x4 sse2 avx2/; + +add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type"; +specialize qw/vp9_fht8x8 sse2 avx2/; + +add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type"; +specialize qw/vp9_fht16x16 sse2 avx2/; + +add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fwht4x4/, "$mmx_x86inc"; + +add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct4x4 sse2 avx2/; + +add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct8x8 sse2 avx2/, 
"$ssse3_x86_64"; + +add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct16x16 sse2 avx2/; + +add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct32x32 sse2 avx2/; + +add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct32x32_rd sse2 avx2/; + +# +# Motion search +# +add_proto qw/int vp9_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv"; +specialize qw/vp9_full_search_sad sse3 sse4_1/; +$vp9_full_search_sad_sse3=vp9_full_search_sadx3; +$vp9_full_search_sad_sse4_1=vp9_full_search_sadx8; + +add_proto qw/int vp9_refining_search_sad/, "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; +specialize qw/vp9_refining_search_sad sse3/; +$vp9_refining_search_sad_sse3=vp9_refining_search_sadx4; + +add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; +specialize qw/vp9_diamond_search_sad sse3/; +$vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4; + +add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; +specialize qw/vp9_full_range_search/; + +add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; +specialize qw/vp9_temporal_filter_apply sse2/; + +} +# end encoder functions +1; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.sh b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.sh deleted file mode 100755 index 5e049c63ce2..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.sh +++ /dev/null @@ -1,744 +0,0 @@ -vp9_common_forward_decls() { -cat <<EOF -/* - * VP9 - */ - -#include "vpx/vpx_integer.h" -#include "vp9/common/vp9_enums.h" - -struct macroblockd; - -/* Encoder forward decls */ -struct macroblock; -struct vp9_variance_vtable; - -#define DEC_MVCOSTS int *mvjcost, int *mvcost[2] -union int_mv; -struct yv12_buffer_config; -EOF -} -forward_decls vp9_common_forward_decls - -# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly. -[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse && - sse2_x86inc=sse2 && ssse3_x86inc=ssse3 && avx_x86inc=avx && avx2_x86inc=avx2 - -# this variable is for functions that are 64 bit only. 
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && - ssse3_x86_64=ssse3 && avx_x86_64=avx && avx2_x86_64=avx2 - -# -# RECON -# -prototype void vp9_d207_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d207_predictor_4x4 $ssse3_x86inc - -prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d45_predictor_4x4 $ssse3_x86inc - -prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_4x4 $ssse3_x86inc - -prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_4x4 $ssse3_x86inc - -prototype void vp9_d117_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d117_predictor_4x4 - -prototype void vp9_d135_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d135_predictor_4x4 - -prototype void vp9_d153_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d153_predictor_4x4 $ssse3_x86inc - -prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_v_predictor_4x4 $sse_x86inc - -prototype void vp9_tm_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_tm_predictor_4x4 $sse_x86inc - -prototype void vp9_dc_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_predictor_4x4 $sse_x86inc - -prototype void vp9_dc_top_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_top_predictor_4x4 - -prototype void vp9_dc_left_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_left_predictor_4x4 - -prototype void vp9_dc_128_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_128_predictor_4x4 - -prototype void vp9_d207_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d207_predictor_8x8 $ssse3_x86inc - -prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d45_predictor_8x8 $ssse3_x86inc - -prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_8x8 $ssse3_x86inc - -prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_8x8 $ssse3_x86inc - -prototype void vp9_d117_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d117_predictor_8x8 - -prototype void vp9_d135_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d135_predictor_8x8 - -prototype void vp9_d153_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d153_predictor_8x8 $ssse3_x86inc - -prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_v_predictor_8x8 $sse_x86inc - -prototype void 
vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_tm_predictor_8x8 $sse2_x86inc - -prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_predictor_8x8 $sse_x86inc - -prototype void vp9_dc_top_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_top_predictor_8x8 - -prototype void vp9_dc_left_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_left_predictor_8x8 - -prototype void vp9_dc_128_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_128_predictor_8x8 - -prototype void vp9_d207_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d207_predictor_16x16 $ssse3_x86inc - -prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d45_predictor_16x16 $ssse3_x86inc - -prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_16x16 $ssse3_x86inc - -prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_16x16 $ssse3_x86inc - -prototype void vp9_d117_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d117_predictor_16x16 - -prototype void vp9_d135_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d135_predictor_16x16 - -prototype void vp9_d153_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d153_predictor_16x16 $ssse3_x86inc - -prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_v_predictor_16x16 $sse2_x86inc - -prototype void vp9_tm_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_tm_predictor_16x16 $sse2_x86inc - -prototype void vp9_dc_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_predictor_16x16 $sse2_x86inc - -prototype void vp9_dc_top_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_top_predictor_16x16 - -prototype void vp9_dc_left_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_left_predictor_16x16 - -prototype void vp9_dc_128_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_128_predictor_16x16 - -prototype void vp9_d207_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d207_predictor_32x32 $ssse3_x86inc - -prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d45_predictor_32x32 $ssse3_x86inc - -prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_32x32 $ssse3_x86inc - -prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const 
uint8_t *left" -specialize vp9_h_predictor_32x32 $ssse3 x86inc - -prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d117_predictor_32x32 - -prototype void vp9_d135_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d135_predictor_32x32 - -prototype void vp9_d153_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d153_predictor_32x32 - -prototype void vp9_v_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_v_predictor_32x32 $sse2_x86inc - -prototype void vp9_tm_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_tm_predictor_32x32 $sse2_x86_64 - -prototype void vp9_dc_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_predictor_32x32 $sse2_x86inc - -prototype void vp9_dc_top_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_top_predictor_32x32 - -prototype void vp9_dc_left_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_left_predictor_32x32 - -prototype void vp9_dc_128_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_128_predictor_32x32 - -# -# Loopfilter -# -prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh" -specialize vp9_mb_lpf_vertical_edge_w sse2 neon - -prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mbloop_filter_vertical_edge sse2 neon - -prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_loop_filter_vertical_edge mmx neon - -prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mb_lpf_horizontal_edge_w sse2 neon - -prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mbloop_filter_horizontal_edge sse2 neon - -prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_loop_filter_horizontal_edge mmx neon - -# -# post proc -# -if [ "$CONFIG_VP9_POSTPROC" = "yes" ]; then -prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit" -specialize vp9_mbpost_proc_down mmx sse2 -vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm - -prototype void vp9_mbpost_proc_across_ip "uint8_t *src, int pitch, int rows, int cols, int flimit" -specialize vp9_mbpost_proc_across_ip sse2 -vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm - -prototype void vp9_post_proc_down_and_across "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit" -specialize vp9_post_proc_down_and_across mmx sse2 -vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm - -prototype void vp9_plane_add_noise 
"uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch" -specialize vp9_plane_add_noise mmx sse2 -vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt -fi - -prototype void vp9_blend_mb_inner "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride" -specialize vp9_blend_mb_inner - -prototype void vp9_blend_mb_outer "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride" -specialize vp9_blend_mb_outer - -prototype void vp9_blend_b "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride" -specialize vp9_blend_b - -# -# Sub Pixel Filters -# -prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_copy $sse2_x86inc neon dspr2 - -prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_avg $sse2_x86inc neon dspr2 - -prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8 sse2 ssse3 neon dspr2 - -prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2 - -prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_vert sse2 ssse3 neon dspr2 - -prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg sse2 ssse3 neon dspr2 - -prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2 - -prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 - -# -# dct -# -prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct4x4_1_add sse2 neon dspr2 - -prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct4x4_16_add sse2 neon dspr2 - -prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_1_add sse2 neon dspr2 - -prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_64_add sse2 neon dspr2 - -prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_10_add sse2 neon dspr2 - -prototype void 
vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_1_add sse2 neon dspr2 - -prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_256_add sse2 neon dspr2 - -prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_10_add sse2 neon dspr2 - -prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_1024_add sse2 neon dspr2 - -prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_34_add sse2 - -prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_1_add sse2 dspr2 - -prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_iht4x4_16_add sse2 neon dspr2 - -prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_iht8x8_64_add sse2 neon dspr2 - -prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type" -specialize vp9_iht16x16_256_add sse2 dspr2 - -# dct and add - -prototype void vp9_iwht4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_iwht4x4_1_add - -prototype void vp9_iwht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_iwht4x4_16_add - -# -# Encoder functions below this point. -# -if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then - - -# variance -prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance32x16 $sse2_x86inc - -prototype unsigned int vp9_variance16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance16x32 $sse2_x86inc - -prototype unsigned int vp9_variance64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance64x32 $sse2_x86inc - -prototype unsigned int vp9_variance32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance32x64 $sse2_x86inc - -prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance32x32 $sse2_x86inc - -prototype unsigned int vp9_variance64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance64x64 $sse2_x86inc - -prototype unsigned int vp9_variance16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance16x16 mmx $sse2_x86inc - -prototype unsigned int vp9_variance16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance16x8 mmx $sse2_x86inc - -prototype unsigned int vp9_variance8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance8x16 mmx $sse2_x86inc - -prototype unsigned int vp9_variance8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance8x8 mmx $sse2_x86inc - 
-prototype void vp9_get_sse_sum_8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum" -specialize vp9_get_sse_sum_8x8 sse2 -vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2 - -prototype unsigned int vp9_variance8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance8x4 $sse2_x86inc - -prototype unsigned int vp9_variance4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance4x8 $sse2_x86inc - -prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance4x4 mmx $sse2_x86inc - -prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x64 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x64 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x32 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x32 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x16 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x16 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x32 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, 
int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x16 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x16 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x16 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x16 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x8 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x8 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x8 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x8 $sse2_x86inc $ssse3_x86inc - -# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form -prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x4 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x4 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x8 $sse_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, 
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x8 $sse_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x4 $sse_x86inc $ssse3_x86inc -#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt - -prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x4 $sse_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad64x64 $sse2_x86inc - -prototype unsigned int vp9_sad32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x64 $sse2_x86inc - -prototype unsigned int vp9_sad64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad64x32 $sse2_x86inc - -prototype unsigned int vp9_sad32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x16 $sse2_x86inc - -prototype unsigned int vp9_sad16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x32 $sse2_x86inc - -prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x32 $sse2_x86inc - -prototype unsigned int vp9_sad16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x16 mmx $sse2_x86inc - -prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x8 mmx $sse2_x86inc - -prototype unsigned int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x16 mmx $sse2_x86inc - -prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x8 mmx $sse2_x86inc - -prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x4 $sse2_x86inc - -prototype unsigned int vp9_sad4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad4x8 $sse_x86inc - -prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad4x4 mmx $sse_x86inc - -prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad64x64_avg $sse2_x86inc - -prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad32x64_avg 
$sse2_x86inc - -prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad64x32_avg $sse2_x86inc - -prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad32x16_avg $sse2_x86inc - -prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad16x32_avg $sse2_x86inc - -prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad32x32_avg $sse2_x86inc - -prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad16x16_avg $sse2_x86inc - -prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad16x8_avg $sse2_x86inc - -prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad8x16_avg $sse2_x86inc - -prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad8x8_avg $sse2_x86inc - -prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad8x4_avg $sse2_x86inc - -prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad4x8_avg $sse_x86inc - -prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad4x4_avg $sse_x86inc - -prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_h $sse2_x86inc - -prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_v $sse2_x86inc - -prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_hv $sse2_x86inc - -prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar64x64_h - -prototype unsigned int vp9_variance_halfpixvar64x64_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar64x64_v - -prototype unsigned int vp9_variance_halfpixvar64x64_hv "const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar64x64_hv - -prototype unsigned int vp9_variance_halfpixvar32x32_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar32x32_h - -prototype unsigned int vp9_variance_halfpixvar32x32_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar32x32_v - -prototype unsigned int vp9_variance_halfpixvar32x32_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar32x32_hv - -prototype void vp9_sad64x64x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad64x64x3 - -prototype void vp9_sad32x32x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad32x32x3 - -prototype void vp9_sad16x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad16x16x3 sse3 ssse3 - -prototype void vp9_sad16x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad16x8x3 sse3 ssse3 - -prototype void vp9_sad8x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x16x3 sse3 - -prototype void vp9_sad8x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x8x3 sse3 - -prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad4x4x3 sse3 - -prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad64x64x8 - -prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad32x32x8 - -prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad16x16x8 sse4 - -prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad16x8x8 sse4 - -prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad8x16x8 sse4 - -prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad8x8x8 sse4 - -prototype void vp9_sad8x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad8x4x8 - -prototype void vp9_sad4x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad4x8x8 - -prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad4x4x8 sse4 - -prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" 
-specialize vp9_sad64x64x4d sse2 - -prototype void vp9_sad32x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad32x64x4d sse2 - -prototype void vp9_sad64x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad64x32x4d sse2 - -prototype void vp9_sad32x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad32x16x4d sse2 - -prototype void vp9_sad16x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad16x32x4d sse2 - -prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad32x32x4d sse2 - -prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad16x16x4d sse2 - -prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad16x8x4d sse2 - -prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x16x4d sse2 - -prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x8x4d sse2 - -# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form -prototype void vp9_sad8x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x4x4d sse2 - -prototype void vp9_sad4x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad4x8x4d sse - -prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad4x4x4d sse - -#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" -#specialize vp9_sub_pixel_mse16x16 sse2 mmx - -prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" -specialize vp9_mse16x16 mmx $sse2_x86inc - -prototype unsigned int vp9_mse8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" -specialize vp9_mse8x16 - -prototype unsigned int vp9_mse16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" -specialize vp9_mse16x8 - -prototype unsigned int vp9_mse8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" -specialize vp9_mse8x8 - -prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_mse64x64 - -prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse" -specialize vp9_sub_pixel_mse32x32 - -prototype unsigned int vp9_get_mb_ss "const int16_t *" -specialize vp9_get_mb_ss mmx sse2 -# ENCODEMB INVOKE - -prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz" -specialize vp9_block_error $sse2_x86inc - -prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride" -specialize vp9_subtract_block $sse2_x86inc - -prototype void vp9_quantize_b "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" -specialize vp9_quantize_b $ssse3_x86_64 - -prototype void vp9_quantize_b_32x32 "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" -specialize vp9_quantize_b_32x32 $ssse3_x86_64 - -# -# Structured Similarity (SSIM) -# -if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then - prototype void vp9_ssim_parms_8x8 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" - specialize vp9_ssim_parms_8x8 $sse2_x86_64 - - prototype void vp9_ssim_parms_16x16 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" - specialize vp9_ssim_parms_16x16 $sse2_x86_64 -fi - -# fdct functions -prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht4x4 sse2 - -prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht8x8 sse2 - -prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht16x16 sse2 - -prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fwht4x4 - -prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct4x4 sse2 - -prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct8x8 sse2 - -prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct16x16 sse2 - -prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct32x32 sse2 - -prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct32x32_rd sse2 - -# -# Motion search -# -prototype int vp9_full_search_sad "struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n" -specialize vp9_full_search_sad sse3 sse4_1 -vp9_full_search_sad_sse3=vp9_full_search_sadx3 -vp9_full_search_sad_sse4_1=vp9_full_search_sadx8 - -prototype int vp9_refining_search_sad "struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, 
DEC_MVCOSTS, union int_mv *center_mv" -specialize vp9_refining_search_sad sse3 -vp9_refining_search_sad_sse3=vp9_refining_search_sadx4 - -prototype int vp9_diamond_search_sad "struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv" -specialize vp9_diamond_search_sad sse3 -vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4 - -prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count" -specialize vp9_temporal_filter_apply sse2 - -prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction" -specialize vp9_yv12_copy_partial_frame - - -fi -# end encoder functions diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_sadmxn.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_sadmxn.h deleted file mode 100644 index b2dfd63f9b4..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_sadmxn.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_COMMON_VP9_SADMXN_H_ -#define VP9_COMMON_VP9_SADMXN_H_ - -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" - -static INLINE unsigned int sad_mx_n_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - int m, - int n) { - int r, c; - unsigned int sad = 0; - - for (r = 0; r < n; r++) { - for (c = 0; c < m; c++) { - sad += abs(src_ptr[c] - ref_ptr[c]); - } - - src_ptr += src_stride; - ref_ptr += ref_stride; - } - - return sad; -} - -#endif // VP9_COMMON_VP9_SADMXN_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scale.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scale.c index 3f0994f80a2..d3405fcdb51 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scale.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scale.c @@ -12,47 +12,19 @@ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_scale.h" -static INLINE int scaled_x(int val, const struct scale_factors_common *sfc) { - return val * sfc->x_scale_fp >> REF_SCALE_SHIFT; +static INLINE int scaled_x(int val, const struct scale_factors *sf) { + return (int)((int64_t)val * sf->x_scale_fp >> REF_SCALE_SHIFT); } -static INLINE int scaled_y(int val, const struct scale_factors_common *sfc) { - return val * sfc->y_scale_fp >> REF_SCALE_SHIFT; +static INLINE int scaled_y(int val, const struct scale_factors *sf) { + return (int)((int64_t)val * sf->y_scale_fp >> REF_SCALE_SHIFT); } -static int unscaled_value(int val, const struct scale_factors_common *sfc) { - (void) sfc; +static int unscaled_value(int val, const struct scale_factors *sf) { + (void) sf; return val; } -static MV32 scaled_mv(const MV *mv, const struct scale_factors *scale) { - const MV32 res = { - scaled_y(mv->row, scale->sfc) + scale->y_offset_q4, - scaled_x(mv->col, scale->sfc) + scale->x_offset_q4 - }; - return res; -} - -static MV32 unscaled_mv(const MV 
*mv, const struct scale_factors *scale) { - const MV32 res = { - mv->row, - mv->col - }; - return res; -} - -static void set_offsets_with_scaling(struct scale_factors *scale, - int row, int col) { - scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK; - scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK; -} - -static void set_offsets_without_scaling(struct scale_factors *scale, - int row, int col) { - scale->x_offset_q4 = 0; - scale->y_offset_q4 = 0; -} - static int get_fixed_point_scale_factor(int other_size, int this_size) { // Calculate scaling factor once for each reference frame // and use fixed point scaling factors in decoding and encoding routines. @@ -69,31 +41,36 @@ static int check_scale_factors(int other_w, int other_h, this_h <= 16 * other_h; } -void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, - struct scale_factors_common *scale_comm, +MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) { + const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK; + const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK; + const MV32 res = { + scaled_y(mv->row, sf) + y_off_q4, + scaled_x(mv->col, sf) + x_off_q4 + }; + return res; +} + +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h) { if (!check_scale_factors(other_w, other_h, this_w, this_h)) { - scale_comm->x_scale_fp = REF_INVALID_SCALE; - scale_comm->y_scale_fp = REF_INVALID_SCALE; + sf->x_scale_fp = REF_INVALID_SCALE; + sf->y_scale_fp = REF_INVALID_SCALE; return; } - scale_comm->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); - scale_comm->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); - scale_comm->x_step_q4 = scaled_x(16, scale_comm); - scale_comm->y_step_q4 = scaled_y(16, scale_comm); + sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); + sf->x_step_q4 = scaled_x(16, sf); + sf->y_step_q4 = scaled_y(16, sf); - if (vp9_is_scaled(scale_comm)) { - scale_comm->scale_value_x = scaled_x; - scale_comm->scale_value_y = scaled_y; - scale_comm->set_scaled_offsets = set_offsets_with_scaling; - scale_comm->scale_mv = scaled_mv; + if (vp9_is_scaled(sf)) { + sf->scale_value_x = scaled_x; + sf->scale_value_y = scaled_y; } else { - scale_comm->scale_value_x = unscaled_value; - scale_comm->scale_value_y = unscaled_value; - scale_comm->set_scaled_offsets = set_offsets_without_scaling; - scale_comm->scale_mv = unscaled_mv; + sf->scale_value_x = unscaled_value; + sf->scale_value_y = unscaled_value; } // TODO(agrange): Investigate the best choice of functions to use here @@ -102,48 +79,44 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, // applied in one direction only, and not at all for 0,0, seems to give the // best quality, but it may be worth trying an additional mode that does // do the filtering on full-pel. - if (scale_comm->x_step_q4 == 16) { - if (scale_comm->y_step_q4 == 16) { + if (sf->x_step_q4 == 16) { + if (sf->y_step_q4 == 16) { // No scaling in either direction. 
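Whether a direction counts as scaled here reduces to the step test above: x_step_q4 = scaled_x(16, sf) pushes a full-pel step of 16 sixteenths through the 14-bit fixed-point factor, so it comes back as exactly 16 only when x_scale_fp equals REF_NO_SCALE (1 << 14). A standalone check, assuming the elided get_fixed_point_scale_factor() body computes (other_size << REF_SCALE_SHIFT) / this_size as in upstream libvpx:

    #include <stdint.h>
    #include <stdio.h>

    #define REF_SCALE_SHIFT 14
    #define REF_NO_SCALE (1 << REF_SCALE_SHIFT)

    /* Elided in the hunk above; assumed to match upstream. */
    static int get_fixed_point_scale_factor(int other_size, int this_size) {
      return (other_size << REF_SCALE_SHIFT) / this_size;
    }

    /* Same widening multiply as the new scaled_x(). */
    static int scaled_x(int val, int x_scale_fp) {
      return (int)(((int64_t)val * x_scale_fp) >> REF_SCALE_SHIFT);
    }

    int main(void) {
      const int same = get_fixed_point_scale_factor(1920, 1920);
      const int half = get_fixed_point_scale_factor(1920, 960);  /* 2x ratio */
      /* step_q4 stays 16 only for REF_NO_SCALE (16384). */
      printf("same-size: fp=%d step_q4=%d\n", same, scaled_x(16, same));
      printf("2x ref:    fp=%d step_q4=%d\n", half, scaled_x(16, half));
      return 0;  /* prints fp=16384 step_q4=16, then fp=32768 step_q4=32 */
    }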
- scale_comm->predict[0][0][0] = vp9_convolve_copy; - scale_comm->predict[0][0][1] = vp9_convolve_avg; - scale_comm->predict[0][1][0] = vp9_convolve8_vert; - scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert; - scale_comm->predict[1][0][0] = vp9_convolve8_horiz; - scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz; + sf->predict[0][0][0] = vp9_convolve_copy; + sf->predict[0][0][1] = vp9_convolve_avg; + sf->predict[0][1][0] = vp9_convolve8_vert; + sf->predict[0][1][1] = vp9_convolve8_avg_vert; + sf->predict[1][0][0] = vp9_convolve8_horiz; + sf->predict[1][0][1] = vp9_convolve8_avg_horiz; } else { // No scaling in x direction. Must always scale in the y direction. - scale_comm->predict[0][0][0] = vp9_convolve8_vert; - scale_comm->predict[0][0][1] = vp9_convolve8_avg_vert; - scale_comm->predict[0][1][0] = vp9_convolve8_vert; - scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert; - scale_comm->predict[1][0][0] = vp9_convolve8; - scale_comm->predict[1][0][1] = vp9_convolve8_avg; + sf->predict[0][0][0] = vp9_convolve8_vert; + sf->predict[0][0][1] = vp9_convolve8_avg_vert; + sf->predict[0][1][0] = vp9_convolve8_vert; + sf->predict[0][1][1] = vp9_convolve8_avg_vert; + sf->predict[1][0][0] = vp9_convolve8; + sf->predict[1][0][1] = vp9_convolve8_avg; } } else { - if (scale_comm->y_step_q4 == 16) { + if (sf->y_step_q4 == 16) { // No scaling in the y direction. Must always scale in the x direction. - scale_comm->predict[0][0][0] = vp9_convolve8_horiz; - scale_comm->predict[0][0][1] = vp9_convolve8_avg_horiz; - scale_comm->predict[0][1][0] = vp9_convolve8; - scale_comm->predict[0][1][1] = vp9_convolve8_avg; - scale_comm->predict[1][0][0] = vp9_convolve8_horiz; - scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz; + sf->predict[0][0][0] = vp9_convolve8_horiz; + sf->predict[0][0][1] = vp9_convolve8_avg_horiz; + sf->predict[0][1][0] = vp9_convolve8; + sf->predict[0][1][1] = vp9_convolve8_avg; + sf->predict[1][0][0] = vp9_convolve8_horiz; + sf->predict[1][0][1] = vp9_convolve8_avg_horiz; } else { // Must always scale in both directions. 
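The new vp9_scale_mv() in the hunk above replaces the old set_scaled_offsets/scale_mv function pointers: it derives the subpel phase of the block origin (x, y) inline and folds it into the scaled motion vector. A worked standalone version follows; SUBPEL_BITS = 4 and SUBPEL_MASK = 15 are assumed from vp9_filter.h, and a single shared scale factor is used for brevity where upstream keeps separate x and y factors:

    #include <stdint.h>
    #include <stdio.h>

    #define REF_SCALE_SHIFT 14
    #define SUBPEL_BITS 4
    #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

    typedef struct { int16_t row, col; } MV;
    typedef struct { int32_t row, col; } MV32;

    static int scaled(int val, int scale_fp) {
      return (int)(((int64_t)val * scale_fp) >> REF_SCALE_SHIFT);
    }

    /* Mirrors vp9_scale_mv(): scale the block origin to get its subpel
       phase, then scale the mv and add the phase in. */
    static MV32 scale_mv(const MV *mv, int x, int y, int scale_fp) {
      const int x_off_q4 = scaled(x << SUBPEL_BITS, scale_fp) & SUBPEL_MASK;
      const int y_off_q4 = scaled(y << SUBPEL_BITS, scale_fp) & SUBPEL_MASK;
      const MV32 res = { scaled(mv->row, scale_fp) + y_off_q4,
                         scaled(mv->col, scale_fp) + x_off_q4 };
      return res;
    }

    int main(void) {
      const MV mv = { 16, 32 };
      /* 1.5x reference ratio: fp = (3 << 14) / 2 = 24576. Block origin
         (x=5, y=7) lands on a half-pel phase after scaling. */
      const MV32 r = scale_mv(&mv, 5, 7, (3 << REF_SCALE_SHIFT) / 2);
      printf("row=%d col=%d\n", r.row, r.col);  /* row=32 col=56 */
      return 0;
    }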
- scale_comm->predict[0][0][0] = vp9_convolve8; - scale_comm->predict[0][0][1] = vp9_convolve8_avg; - scale_comm->predict[0][1][0] = vp9_convolve8; - scale_comm->predict[0][1][1] = vp9_convolve8_avg; - scale_comm->predict[1][0][0] = vp9_convolve8; - scale_comm->predict[1][0][1] = vp9_convolve8_avg; + sf->predict[0][0][0] = vp9_convolve8; + sf->predict[0][0][1] = vp9_convolve8_avg; + sf->predict[0][1][0] = vp9_convolve8; + sf->predict[0][1][1] = vp9_convolve8_avg; + sf->predict[1][0][0] = vp9_convolve8; + sf->predict[1][0][1] = vp9_convolve8_avg; } } // 2D subpel motion always gets filtered in both directions - scale_comm->predict[1][1][0] = vp9_convolve8; - scale_comm->predict[1][1][1] = vp9_convolve8_avg; - - scale->sfc = scale_comm; - scale->x_offset_q4 = 0; // calculated per block - scale->y_offset_q4 = 0; // calculated per block + sf->predict[1][1][0] = vp9_convolve8; + sf->predict[1][1][1] = vp9_convolve8_avg; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scale.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scale.h index 1437fcd9c7e..a9dda1889df 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scale.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scale.h @@ -14,44 +14,44 @@ #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_convolve.h" +#ifdef __cplusplus +extern "C" { +#endif + #define REF_SCALE_SHIFT 14 #define REF_NO_SCALE (1 << REF_SCALE_SHIFT) #define REF_INVALID_SCALE -1 -struct scale_factors; -struct scale_factors_common { +struct scale_factors { int x_scale_fp; // horizontal fixed point scale factor int y_scale_fp; // vertical fixed point scale factor int x_step_q4; int y_step_q4; - int (*scale_value_x)(int val, const struct scale_factors_common *sfc); - int (*scale_value_y)(int val, const struct scale_factors_common *sfc); - void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); - MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale); + int (*scale_value_x)(int val, const struct scale_factors *sf); + int (*scale_value_y)(int val, const struct scale_factors *sf); convolve_fn_t predict[2][2][2]; // horiz, vert, avg }; -struct scale_factors { - int x_offset_q4; - int y_offset_q4; - const struct scale_factors_common *sfc; -}; +MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); -void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, - struct scale_factors_common *scale_comm, +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); -static int vp9_is_valid_scale(const struct scale_factors_common *sfc) { - return sfc->x_scale_fp != REF_INVALID_SCALE && - sfc->y_scale_fp != REF_INVALID_SCALE; +static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) { + return sf->x_scale_fp != REF_INVALID_SCALE && + sf->y_scale_fp != REF_INVALID_SCALE; } -static int vp9_is_scaled(const struct scale_factors_common *sfc) { - return sfc->x_scale_fp != REF_NO_SCALE || - sfc->y_scale_fp != REF_NO_SCALE; +static INLINE int vp9_is_scaled(const struct scale_factors *sf) { + return sf->x_scale_fp != REF_NO_SCALE || + sf->y_scale_fp != REF_NO_SCALE; } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_SCALE_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scan.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scan.c index f17da91104b..1ec5a0cf36c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scan.c +++ 
b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scan.c @@ -12,28 +12,28 @@ #include "vp9/common/vp9_scan.h" -DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = { +DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = { 0, 4, 1, 5, 8, 2, 12, 9, 3, 6, 13, 10, 7, 14, 11, 15, }; -DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = { +DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = { 0, 4, 8, 1, 12, 5, 9, 2, 13, 6, 10, 3, 7, 14, 11, 15, }; -DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = { +DECLARE_ALIGNED(16, static const int16_t, row_scan_4x4[16]) = { 0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15, }; -DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = { +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = { 0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26, 33, 19, 40, 12, 34, 27, 5, 41, @@ -44,7 +44,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = { 46, 39, 61, 54, 47, 62, 55, 63, }; -DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = { +DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = { 0, 8, 16, 1, 24, 9, 32, 17, 2, 40, 25, 10, 33, 18, 48, 3, 26, 41, 11, 56, 19, 34, 4, 49, @@ -55,7 +55,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = { 31, 61, 39, 54, 47, 62, 55, 63, }; -DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = { +DECLARE_ALIGNED(16, static const int16_t, row_scan_8x8[64]) = { 0, 1, 2, 8, 9, 3, 16, 10, 4, 17, 11, 24, 5, 18, 25, 12, 19, 26, 32, 6, 13, 20, 33, 27, @@ -66,7 +66,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = { 60, 39, 61, 47, 54, 55, 62, 63, }; -DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = { +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = { 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80, 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52, 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69, @@ -87,7 +87,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = { 255, }; -DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = { +DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = { 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81, 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4, 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21, @@ -108,7 +108,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = { 255, }; -DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = { +DECLARE_ALIGNED(16, static const int16_t, row_scan_16x16[256]) = { 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20, 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52, 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69, @@ -130,7 +130,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = { 255, }; -DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = { +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = { 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100, @@ -233,38 +233,68 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = { // in {top, left, topleft, topright, bottomleft} order // for each position in raster scan order. // -1 indicates the neighbor does not exist. 
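These neighbor tables feed the entropy coder's context model: get_coef_context() (see the vp9_scan.h hunk further down) averages the token_cache entries of a coefficient's two precomputed neighbors. A toy standalone run with made-up table contents:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_NEIGHBORS 2

    /* Same shape as get_coef_context() in vp9_scan.h: average the cached
       token classes of coefficient c's two stored neighbors. */
    static int get_coef_context(const int16_t *neighbors,
                                const uint8_t *token_cache, int c) {
      return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
              token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
    }

    int main(void) {
      /* Illustrative data only, not a real scan's tables: coefficient 2
         neighbors positions 0 and 1; position 0 lists itself twice, the
         branchless edge-case trick init_scan_neighbors() describes
         further down. */
      const int16_t neighbors[] = { 0, 0,  0, 0,  0, 1 };
      const uint8_t token_cache[3] = { 2, 1, 0 };
      printf("ctx = %d\n", get_coef_context(neighbors, token_cache, 2));
      return 0;  /* (1 + 2 + 1) >> 1 = 2 */
    }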
-DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_4x4[16]); +DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_4x4[16]); +DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_4x4[16]); +DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_8x8[64]); +DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_8x8[64]); +DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_8x8[64]); +DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_16x16[256]); +DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_16x16[256]); +DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_16x16[256]); +DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_32x32[1024]); -DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); -DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); -DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]); -DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]); -DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]); -DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]); -DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]); -DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]); -DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]); -DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]); +const scan_order vp9_default_scan_orders[TX_SIZES] = { + {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors}, + {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors}, + {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors}, + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, +}; + +const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = { + { // TX_4X4 + {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors}, + {row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors}, + {col_scan_4x4, vp9_col_iscan_4x4, col_scan_4x4_neighbors}, + 
{default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors} + }, { // TX_8X8 + {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors}, + {row_scan_8x8, vp9_row_iscan_8x8, row_scan_8x8_neighbors}, + {col_scan_8x8, vp9_col_iscan_8x8, col_scan_8x8_neighbors}, + {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors} + }, { // TX_16X16 + {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors}, + {row_scan_16x16, vp9_row_iscan_16x16, row_scan_16x16_neighbors}, + {col_scan_16x16, vp9_col_iscan_16x16, col_scan_16x16_neighbors}, + {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors} + }, { // TX_32X32 + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, + } +}; static int find_in_scan(const int16_t *scan, int l, int idx) { int n, l2 = l * l; @@ -276,9 +306,9 @@ static int find_in_scan(const int16_t *scan, int l, int idx) { assert(0); return -1; } -static void init_scan_neighbors(const int16_t *scan, - int16_t *iscan, - int l, int16_t *neighbors) { + +static void init_scan_neighbors(const int16_t *scan, int16_t *iscan, int l, + int16_t *neighbors) { int l2 = l * l; int n, i, j; @@ -302,15 +332,15 @@ static void init_scan_neighbors(const int16_t *scan, // use the combination of the two as a context. int a = (i - 1) * l + j; int b = i * l + j - 1; - if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 || - scan == vp9_col_scan_16x16) { + if (scan == col_scan_4x4 || scan == col_scan_8x8 || + scan == col_scan_16x16) { // in the col/row scan cases (as well as left/top edge cases), we set // both contexts to the same value, so we can branchlessly do a+b+1>>1 // which automatically becomes a if a == b neighbors[MAX_NEIGHBORS * n + 0] = neighbors[MAX_NEIGHBORS * n + 1] = a; - } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 || - scan == vp9_row_scan_16x16) { + } else if (scan == row_scan_4x4 || scan == row_scan_8x8 || + scan == row_scan_16x16) { neighbors[MAX_NEIGHBORS * n + 0] = neighbors[MAX_NEIGHBORS * n + 1] = b; } else { @@ -334,24 +364,24 @@ static void init_scan_neighbors(const int16_t *scan, } void vp9_init_neighbors() { - init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4, - vp9_default_scan_4x4_neighbors); - init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4, - vp9_row_scan_4x4_neighbors); - init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4, - vp9_col_scan_4x4_neighbors); - init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8, - vp9_default_scan_8x8_neighbors); - init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8, - vp9_row_scan_8x8_neighbors); - init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8, - vp9_col_scan_8x8_neighbors); - init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16, - vp9_default_scan_16x16_neighbors); - init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16, - vp9_row_scan_16x16_neighbors); - init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16, - vp9_col_scan_16x16_neighbors); - init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32, - vp9_default_scan_32x32_neighbors); + init_scan_neighbors(default_scan_4x4, vp9_default_iscan_4x4, 4, + default_scan_4x4_neighbors); + init_scan_neighbors(row_scan_4x4, 
vp9_row_iscan_4x4, 4, + row_scan_4x4_neighbors); + init_scan_neighbors(col_scan_4x4, vp9_col_iscan_4x4, 4, + col_scan_4x4_neighbors); + init_scan_neighbors(default_scan_8x8, vp9_default_iscan_8x8, 8, + default_scan_8x8_neighbors); + init_scan_neighbors(row_scan_8x8, vp9_row_iscan_8x8, 8, + row_scan_8x8_neighbors); + init_scan_neighbors(col_scan_8x8, vp9_col_iscan_8x8, 8, + col_scan_8x8_neighbors); + init_scan_neighbors(default_scan_16x16, vp9_default_iscan_16x16, 16, + default_scan_16x16_neighbors); + init_scan_neighbors(row_scan_16x16, vp9_row_iscan_16x16, 16, + row_scan_16x16_neighbors); + init_scan_neighbors(col_scan_16x16, vp9_col_iscan_16x16, 16, + col_scan_16x16_neighbors); + init_scan_neighbors(default_scan_32x32, vp9_default_iscan_32x32, 32, + default_scan_32x32_neighbors); } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scan.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scan.h index a5c8463d571..9613b675c2e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scan.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scan.h @@ -15,186 +15,33 @@ #include "vpx_ports/mem.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_blockd.h" -#define MAX_NEIGHBORS 2 - -extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]); - -extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]); - -extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]); - -extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); -extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); -extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]); -extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]); -extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]); -extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]); -extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]); - -extern DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); +#ifdef __cplusplus +extern "C" { 
+#endif +#define MAX_NEIGHBORS 2 void vp9_init_neighbors(); -static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_scan_4x4; - case DCT_ADST: - return vp9_col_scan_4x4; - default: - return vp9_default_scan_4x4; - } -} - -static INLINE void get_scan_nb_4x4(TX_TYPE tx_type, - const int16_t **scan, const int16_t **nb) { - switch (tx_type) { - case ADST_DCT: - *scan = vp9_row_scan_4x4; - *nb = vp9_row_scan_4x4_neighbors; - break; - case DCT_ADST: - *scan = vp9_col_scan_4x4; - *nb = vp9_col_scan_4x4_neighbors; - break; - default: - *scan = vp9_default_scan_4x4; - *nb = vp9_default_scan_4x4_neighbors; - break; - } -} - -static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_iscan_4x4; - case DCT_ADST: - return vp9_col_iscan_4x4; - default: - return vp9_default_iscan_4x4; - } -} - -static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_scan_8x8; - case DCT_ADST: - return vp9_col_scan_8x8; - default: - return vp9_default_scan_8x8; - } -} - -static INLINE void get_scan_nb_8x8(TX_TYPE tx_type, - const int16_t **scan, const int16_t **nb) { - switch (tx_type) { - case ADST_DCT: - *scan = vp9_row_scan_8x8; - *nb = vp9_row_scan_8x8_neighbors; - break; - case DCT_ADST: - *scan = vp9_col_scan_8x8; - *nb = vp9_col_scan_8x8_neighbors; - break; - default: - *scan = vp9_default_scan_8x8; - *nb = vp9_default_scan_8x8_neighbors; - break; - } -} - -static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_iscan_8x8; - case DCT_ADST: - return vp9_col_iscan_8x8; - default: - return vp9_default_iscan_8x8; - } -} - -static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_scan_16x16; - case DCT_ADST: - return vp9_col_scan_16x16; - default: - return vp9_default_scan_16x16; - } -} +typedef struct { + const int16_t *scan; + const int16_t *iscan; + const int16_t *neighbors; +} scan_order; -static INLINE void get_scan_nb_16x16(TX_TYPE tx_type, - const int16_t **scan, const int16_t **nb) { - switch (tx_type) { - case ADST_DCT: - *scan = vp9_row_scan_16x16; - *nb = vp9_row_scan_16x16_neighbors; - break; - case DCT_ADST: - *scan = vp9_col_scan_16x16; - *nb = vp9_col_scan_16x16_neighbors; - break; - default: - *scan = vp9_default_scan_16x16; - *nb = vp9_default_scan_16x16_neighbors; - break; - } -} - -static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_iscan_16x16; - case DCT_ADST: - return vp9_col_iscan_16x16; - default: - return vp9_default_iscan_16x16; - } -} +extern const scan_order vp9_default_scan_orders[TX_SIZES]; +extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES]; static INLINE int get_coef_context(const int16_t *neighbors, - uint8_t *token_cache, - int c) { + const uint8_t *token_cache, int c) { return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] + token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_SCAN_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.c index ef30404b454..910200ecc9c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.c @@ -41,11 +41,6 
@@ void vp9_enable_segfeature(struct segmentation *seg, int segment_id, seg->feature_mask[segment_id] |= 1 << feature_id; } -void vp9_disable_segfeature(struct segmentation *seg, int segment_id, - SEG_LVL_FEATURES feature_id) { - seg->feature_mask[segment_id] &= ~(1 << feature_id); -} - int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) { return seg_feature_data_max[feature_id]; } @@ -54,11 +49,6 @@ int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { return seg_feature_data_signed[feature_id]; } -void vp9_clear_segdata(struct segmentation *seg, int segment_id, - SEG_LVL_FEATURES feature_id) { - seg->feature_data[segment_id][feature_id] = 0; -} - void vp9_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, int seg_data) { assert(seg_data <= seg_feature_data_max[feature_id]); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.h index eb38c06be5a..ff2d66a3658 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.h @@ -11,7 +11,11 @@ #ifndef VP9_COMMON_VP9_SEG_COMMON_H_ #define VP9_COMMON_VP9_SEG_COMMON_H_ -#include "vp9/common/vp9_treecoder.h" +#include "vp9/common/vp9_prob.h" + +#ifdef __cplusplus +extern "C" { +#endif #define SEGMENT_DELTADATA 0 #define SEGMENT_ABSDATA 1 @@ -55,18 +59,10 @@ void vp9_enable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -void vp9_disable_segfeature(struct segmentation *seg, - int segment_id, - SEG_LVL_FEATURES feature_id); - int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id); int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id); -void vp9_clear_segdata(struct segmentation *seg, - int segment_id, - SEG_LVL_FEATURES feature_id); - void vp9_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, @@ -78,5 +74,9 @@ int vp9_get_segdata(const struct segmentation *seg, extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)]; +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_SEG_COMMON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h index 254a431a300..e9711582303 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h @@ -12,8 +12,16 @@ #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ #ifdef _MSC_VER -#include <math.h> -#define snprintf _snprintf +# include <math.h> // the ceil() definition must precede intrin.h +# if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86)) +# include <intrin.h> +# define USE_MSC_INTRIN +# endif +# define snprintf _snprintf +#endif + +#ifdef __cplusplus +extern "C" { #endif #include "./vpx_config.h" @@ -26,7 +34,7 @@ void vpx_reset_mmx_state(void); #if defined(_MSC_VER) && _MSC_VER < 1800 // round is not defined in MSVC before VS2013. -static int round(double x) { +static INLINE int round(double x) { if (x < 0) return (int)ceil(x - 0.5); else @@ -34,7 +42,42 @@ static int round(double x) { } #endif -struct VP9Common; -void vp9_machine_specific_config(struct VP9Common *cm); +// use GNU builtins where available. 
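The hunk continues below with three interchangeable get_msb() bodies (GNU builtin, MSVC intrinsic, portable shift loop); all must return (int)floor(log2(n)) for n > 0. A quick self-contained cross-check of the portable fallback against the GNU builtin, where that builtin is available:

    #include <assert.h>
    #include <stdio.h>

    /* The portable fallback added in this hunk: binary-search the top
       set bit in five shift steps. */
    static int get_msb(unsigned int n) {
      int log = 0;
      unsigned int value = n;
      int i;
      for (i = 4; i >= 0; --i) {
        const int shift = (1 << i);
        const unsigned int x = value >> shift;
        if (x != 0) {
          value = x;
          log += shift;
        }
      }
      return log;
    }

    int main(void) {
      unsigned int n = 0;
      (void)n;
    #if defined(__GNUC__)
      /* Cross-check against the builtin path where it exists. */
      for (n = 1; n < (1u << 20); ++n)
        assert(get_msb(n) == (31 ^ __builtin_clz(n)));
    #endif
      printf("get_msb(1)=%d get_msb(16383)=%d\n", get_msb(1), get_msb(16383));
      return 0;  /* prints 0 and 13 */
    }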
+#if defined(__GNUC__) && \ + ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) +static INLINE int get_msb(unsigned int n) { + return 31 ^ __builtin_clz(n); +} +#elif defined(USE_MSC_INTRIN) +#pragma intrinsic(_BitScanReverse) + +static INLINE int get_msb(unsigned int n) { + unsigned long first_set_bit; + _BitScanReverse(&first_set_bit, n); + return first_set_bit; +} +#undef USE_MSC_INTRIN +#else +// Returns (int)floor(log2(n)). n must be > 0. +static INLINE int get_msb(unsigned int n) { + int log = 0; + unsigned int value = n; + int i; + + for (i = 4; i >= 0; --i) { + const int shift = (1 << i); + const unsigned int x = value >> shift; + if (x != 0) { + value = x; + log += shift; + } + } + return log; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tapify.py b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tapify.py deleted file mode 100644 index 99529cff0c2..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tapify.py +++ /dev/null @@ -1,106 +0,0 @@ -""" - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. -""" -#!/usr/bin/env python -import sys,string,os,re,math,numpy -scale = 2**16 -def dist(p1,p2): - x1,y1 = p1 - x2,y2 = p2 - if x1==x2 and y1==y2 : - return 1.0 - return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2)) - -def gettaps(p): - def l(b): - return int(math.floor(b)) - def h(b): - return int(math.ceil(b)) - def t(b,p,s): - return int((scale*dist(b,p)+s/2)/s) - r,c = p - ul=[l(r),l(c)] - ur=[l(r),h(c)] - ll=[h(r),l(c)] - lr=[h(r),h(c)] - sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p) - t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum); - return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)], - [ll,t(ll,p,sum)],[lr,t4]] - -def print_mb_taps(angle,blocksize): - theta = angle / 57.2957795; - affine = [[math.cos(theta),-math.sin(theta)], - [math.sin(theta),math.cos(theta)]] - radius = (float(blocksize)-1)/2 - print " // angle of",angle,"degrees" - for y in range(blocksize) : - for x in range(blocksize) : - r,c = numpy.dot(affine,[y-radius, x-radius]) - tps = gettaps([r+radius,c+radius]) - for t in tps : - p,t = t - tr,tc = p - print " %2d, %2d, %5d, " % (tr,tc,t,), - print " // %2d,%2d " % (y,x) - -i=float(sys.argv[1]) -while i <= float(sys.argv[2]) : - print_mb_taps(i,float(sys.argv[4])) - i=i+float(sys.argv[3]) -""" - -taps = [] -pt=dict() -ptr=dict() -for y in range(16) : - for x in range(16) : - r,c = numpy.dot(affine,[y-7.5, x-7.5]) - tps = gettaps([r+7.5,c+7.5]) - j=0 - for tp in tps : - p,i = tp - r,c = p - pt[y,x,j]= [p,i] - try: - ptr[r,j,c].append([y,x]) - except: - ptr[r,j,c]=[[y,x]] - j = j+1 - -for key in sorted(pt.keys()) : - print key,pt[key] - -lr = -99 -lj = -99 -lc = 0 - -shuf="" -mask="" -for r,j,c in sorted(ptr.keys()) : - for y,x in ptr[r,j,c] : - if lr != r or lj != j : - print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc - shuf="" - lc = 0 - for i in range(lc,c-1) : - shuf = shuf +"0" - shuf = shuf + hex(x)[2] - lc =c - break - lr = r - lj = j -# print r,j,c,ptr[r,j,c] -# print - -for r,j,c in sorted(ptr.keys()) : - for y,x in ptr[r,j,c] : - print 
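All three get_msb() variants compute floor(log2(n)) for n > 0. __builtin_clz counts leading zeros, and since that count lies in [0, 31], 31 ^ clz(n) equals 31 - clz(n); _BitScanReverse writes the index of the highest set bit directly; the portable fallback binary-searches the bit position with shifts of 16, 8, 4, 2 and 1. A small self-check against a naive reference, assuming vp9_systemdependent.h is on the include path (main() here is only for illustration):

    #include <assert.h>
    #include "vp9/common/vp9_systemdependent.h"

    /* naive reference: floor(log2(n)) by repeated halving */
    static int msb_ref(unsigned int n) {
      int log = 0;
      while (n >>= 1) ++log;
      return log;
    }

    int main(void) {
      unsigned int n;
      for (n = 1; n < (1u << 16); ++n)
        assert(get_msb(n) == msb_ref(n));
      return 0;
    }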
r,j,c,y,x - break -""" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_textblit.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_textblit.h index c968628fe42..158ec1b37ed 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_textblit.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_textblit.h @@ -11,9 +11,17 @@ #ifndef VP9_COMMON_VP9_TEXTBLIT_H_ #define VP9_COMMON_VP9_TEXTBLIT_H_ +#ifdef __cplusplus +extern "C" { +#endif + void vp9_blit_text(const char *msg, unsigned char *address, int pitch); void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, int pitch); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_TEXTBLIT_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c index e3035d076bc..78909dd9bea 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c @@ -15,46 +15,37 @@ #define MIN_TILE_WIDTH_B64 4 #define MAX_TILE_WIDTH_B64 64 -static int to_sbs(n_mis) { - return mi_cols_aligned_to_sb(n_mis) >> MI_BLOCK_SIZE_LOG2; +static int get_tile_offset(int idx, int mis, int log2) { + const int sb_cols = mi_cols_aligned_to_sb(mis) >> MI_BLOCK_SIZE_LOG2; + const int offset = ((idx * sb_cols) >> log2) << MI_BLOCK_SIZE_LOG2; + return MIN(offset, mis); } -static void get_tile_offsets(int *min_tile_off, int *max_tile_off, - int tile_idx, int log2_n_tiles, int n_mis) { - const int n_sbs = to_sbs(n_mis); - const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles; - const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles; - - *min_tile_off = MIN(sb_off1 << 3, n_mis); - *max_tile_off = MIN(sb_off2 << 3, n_mis); -} - -void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, - int row_idx, int col_idx) { - get_tile_offsets(&tile->mi_row_start, &tile->mi_row_end, - row_idx, cm->log2_tile_rows, cm->mi_rows); - get_tile_offsets(&tile->mi_col_start, &tile->mi_col_end, - col_idx, cm->log2_tile_cols, cm->mi_cols); +void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) { + tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows); + tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows); + tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols); + tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols); } void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols) { - const int sb_cols = to_sbs(mi_cols); - int min_log2_n_tiles, max_log2_n_tiles; + const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2; + int min_log2 = 0, max_log2 = 0; - for (max_log2_n_tiles = 0; - (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_B64; - max_log2_n_tiles++) {} - max_log2_n_tiles--; - if (max_log2_n_tiles < 0) - max_log2_n_tiles = 0; + // max + while ((sb_cols >> max_log2) >= MIN_TILE_WIDTH_B64) + ++max_log2; + --max_log2; + if (max_log2 < 0) + max_log2 = 0; - for (min_log2_n_tiles = 0; - (MAX_TILE_WIDTH_B64 << min_log2_n_tiles) < sb_cols; - min_log2_n_tiles++) {} + // min + while ((MAX_TILE_WIDTH_B64 << min_log2) < sb_cols) + ++min_log2; - assert(min_log2_n_tiles <= max_log2_n_tiles); + assert(min_log2 <= max_log2); - *min_log2_tile_cols = min_log2_n_tiles; - *max_log2_tile_cols = max_log2_n_tiles; + *min_log2_tile_cols = min_log2; + *max_log2_tile_cols = max_log2; } diff 
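get_tile_offset() folds the old two-output helper into one call per tile edge: convert the mi dimension to superblock units, take the tile's proportional share with a right shift by the log2 tile count, convert back to mi units, and clamp to the frame edge. Worked through for mi_cols = 100 and log2_tile_cols = 1 (two tile columns): 100 mi align up to 104, giving 13 superblock columns, so tile 1 starts at ((1 * 13) >> 1) << 3 = 48 and tile 2's end clamps from 104 down to 100. A standalone sketch of the same arithmetic, assuming MI_BLOCK_SIZE_LOG2 == 3 as in vp9:

    /* sketch: one tile boundary in mi units */
    #define MI_LOG2 3                      /* 8 mi units per 64-pel superblock */
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static int tile_offset(int idx, int mis, int log2_tiles) {
      const int sb_cols = (mis + (1 << MI_LOG2) - 1) >> MI_LOG2;  /* align up */
      const int offset = ((idx * sb_cols) >> log2_tiles) << MI_LOG2;
      return MIN(offset, mis);
    }
    /* tile_offset(0, 100, 1) == 0, tile_offset(1, 100, 1) == 48,
       tile_offset(2, 100, 1) == 100 */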
--git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.h index a110abbdb9a..a97719e2947 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.h @@ -11,6 +11,10 @@ #ifndef VP9_COMMON_VP9_TILE_COMMON_H_ #define VP9_COMMON_VP9_TILE_COMMON_H_ +#ifdef __cplusplus +extern "C" { +#endif + struct VP9Common; typedef struct TileInfo { @@ -18,12 +22,16 @@ typedef struct TileInfo { int mi_col_start, mi_col_end; } TileInfo; -// initializes 'tile->mi_(row|col)_(start|end)' for (row_idx, col_idx) based on +// initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on // 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)' void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm, - int row_idx, int col_idx); + int row, int col); void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_treecoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_treecoder.c deleted file mode 100644 index da1213d7153..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_treecoder.c +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include <assert.h> - -#include "./vpx_config.h" -#include "vp9/common/vp9_treecoder.h" - -static void tree2tok(struct vp9_token *const p, vp9_tree t, - int i, int v, int l) { - v += v; - ++l; - - do { - const vp9_tree_index j = t[i++]; - - if (j <= 0) { - p[-j].value = v; - p[-j].len = l; - } else { - tree2tok(p, t, j, v, l); - } - } while (++v & 1); -} - -void vp9_tokens_from_tree(struct vp9_token *p, vp9_tree t) { - tree2tok(p, t, 0, 0, 0); -} - -void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t, - int offset) { - tree2tok(p - offset, t, 0, 0, 0); -} - -static unsigned int convert_distribution(unsigned int i, - vp9_tree tree, - vp9_prob probs[], - unsigned int branch_ct[][2], - const unsigned int num_events[], - unsigned int tok0_offset) { - unsigned int left, right; - - if (tree[i] <= 0) { - left = num_events[-tree[i] - tok0_offset]; - } else { - left = convert_distribution(tree[i], tree, probs, branch_ct, - num_events, tok0_offset); - } - if (tree[i + 1] <= 0) - right = num_events[-tree[i + 1] - tok0_offset]; - else - right = convert_distribution(tree[i + 1], tree, probs, branch_ct, - num_events, tok0_offset); - - probs[i>>1] = get_binary_prob(left, right); - branch_ct[i>>1][0] = left; - branch_ct[i>>1][1] = right; - return left + right; -} - -void vp9_tree_probs_from_distribution(vp9_tree tree, vp9_prob probs[/* n-1 */], - unsigned int branch_ct[/* n-1 */][2], - const unsigned int num_events[/* n */], - unsigned int tok0_offset) { - convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset); -} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c index 106e6d426f7..1b4904c3936 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
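The deleted treecoder walked a vp9_tree twice: tree2tok() accumulated each token's bit pattern and length along its path, and convert_distribution() summed event counts bottom-up, writing probs[i >> 1] = get_binary_prob(left, right) at every internal node. That probability is just the left count as a rounded 8-bit fraction of the total, clamped away from 0 and 256 so the arithmetic coder never sees a certain symbol. A scalar sketch, assuming the usual vp9 rounding (the surviving definition lives in vp9_prob.h, which replaces this file) and counts small enough not to overflow the multiply:

    #include <stdint.h>

    typedef uint8_t vp9_prob;

    /* sketch of get_binary_prob(): P(left branch) on a 1..255 scale */
    static vp9_prob binary_prob(unsigned int left, unsigned int right) {
      const unsigned int den = left + right;
      unsigned int p;
      if (den == 0)
        return 128;                            /* no data: assume 50/50 */
      p = (left * 256 + (den >> 1)) / den;     /* rounded ratio */
      return (vp9_prob)(p < 1 ? 1 : p > 255 ? 255 : p);
    }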
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -13,45 +13,205 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vpx_ports/mem.h" -/////////////////////////////////////////////////////////////////////////// -// the mmx function that does the bilinear filtering and var calculation // -// int one pass // -/////////////////////////////////////////////////////////////////////////// -DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { - { 128, 128, 128, 128, 0, 0, 0, 0 }, - { 120, 120, 120, 120, 8, 8, 8, 8 }, - { 112, 112, 112, 112, 16, 16, 16, 16 }, - { 104, 104, 104, 104, 24, 24, 24, 24 }, - { 96, 96, 96, 96, 32, 32, 32, 32 }, - { 88, 88, 88, 88, 40, 40, 40, 40 }, - { 80, 80, 80, 80, 48, 48, 48, 48 }, - { 72, 72, 72, 72, 56, 56, 56, 56 }, - { 64, 64, 64, 64, 64, 64, 64, 64 }, - { 56, 56, 56, 56, 72, 72, 72, 72 }, - { 48, 48, 48, 48, 80, 80, 80, 80 }, - { 40, 40, 40, 40, 88, 88, 88, 88 }, - { 32, 32, 32, 32, 96, 96, 96, 96 }, - { 24, 24, 24, 24, 104, 104, 104, 104 }, - { 16, 16, 16, 16, 112, 112, 112, 112 }, - { 8, 8, 8, 8, 120, 120, 120, 120 } -}; typedef void filter8_1dfunction ( const unsigned char *src_ptr, - const unsigned int src_pitch, + const ptrdiff_t src_pitch, unsigned char *output_ptr, - unsigned int out_pitch, + ptrdiff_t out_pitch, unsigned int output_height, const short *filter ); +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h); \ + } \ +} + +#define FUN_CONV_2D(avg, opt) \ +void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || 
filter_y[3] == 128) { \ + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \ + vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 7); \ + vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \ + vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 1); \ + vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ + } else { \ + vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ + } \ +} +#if HAVE_AVX2 +filter8_1dfunction vp9_filter_block1d16_v8_avx2; +filter8_1dfunction vp9_filter_block1d16_h8_avx2; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 +#else +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 +#endif +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 +#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 +#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 +#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 +#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 +#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 +#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 +// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif #if HAVE_SSSE3 +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; 
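FUN_CONV_1D generates what used to be four hand-written wrappers per instruction set. For the unscaled case (a step of 16 in q4, i.e. one source pixel per output pixel) it peels the width into 16-, 8- and 4-pixel column strips; a nonzero outer tap in filter[0..2] selects the 8-tap kernels, otherwise the cheaper 2-tap bilinear kernels run, and filter[3] == 128 (the identity filter), a scaled step, or leftover width falls through to the C implementation. Hand-expanding one instantiation makes the shape visible; a sketch of FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3), with the 8- and 4-wide loops elided since they mirror the 16-wide one, and the asm kernels being the routines declared above:

    #include <stddef.h>
    #include <stdint.h>

    void sketch_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h) {
      if (x_step_q4 == 16 && filter_x[3] != 128) {
        const int eight_tap = filter_x[0] | filter_x[1] | filter_x[2];
        while (w >= 16) {
          if (eight_tap)
            vp9_filter_block1d16_h8_ssse3(src, src_stride, dst, dst_stride,
                                          h, filter_x);
          else
            vp9_filter_block1d16_h2_ssse3(src, src_stride, dst, dst_stride,
                                          h, filter_x);
          src += 16;
          dst += 16;
          w -= 16;
        }
        /* ...8-wide and 4-wide strips handled the same way... */
      }
      if (w)   /* leftover columns, scaled step, or identity filter */
        vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    }

The real macro hoists the tap test outside the loops and emits this dispatch once per (direction, averaging, ISA) combination.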
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 +#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 +#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 +#else filter8_1dfunction vp9_filter_block1d16_v8_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#endif filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; @@ -59,201 +219,57 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; -void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Ensure the filter can be compressed to int16_t. */ - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_h8_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_avg_ssse3(src, 
src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); - - assert(w <= 64); - assert(h <= 64); - if (x_step_q4 == 16 && y_step_q4 == 16) { - vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h + 7); - vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } else { - vp9_convolve8_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } -} +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; -void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); +// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t 
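Both the removed vp9_convolve8_ssse3 and its FUN_CONV_2D replacement implement the separable 2-D filter the same way: a 64-wide intermediate buffer receives the horizontally filtered block, extended by 7 extra rows because the vertical 8-tap needs 3 rows above and 4 rows below each output row; hence the horizontal pass starts at src - 3 * src_stride and the vertical pass reads from fdata2 + 3 * 64 (the new macro additionally shrinks to a 64 * 65 buffer and h + 1 rows when both filters are 2-tap). A scalar sketch of the same border bookkeeping, assuming unscaled 8-tap filters, FILTER_BITS == 7, and a source with at least 3 rows/columns of margin above-left and 4 below-right:

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t clamp8(int v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* sketch: separable 8-tap 2-D convolution, w <= 64, h <= 64 */
    static void convolve8_2d_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *fx, const int16_t *fy,
                                    int w, int h) {
      uint8_t tmp[64 * 71];            /* 64 output rows + 7 border rows */
      int x, y, k;
      for (y = 0; y < h + 7; ++y)      /* horizontal pass over rows -3..h+3 */
        for (x = 0; x < w; ++x) {
          int sum = 0;
          for (k = 0; k < 8; ++k)
            sum += fx[k] * src[(y - 3) * src_stride + (x - 3 + k)];
          tmp[y * 64 + x] = clamp8((sum + 64) >> 7);
        }
      for (y = 0; y < h; ++y)          /* vertical pass, taps y..y+7 */
        for (x = 0; x < w; ++x) {
          int sum = 0;
          for (k = 0; k < 8; ++k)
            sum += fy[k] * tmp[(y + k) * 64 + x];
          dst[y * dst_stride + x] = clamp8((sum + 64) >> 7);
        }
    }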
*filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + ssse3); - assert(w <= 64); - assert(h <= 64); - if (x_step_q4 == 16 && y_step_q4 == 16) { - vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h + 7); - vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } else { - vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } -} +// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, ssse3); +FUN_CONV_2D(avg_ , ssse3); #endif #if HAVE_SSE2 @@ -270,199 +286,54 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; -void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Ensure the filter can be compressed to int16_t. 
*/ - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_h8_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} +filter8_1dfunction vp9_filter_block1d16_v2_sse2; +filter8_1dfunction vp9_filter_block1d16_h2_sse2; +filter8_1dfunction vp9_filter_block1d8_v2_sse2; +filter8_1dfunction vp9_filter_block1d8_h2_sse2; +filter8_1dfunction vp9_filter_block1d4_v2_sse2; +filter8_1dfunction vp9_filter_block1d4_h2_sse2; +filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; -void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} +// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); -void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - 
vp9_filter_block1d16_h8_avg_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_avg_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_avg_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); - - assert(w <= 64); - assert(h <= 64); - if (x_step_q4 == 16 && y_step_q4 == 16) { - vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h + 7); - vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } else { - vp9_convolve8_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } -} - -void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); - - assert(w <= 64); - assert(h <= 64); - if (x_step_q4 == 16 && y_step_q4 == 16) { - vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h + 7); - vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } else { - vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } -} +// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2); +FUN_CONV_2D(avg_ , sse2); #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_copy_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_copy_sse2.asm index 
dd522c698dc..b26383708f7 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_copy_sse2.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_copy_sse2.asm @@ -133,10 +133,14 @@ INIT_MMX sse movh m3, [srcq+r5q] lea srcq, [srcq+src_strideq*4] %ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+dst_strideq] - pavgb m2, [dstq+dst_strideq*2] - pavgb m3, [dstq+r6q] + movh m4, [dstq] + movh m5, [dstq+dst_strideq] + movh m6, [dstq+dst_strideq*2] + movh m7, [dstq+r6q] + pavgb m0, m4 + pavgb m1, m5 + pavgb m2, m6 + pavgb m3, m7 %endif movh [dstq ], m0 movh [dstq+dst_strideq ], m1 diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index ccf5aac179e..0231726dcff 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -15,6 +15,16 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" +#define RECON_AND_STORE4X4(dest, in_x) \ +{ \ + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + *(int *)dest = _mm_cvtsi128_si32(d0); \ + dest += stride; \ +} + void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); @@ -26,21 +36,19 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { __m128i input0, input1, input2, input3; // Rows - input0 = _mm_loadl_epi64((const __m128i *)input); - input1 = _mm_loadl_epi64((const __m128i *)(input + 4)); - input2 = _mm_loadl_epi64((const __m128i *)(input + 8)); - input3 = _mm_loadl_epi64((const __m128i *)(input + 12)); + input0 = _mm_load_si128((const __m128i *)input); + input2 = _mm_load_si128((const __m128i *)(input + 8)); // Construct i3, i1, i3, i1, i2, i0, i2, i0 input0 = _mm_shufflelo_epi16(input0, 0xd8); - input1 = _mm_shufflelo_epi16(input1, 0xd8); + input0 = _mm_shufflehi_epi16(input0, 0xd8); input2 = _mm_shufflelo_epi16(input2, 0xd8); - input3 = _mm_shufflelo_epi16(input3, 0xd8); + input2 = _mm_shufflehi_epi16(input2, 0xd8); + input1 = _mm_unpackhi_epi32(input0, input0); input0 = _mm_unpacklo_epi32(input0, input0); - input1 = _mm_unpacklo_epi32(input1, input1); + input3 = _mm_unpackhi_epi32(input2, input2); input2 = _mm_unpacklo_epi32(input2, input2); - input3 = _mm_unpacklo_epi32(input3, input3); // Stage 1 input0 = _mm_madd_epi16(input0, cst); @@ -59,16 +67,14 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); // Stage 2 - input0 = _mm_packs_epi32(input0, zero); - input1 = _mm_packs_epi32(input1, zero); - input2 = _mm_packs_epi32(input2, zero); - input3 = _mm_packs_epi32(input3, zero); + input0 = _mm_packs_epi32(input0, input1); + input1 = _mm_packs_epi32(input2, input3); // Transpose - input1 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpacklo_epi16(input2, input3); - input0 = _mm_unpacklo_epi32(input1, input3); - input1 = _mm_unpackhi_epi32(input1, input3); + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); // Switch column2, column 3, and then, we got: // input2: column1, column 0; input3: column2, 
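RECON_AND_STORE4X4, hoisted to the top of the file so both the plain and hybrid 4x4 paths can share it, is the standard reconstruction step for one 4-pixel row: load four destination bytes, widen to 16 bits, add the residual, and pack back down with unsigned saturation; the rewritten 4x4 tails batch all four rows into two SSE2 adds and a single pack. The scalar meaning, as a sketch with the saturation written out:

    #include <stdint.h>

    /* scalar equivalent of RECON_AND_STORE4X4 for one row of 4 pixels */
    static void recon_store_4(uint8_t *dest, const int16_t *residual) {
      int i;
      for (i = 0; i < 4; ++i) {
        const int v = dest[i] + residual[i];               /* unpack + add */
        dest[i] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);  /* packus */
      }
    }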
column 3. @@ -78,14 +84,9 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { // Columns // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input2, 0xd8); - input1 = _mm_shufflehi_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input3, 0xd8); - input3 = _mm_shufflelo_epi16(input3, 0xd8); - - input0 = _mm_unpacklo_epi32(input0, input0); - input1 = _mm_unpackhi_epi32(input1, input1); - input2 = _mm_unpackhi_epi32(input2, input2); + input0 = _mm_unpacklo_epi32(input2, input2); + input1 = _mm_unpackhi_epi32(input2, input2); + input2 = _mm_unpackhi_epi32(input3, input3); input3 = _mm_unpacklo_epi32(input3, input3); // Stage 1 @@ -105,16 +106,14 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); // Stage 2 - input0 = _mm_packs_epi32(input0, zero); - input1 = _mm_packs_epi32(input1, zero); - input2 = _mm_packs_epi32(input2, zero); - input3 = _mm_packs_epi32(input3, zero); + input0 = _mm_packs_epi32(input0, input2); + input1 = _mm_packs_epi32(input1, input3); // Transpose - input1 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpacklo_epi16(input2, input3); - input0 = _mm_unpacklo_epi32(input1, input3); - input1 = _mm_unpackhi_epi32(input1, input3); + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); // Switch column2, column 3, and then, we got: // input2: column1, column 0; input3: column2, column 3. @@ -129,23 +128,31 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { input2 = _mm_srai_epi16(input2, 4); input3 = _mm_srai_epi16(input3, 4); -#define RECON_AND_STORE4X4(dest, in_x) \ - { \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)dest = _mm_cvtsi128_si32(d0); \ - dest += stride; \ + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *) (dest + stride))); + d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128( + *(const int *) (dest + stride * 3)), d2); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, input2); + d2 = _mm_add_epi16(d2, input3); + d0 = _mm_packus_epi16(d0, d2); + // store input0 + *(int *)dest = _mm_cvtsi128_si32(d0); + // store input1 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store input2 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + // store input3 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); } - - input0 = _mm_srli_si128(input2, 8); - input1 = _mm_srli_si128(input3, 8); - - RECON_AND_STORE4X4(dest, input2); - RECON_AND_STORE4X4(dest, input0); - RECON_AND_STORE4X4(dest, input1); - RECON_AND_STORE4X4(dest, input3); } void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { @@ -167,15 +174,13 @@ void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { static INLINE void transpose_4x4(__m128i *res) { const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); - res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); - res[2] = 
_mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - res[1] = _mm_unpackhi_epi64(res[0], res[0]); - res[3] = _mm_unpackhi_epi64(res[2], res[2]); + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); } -static void idct4_1d_sse2(__m128i *in) { +static void idct4_sse2(__m128i *in) { const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); @@ -185,8 +190,8 @@ static void idct4_1d_sse2(__m128i *in) { transpose_4x4(in); // stage 1 - u[0] = _mm_unpacklo_epi16(in[0], in[2]); - u[1] = _mm_unpacklo_epi16(in[1], in[3]); + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); @@ -202,19 +207,16 @@ static void idct4_1d_sse2(__m128i *in) { v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[0] = _mm_packs_epi32(v[0], v[2]); - u[1] = _mm_packs_epi32(v[1], v[3]); - u[2] = _mm_unpackhi_epi64(u[0], u[0]); - u[3] = _mm_unpackhi_epi64(u[1], u[1]); + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[3], v[2]); // stage 2 - in[0] = _mm_add_epi16(u[0], u[3]); - in[1] = _mm_add_epi16(u[1], u[2]); - in[2] = _mm_sub_epi16(u[1], u[2]); - in[3] = _mm_sub_epi16(u[0], u[3]); + in[0] = _mm_add_epi16(u[0], u[1]); + in[1] = _mm_sub_epi16(u[0], u[1]); + in[1] = _mm_shuffle_epi32(in[1], 0x4E); } -static void iadst4_1d_sse2(__m128i *in) { +static void iadst4_sse2(__m128i *in) { const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); @@ -225,13 +227,14 @@ static void iadst4_1d_sse2(__m128i *in) { __m128i u[8], v[8], in7; transpose_4x4(in); - in7 = _mm_add_epi16(in[0], in[3]); - in7 = _mm_sub_epi16(in7, in[2]); + in7 = _mm_srli_si128(in[1], 8); + in7 = _mm_add_epi16(in7, in[0]); + in7 = _mm_sub_epi16(in7, in[1]); - u[0] = _mm_unpacklo_epi16(in[0], in[2]); - u[1] = _mm_unpacklo_epi16(in[1], in[3]); + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpacklo_epi16(in[1], kZero); + u[3] = _mm_unpackhi_epi16(in[0], kZero); v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 @@ -258,39 +261,35 @@ static void iadst4_1d_sse2(__m128i *in) { u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - in[0] = _mm_packs_epi32(u[0], u[2]); - in[1] = _mm_packs_epi32(u[1], u[3]); - in[2] = _mm_unpackhi_epi64(in[0], in[0]); - in[3] = _mm_unpackhi_epi64(in[1], in[1]); + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); } void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, int tx_type) { - __m128i in[4]; + __m128i in[2]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - in[0] = _mm_loadl_epi64((const __m128i *)input); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 4)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 8)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 12)); + in[0]= _mm_loadu_si128((const __m128i *)(input)); + in[1]= 
_mm_loadu_si128((const __m128i *)(input + 8)); switch (tx_type) { case 0: // DCT_DCT - idct4_1d_sse2(in); - idct4_1d_sse2(in); + idct4_sse2(in); + idct4_sse2(in); break; case 1: // ADST_DCT - idct4_1d_sse2(in); - iadst4_1d_sse2(in); + idct4_sse2(in); + iadst4_sse2(in); break; case 2: // DCT_ADST - iadst4_1d_sse2(in); - idct4_1d_sse2(in); + iadst4_sse2(in); + idct4_sse2(in); break; case 3: // ADST_ADST - iadst4_1d_sse2(in); - iadst4_1d_sse2(in); + iadst4_sse2(in); + iadst4_sse2(in); break; default: assert(0); @@ -300,18 +299,35 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, // Final round and shift in[0] = _mm_add_epi16(in[0], eight); in[1] = _mm_add_epi16(in[1], eight); - in[2] = _mm_add_epi16(in[2], eight); - in[3] = _mm_add_epi16(in[3], eight); in[0] = _mm_srai_epi16(in[0], 4); in[1] = _mm_srai_epi16(in[1], 4); - in[2] = _mm_srai_epi16(in[2], 4); - in[3] = _mm_srai_epi16(in[3], 4); - RECON_AND_STORE4X4(dest, in[0]); - RECON_AND_STORE4X4(dest, in[1]); - RECON_AND_STORE4X4(dest, in[2]); - RECON_AND_STORE4X4(dest, in[3]); + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *) (dest + stride))); + d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128( + *(const int *) (dest + stride * 3))); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, in[0]); + d2 = _mm_add_epi16(d2, in[1]); + d0 = _mm_packus_epi16(d0, d2); + // store result[0] + *(int *)dest = _mm_cvtsi128_si32(d0); + // store result[1] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store result[2] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + // store result[3] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + } } #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ @@ -345,37 +361,40 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ } -#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - \ +#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ + out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ + const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ + \ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ + \ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = out5 = out6 = out7 = zero; \ } -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \ +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ { \ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = 
_mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ \ in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - in2 = _mm_unpacklo_epi32(tr0_2, tr0_3); /* i5 i4 */ \ - in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \ + } + +#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ } // Define Macro for multiplying elements by constants and adding them together. @@ -415,7 +434,30 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, res3 = _mm_packs_epi32(tmp6, tmp7); \ } -#define IDCT8_1D \ +#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + } + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ /* Stage1 */ \ { \ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ @@ -475,14 +517,15 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, } \ \ /* Stage4 */ \ - in0 = _mm_adds_epi16(stp1_0, stp2_7); \ - in1 = _mm_adds_epi16(stp1_1, stp1_6); \ - in2 = _mm_adds_epi16(stp1_2, stp1_5); \ - in3 = _mm_adds_epi16(stp1_3, stp2_4); \ - in4 = _mm_subs_epi16(stp1_3, stp2_4); \ - in5 = _mm_subs_epi16(stp1_2, stp1_5); \ - in6 = _mm_subs_epi16(stp1_1, stp1_6); \ - in7 = _mm_subs_epi16(stp1_0, stp2_7); + out0 = _mm_adds_epi16(stp1_0, stp2_7); \ + out1 = _mm_adds_epi16(stp1_1, stp1_6); \ + out2 = _mm_adds_epi16(stp1_2, stp1_5); \ + out3 = _mm_adds_epi16(stp1_3, stp2_4); \ + out4 = _mm_subs_epi16(stp1_3, stp2_4); \ + out5 = _mm_subs_epi16(stp1_2, stp1_5); \ + out6 = _mm_subs_epi16(stp1_1, stp1_6); \ + out7 = _mm_subs_epi16(stp1_0, stp2_7); \ + } #define RECON_AND_STORE(dest, in_x) \ { \ @@ -526,11 +569,12 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { // 2-D for (i = 0; i < 2; i++) { // 8x8 Transpose is copied from vp9_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8_1D + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); } // Final rounding and shift @@ -613,7 +657,24 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); } -static void idct8_1d_sse2(__m128i *in) { +static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = 
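MULTIPLICATION_AND_ADD_2 is the half-width companion of the existing MULTIPLICATION_AND_ADD: both implement the transform rotation y = dct_const_round_shift(a * c0 + b * c1) on interleaved 16-bit pairs, where a single _mm_madd_epi16 performs the two multiplies and the add per 32-bit lane, and DCT_CONST_BITS is 14 (so the rounding constant is 1 << 13). What one lane computes, as a scalar sketch (the SSE2 pack step additionally saturates to int16_t):

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* one lane: rotate the pair (a, b) by the cosine pair (c0, c1),
       then round and narrow like dct_const_round_shift() */
    static int16_t rotate_lane(int16_t a, int16_t b, int16_t c0, int16_t c1) {
      const int32_t sum = a * c0 + b * c1;      /* _mm_madd_epi16 */
      return (int16_t)((sum + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }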
_mm_unpacklo_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); +} + +static void idct8_sse2(__m128i *in) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); @@ -629,32 +690,16 @@ static void idct8_1d_sse2(__m128i *in) { __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - in0 = in[0]; - in1 = in[1]; - in2 = in[2]; - in3 = in[3]; - in4 = in[4]; - in5 = in[5]; - in6 = in[6]; - in7 = in[7]; - // 8x8 Transpose is copied from vp9_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); + TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], + in0, in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8_1D - in[0] = in0; - in[1] = in1; - in[2] = in2; - in[3] = in3; - in[4] = in4; - in[5] = in5; - in[6] = in6; - in[7] = in7; + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); } -static void iadst8_1d_sse2(__m128i *in) { +static void iadst8_sse2(__m128i *in) { const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); @@ -901,20 +946,20 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, switch (tx_type) { case 0: // DCT_DCT - idct8_1d_sse2(in); - idct8_1d_sse2(in); + idct8_sse2(in); + idct8_sse2(in); break; case 1: // ADST_DCT - idct8_1d_sse2(in); - iadst8_1d_sse2(in); + idct8_sse2(in); + iadst8_sse2(in); break; case 2: // DCT_ADST - iadst8_1d_sse2(in); - idct8_1d_sse2(in); + iadst8_sse2(in); + idct8_sse2(in); break; case 3: // ADST_ADST - iadst8_1d_sse2(in); - iadst8_1d_sse2(in); + iadst8_sse2(in); + iadst8_sse2(in); break; default: assert(0); @@ -950,7 +995,7 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest, in[7]); } -void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<4); @@ -976,12 +1021,11 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); // 8x4 Transpose - TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) - + TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); // Stage1 { //NOLINT - const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); + const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); + const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); tmp0 = _mm_madd_epi16(lo_17, stg1_0); tmp2 = _mm_madd_epi16(lo_17, stg1_1); @@ -997,16 +1041,14 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { tmp4 = 
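The 1-D kernels lose their '_1d' suffix here (idct8_sse2, iadst8_sse2, and the 4x4 pair earlier), and vp9_iht8x8_64_add_sse2 picks one of them per pass: DCT_DCT runs the IDCT twice, ADST_ADST the IADST twice, and the mixed types one of each. The switch can equally be read as a two-entry function table per tx_type; a sketch using the values 0..3 in the order the switch enumerates them:

    #include <emmintrin.h>

    typedef void (*transform_1d_sse2)(__m128i *in);

    /* {first pass, second pass} per tx_type, functions from this file */
    static const transform_1d_sse2 iht8[4][2] = {
      { idct8_sse2,  idct8_sse2  },   /* 0: DCT_DCT   */
      { idct8_sse2,  iadst8_sse2 },   /* 1: ADST_DCT  */
      { iadst8_sse2, idct8_sse2  },   /* 2: DCT_ADST  */
      { iadst8_sse2, iadst8_sse2 },   /* 3: ADST_ADST */
    };

    /* usage: iht8[tx_type][0](in); iht8[tx_type][1](in); */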
_mm_srai_epi32(tmp4, DCT_CONST_BITS); tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - stp1_4 = _mm_packs_epi32(tmp0, zero); - stp1_7 = _mm_packs_epi32(tmp2, zero); - stp1_5 = _mm_packs_epi32(tmp4, zero); - stp1_6 = _mm_packs_epi32(tmp6, zero); + stp1_4 = _mm_packs_epi32(tmp0, tmp2); + stp1_5 = _mm_packs_epi32(tmp4, tmp6); } // Stage2 { //NOLINT - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); + const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); + const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); tmp0 = _mm_madd_epi16(lo_04, stg2_0); tmp2 = _mm_madd_epi16(lo_04, stg2_1); @@ -1022,24 +1064,26 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - stp2_0 = _mm_packs_epi32(tmp0, zero); - stp2_1 = _mm_packs_epi32(tmp2, zero); - stp2_2 = _mm_packs_epi32(tmp4, zero); - stp2_3 = _mm_packs_epi32(tmp6, zero); + stp2_0 = _mm_packs_epi32(tmp0, tmp2); + stp2_2 = _mm_packs_epi32(tmp6, tmp4); - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); + tmp0 = _mm_adds_epi16(stp1_4, stp1_5); + tmp1 = _mm_subs_epi16(stp1_4, stp1_5); + + stp2_4 = tmp0; + stp2_5 = _mm_unpacklo_epi64(tmp1, zero); + stp2_6 = _mm_unpackhi_epi64(tmp1, zero); } // Stage3 { //NOLINT const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); + + tmp4 = _mm_adds_epi16(stp2_0, stp2_2); + tmp6 = _mm_subs_epi16(stp2_0, stp2_2); + + stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); + stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); tmp0 = _mm_madd_epi16(lo_56, stg3_0); tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 @@ -1049,27 +1093,19 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - stp1_5 = _mm_packs_epi32(tmp0, zero); - stp1_6 = _mm_packs_epi32(tmp2, zero); + stp1_5 = _mm_packs_epi32(tmp0, tmp2); } // Stage4 - in0 = _mm_adds_epi16(stp1_0, stp2_7); - in1 = _mm_adds_epi16(stp1_1, stp1_6); - in2 = _mm_adds_epi16(stp1_2, stp1_5); - in3 = _mm_adds_epi16(stp1_3, stp2_4); - in4 = _mm_subs_epi16(stp1_3, stp2_4); - in5 = _mm_subs_epi16(stp1_2, stp1_5); - in6 = _mm_subs_epi16(stp1_1, stp1_6); - in7 = _mm_subs_epi16(stp1_0, stp2_7); - - // Columns. 
4x8 Transpose - TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7) - - // 1D idct8x8 - IDCT8_1D + tmp0 = _mm_adds_epi16(stp1_3, stp2_4); + tmp1 = _mm_adds_epi16(stp1_2, stp1_5); + tmp2 = _mm_subs_epi16(stp1_3, stp2_4); + tmp3 = _mm_subs_epi16(stp1_2, stp1_5); + TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + + IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, + in0, in1, in2, in3, in4, in5, in6, in7); // Final rounding and shift in0 = _mm_adds_epi16(in0, final_rounding); in1 = _mm_adds_epi16(in1, final_rounding); @@ -1099,17 +1135,17 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } -#define IDCT16_1D \ +#define IDCT16 \ /* Stage2 */ \ { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ + const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ \ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ stg2_0, stg2_1, stg2_2, stg2_3, \ @@ -1122,10 +1158,10 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { \ /* Stage3 */ \ { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ \ MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ stg3_0, stg3_1, stg3_2, stg3_3, \ @@ -1144,10 +1180,10 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { \ /* Stage4 */ \ { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ \ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ @@ -1228,6 +1264,114 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { stp2_10, stp2_13, stp2_11, stp2_12) \ } +#define IDCT16_10 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_15 = 
_mm_unpackhi_epi16(in[1], zero); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ + stg2_0, stg2_1, stg2_6, stg2_7, \ + stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ + stg3_0, stg3_1, \ + stp2_4, stp2_7) \ + \ + stp1_9 = stp1_8_0; \ + stp1_10 = stp1_11; \ + \ + stp1_13 = stp1_12_0; \ + stp1_14 = stp1_15; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ + stg4_0, stg4_1, \ + stp1_0, stp1_1) \ + stp2_5 = stp2_4; \ + stp2_6 = stp2_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_2 = stp1_1; \ + stp1_3 = stp1_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } + void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -1259,16 +1403,7 @@ 
void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, - in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, - in10 = zero, in11 = zero, in12 = zero, in13 = zero, - in14 = zero, in15 = zero; - __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, - l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, - l12 = zero, l13 = zero, l14 = zero, l15 = zero; - __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero, - r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero, - r12 = zero, r13 = zero, r14 = zero, r15 = zero; + __m128i in[16], l[16], r[16], *curr1; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, stp1_12_0; @@ -1277,162 +1412,132 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; - // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct. - for (i = 0; i < 4; i++) { - // 1-D idct - if (i < 2) { - if (i == 1) input += 128; + curr1 = l; + for (i = 0; i < 2; i++) { + // 1-D idct // Load input data. - in0 = _mm_load_si128((const __m128i *)input); - in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); - in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); - in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); - in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); - in4 = _mm_load_si128((const __m128i *)(input + 8 * 8)); - in12 = _mm_load_si128((const __m128i *)(input + 8 * 9)); - in5 = _mm_load_si128((const __m128i *)(input + 8 * 10)); - in13 = _mm_load_si128((const __m128i *)(input + 8 * 11)); - in6 = _mm_load_si128((const __m128i *)(input + 8 * 12)); - in14 = _mm_load_si128((const __m128i *)(input + 8 * 13)); - in7 = _mm_load_si128((const __m128i *)(input + 8 * 14)); - in15 = _mm_load_si128((const __m128i *)(input + 8 * 15)); - - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - } - - if (i == 2) { - TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12, - in13, in14, in15); - } - - if (i == 3) { - TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, - in12, in13, in14, in15); - } + in[0] = _mm_load_si128((const __m128i *)input); + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); + in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); + in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); + in[5] = _mm_load_si128((const __m128i 
*)(input + 8 * 10)); + in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); + in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); + in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); + in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); + in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); + + array_transpose_8x8(in, in); + array_transpose_8x8(in+8, in+8); + + IDCT16 + + // Stage7 + curr1[0] = _mm_add_epi16(stp2_0, stp1_15); + curr1[1] = _mm_add_epi16(stp2_1, stp1_14); + curr1[2] = _mm_add_epi16(stp2_2, stp2_13); + curr1[3] = _mm_add_epi16(stp2_3, stp2_12); + curr1[4] = _mm_add_epi16(stp2_4, stp2_11); + curr1[5] = _mm_add_epi16(stp2_5, stp2_10); + curr1[6] = _mm_add_epi16(stp2_6, stp1_9); + curr1[7] = _mm_add_epi16(stp2_7, stp1_8); + curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); + curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); + curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); + curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); + curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); + curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); + curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); + curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); + + curr1 = r; + input += 128; + } + for (i = 0; i < 2; i++) { + // 1-D idct + array_transpose_8x8(l+i*8, in); + array_transpose_8x8(r+i*8, in+8); - IDCT16_1D + IDCT16 - // Stage7 - if (i == 0) { - // Left 8x16 - l0 = _mm_add_epi16(stp2_0, stp1_15); - l1 = _mm_add_epi16(stp2_1, stp1_14); - l2 = _mm_add_epi16(stp2_2, stp2_13); - l3 = _mm_add_epi16(stp2_3, stp2_12); - l4 = _mm_add_epi16(stp2_4, stp2_11); - l5 = _mm_add_epi16(stp2_5, stp2_10); - l6 = _mm_add_epi16(stp2_6, stp1_9); - l7 = _mm_add_epi16(stp2_7, stp1_8); - l8 = _mm_sub_epi16(stp2_7, stp1_8); - l9 = _mm_sub_epi16(stp2_6, stp1_9); - l10 = _mm_sub_epi16(stp2_5, stp2_10); - l11 = _mm_sub_epi16(stp2_4, stp2_11); - l12 = _mm_sub_epi16(stp2_3, stp2_12); - l13 = _mm_sub_epi16(stp2_2, stp2_13); - l14 = _mm_sub_epi16(stp2_1, stp1_14); - l15 = _mm_sub_epi16(stp2_0, stp1_15); - } else if (i == 1) { - // Right 8x16 - r0 = _mm_add_epi16(stp2_0, stp1_15); - r1 = _mm_add_epi16(stp2_1, stp1_14); - r2 = _mm_add_epi16(stp2_2, stp2_13); - r3 = _mm_add_epi16(stp2_3, stp2_12); - r4 = _mm_add_epi16(stp2_4, stp2_11); - r5 = _mm_add_epi16(stp2_5, stp2_10); - r6 = _mm_add_epi16(stp2_6, stp1_9); - r7 = _mm_add_epi16(stp2_7, stp1_8); - r8 = _mm_sub_epi16(stp2_7, stp1_8); - r9 = _mm_sub_epi16(stp2_6, stp1_9); - r10 = _mm_sub_epi16(stp2_5, stp2_10); - r11 = _mm_sub_epi16(stp2_4, stp2_11); - r12 = _mm_sub_epi16(stp2_3, stp2_12); - r13 = _mm_sub_epi16(stp2_2, stp2_13); - r14 = _mm_sub_epi16(stp2_1, stp1_14); - r15 = _mm_sub_epi16(stp2_0, stp1_15); - } else { // 2-D - in0 = _mm_add_epi16(stp2_0, stp1_15); - in1 = _mm_add_epi16(stp2_1, stp1_14); - in2 = _mm_add_epi16(stp2_2, stp2_13); - in3 = _mm_add_epi16(stp2_3, stp2_12); - in4 = _mm_add_epi16(stp2_4, stp2_11); - in5 = _mm_add_epi16(stp2_5, stp2_10); - in6 = _mm_add_epi16(stp2_6, stp1_9); - in7 = _mm_add_epi16(stp2_7, stp1_8); - in8 = _mm_sub_epi16(stp2_7, stp1_8); - in9 = _mm_sub_epi16(stp2_6, stp1_9); - in10 = _mm_sub_epi16(stp2_5, stp2_10); - in11 = _mm_sub_epi16(stp2_4, stp2_11); - in12 = _mm_sub_epi16(stp2_3, stp2_12); - in13 = _mm_sub_epi16(stp2_2, stp2_13); - in14 = _mm_sub_epi16(stp2_1, stp1_14); - in15 = _mm_sub_epi16(stp2_0, stp1_15); + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, 
stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - in8 = _mm_adds_epi16(in8, final_rounding); - in9 = _mm_adds_epi16(in9, final_rounding); - in10 = _mm_adds_epi16(in10, final_rounding); - in11 = _mm_adds_epi16(in11, final_rounding); - in12 = _mm_adds_epi16(in12, final_rounding); - in13 = _mm_adds_epi16(in13, final_rounding); - in14 = _mm_adds_epi16(in14, final_rounding); - in15 = _mm_adds_epi16(in15, final_rounding); - - in0 = _mm_srai_epi16(in0, 6); - in1 = _mm_srai_epi16(in1, 6); - in2 = _mm_srai_epi16(in2, 6); - in3 = _mm_srai_epi16(in3, 6); - in4 = _mm_srai_epi16(in4, 6); - in5 = _mm_srai_epi16(in5, 6); - in6 = _mm_srai_epi16(in6, 6); - in7 = _mm_srai_epi16(in7, 6); - in8 = _mm_srai_epi16(in8, 6); - in9 = _mm_srai_epi16(in9, 6); - in10 = _mm_srai_epi16(in10, 6); - in11 = _mm_srai_epi16(in11, 6); - in12 = _mm_srai_epi16(in12, 6); - in13 = _mm_srai_epi16(in13, 6); - in14 = _mm_srai_epi16(in14, 6); - in15 = _mm_srai_epi16(in15, 6); - - RECON_AND_STORE(dest, in0); - RECON_AND_STORE(dest, in1); - RECON_AND_STORE(dest, in2); - RECON_AND_STORE(dest, in3); - RECON_AND_STORE(dest, in4); - RECON_AND_STORE(dest, in5); - RECON_AND_STORE(dest, in6); - RECON_AND_STORE(dest, in7); - RECON_AND_STORE(dest, in8); - RECON_AND_STORE(dest, in9); - RECON_AND_STORE(dest, in10); - RECON_AND_STORE(dest, in11); - RECON_AND_STORE(dest, in12); - RECON_AND_STORE(dest, in13); - RECON_AND_STORE(dest, in14); - RECON_AND_STORE(dest, in15); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); 
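A minimal scalar sketch of what the rounding/shift/reconstruction tail here computes per pixel, assuming final_rounding holds 1 << 5 (the complement of the >> 6 two-pass scaling) and that RECON_AND_STORE widens the destination bytes, adds the residual, and saturates back to 8 bits as elsewhere in this file; recon_pixel is an illustrative name, not a function from the patch:

    #include <stdint.h>

    static uint8_t recon_pixel(int16_t residual, uint8_t pred) {
      int v = (residual + 32) >> 6;  /* ROUND_POWER_OF_TWO(residual, 6) */
      v += pred;                     /* add residual to the prediction  */
      if (v < 0) v = 0;              /* saturate like _mm_packus_epi16  */
      if (v > 255) v = 255;
      return (uint8_t)v;
    }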
+ in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); dest += 8 - (stride * 16); - } } } @@ -1485,7 +1590,7 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { res0[15] = tbuf[7]; } -static void iadst16_1d_8col(__m128i *in) { +static void iadst16_8col(__m128i *in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -1955,7 +2060,7 @@ static void iadst16_1d_8col(__m128i *in) { in[15] = _mm_sub_epi16(kZero, s[1]); } -static void idct16_1d_8col(__m128i *in) { +static void idct16_8col(__m128i *in) { const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); @@ -2299,16 +2404,16 @@ static void idct16_1d_8col(__m128i *in) { in[15] = _mm_sub_epi16(s[0], s[15]); } -static void idct16_1d_sse2(__m128i *in0, __m128i *in1) { +static void idct16_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); - idct16_1d_8col(in0); - idct16_1d_8col(in1); + idct16_8col(in0); + idct16_8col(in1); } -static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { +static void iadst16_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); - iadst16_1d_8col(in0); - iadst16_1d_8col(in1); + iadst16_8col(in0); + iadst16_8col(in1); } static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { @@ -2397,20 +2502,20 @@ void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, switch (tx_type) { case 0: // DCT_DCT - idct16_1d_sse2(in0, in1); - idct16_1d_sse2(in0, in1); + idct16_sse2(in0, in1); + idct16_sse2(in0, in1); break; case 1: // ADST_DCT - idct16_1d_sse2(in0, in1); - iadst16_1d_sse2(in0, in1); + idct16_sse2(in0, in1); + iadst16_sse2(in0, in1); break; case 2: // DCT_ADST - iadst16_1d_sse2(in0, in1); - idct16_1d_sse2(in0, in1); + iadst16_sse2(in0, in1); + idct16_sse2(in0, in1); break; case 3: // ADST_ADST - iadst16_1d_sse2(in0, in1); - iadst16_1d_sse2(in0, in1); + iadst16_sse2(in0, in1); + iadst16_sse2(in0, in1); break; default: assert(0); @@ -2430,149 +2535,87 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = 
pair_set_epi16(cospi_20_64, cospi_12_64); const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, - in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, - in10 = zero, in11 = zero, in12 = zero, in13 = zero, - in14 = zero, in15 = zero; - __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, - l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, - l12 = zero, l13 = zero, l14 = zero, l15 = zero; - - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + __m128i in[16], l[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, stp1_12_0; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; - // 1-D idct. Load input data. - in0 = _mm_load_si128((const __m128i *)input); - in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); - in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); - in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); - in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); + // First 1-D inverse DCT + // Load input data. 
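Why only four loads: with at most 10 nonzero coefficients the quantized block is confined to its top-left 4x4 corner, so only rows 0-3 of the 16x16 input can carry data, and input + 8 * 2 steps one full 16-coefficient row because the pointer is in int16_t units. A plain-C sketch of the same access pattern (tmp is a hypothetical staging buffer, not part of the patch):

    #include <string.h>
    #include <stdint.h>

    static void load_top_rows(const int16_t *input, int16_t tmp[4][8]) {
      int r;
      for (r = 0; r < 4; ++r)            /* rows 0..3 of the 16x16 block */
        memcpy(tmp[r], input + 16 * r,   /* same as input + 8 * 2 * r    */
               sizeof(tmp[r]));          /* low 8 of 16 coefficients     */
    }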
+ in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); - TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); + TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); // Stage2 { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); - const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); - const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); - const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); + const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); - tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); - tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); - tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); tmp0 = _mm_add_epi32(tmp0, rounding); tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); tmp5 = _mm_add_epi32(tmp5, rounding); tmp7 = _mm_add_epi32(tmp7, rounding); tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - stp2_8 = _mm_packs_epi32(tmp0, zero); - stp2_15 = _mm_packs_epi32(tmp2, zero); - stp2_9 = _mm_packs_epi32(tmp4, zero); - stp2_14 = _mm_packs_epi32(tmp6, zero); - - stp2_10 = _mm_packs_epi32(tmp1, zero); - stp2_13 = _mm_packs_epi32(tmp3, zero); - stp2_11 = _mm_packs_epi32(tmp5, zero); - stp2_12 = _mm_packs_epi32(tmp7, zero); + stp2_8 = _mm_packs_epi32(tmp0, tmp2); + stp2_11 = _mm_packs_epi32(tmp5, tmp7); } // Stage3 { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); - const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); - tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); - tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); tmp0 = _mm_add_epi32(tmp0, rounding); tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, zero); - stp1_7 = _mm_packs_epi32(tmp2, zero); - stp1_5 = _mm_packs_epi32(tmp4, zero); - stp1_6 = _mm_packs_epi32(tmp6, zero); - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); + stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + stp1_4 = _mm_packs_epi32(tmp0, tmp2); } // Stage4 { - const __m128i 
lo_0_8 = _mm_unpacklo_epi16(in0, in8); - const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); - tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); @@ -2580,8 +2623,6 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, tmp0 = _mm_add_epi32(tmp0, rounding); tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); tmp1 = _mm_add_epi32(tmp1, rounding); tmp3 = _mm_add_epi32(tmp3, rounding); tmp5 = _mm_add_epi32(tmp5, rounding); @@ -2589,49 +2630,40 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - stp2_0 = _mm_packs_epi32(tmp0, zero); - stp2_1 = _mm_packs_epi32(tmp2, zero); - stp2_2 = _mm_packs_epi32(tmp4, zero); - stp2_3 = _mm_packs_epi32(tmp6, zero); - stp2_9 = _mm_packs_epi32(tmp1, zero); - stp2_14 = _mm_packs_epi32(tmp3, zero); - stp2_10 = _mm_packs_epi32(tmp5, zero); - stp2_13 = _mm_packs_epi32(tmp7, zero); - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + stp1_0 = _mm_packs_epi32(tmp0, tmp0); + stp1_1 = _mm_packs_epi32(tmp2, tmp2); + stp2_9 = _mm_packs_epi32(tmp1, tmp3); + stp2_10 = _mm_packs_epi32(tmp5, tmp7); + + stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); } // Stage5 and Stage6 { - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); - - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); + tmp0 = _mm_add_epi16(stp2_8, stp2_11); + tmp1 = _mm_sub_epi16(stp2_8, stp2_11); + tmp2 = _mm_add_epi16(stp2_9, stp2_10); + tmp3 = _mm_sub_epi16(stp2_9, stp2_10); + + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_10 = _mm_unpacklo_epi64(tmp3, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_11 = _mm_unpacklo_epi64(tmp1, zero); + + stp1_13 = _mm_unpackhi_epi64(tmp3, zero); + stp1_14 = _mm_unpackhi_epi64(tmp2, zero); + stp1_12 = _mm_unpackhi_epi64(tmp1, zero); + stp1_15 = _mm_unpackhi_epi64(tmp0, zero); } // Stage6 { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); @@ 
-2656,124 +2688,121 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - stp1_5 = _mm_packs_epi32(tmp1, zero); - stp1_6 = _mm_packs_epi32(tmp3, zero); + stp1_6 = _mm_packs_epi32(tmp3, tmp1); + stp2_10 = _mm_packs_epi32(tmp0, zero); stp2_13 = _mm_packs_epi32(tmp2, zero); stp2_11 = _mm_packs_epi32(tmp4, zero); stp2_12 = _mm_packs_epi32(tmp6, zero); - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); + tmp0 = _mm_add_epi16(stp1_0, stp1_4); + tmp1 = _mm_sub_epi16(stp1_0, stp1_4); + tmp2 = _mm_add_epi16(stp1_1, stp1_6); + tmp3 = _mm_sub_epi16(stp1_1, stp1_6); + + stp2_0 = _mm_unpackhi_epi64(tmp0, zero); + stp2_1 = _mm_unpacklo_epi64(tmp2, zero); + stp2_2 = _mm_unpackhi_epi64(tmp2, zero); + stp2_3 = _mm_unpacklo_epi64(tmp0, zero); + stp2_4 = _mm_unpacklo_epi64(tmp1, zero); + stp2_5 = _mm_unpackhi_epi64(tmp3, zero); + stp2_6 = _mm_unpacklo_epi64(tmp3, zero); + stp2_7 = _mm_unpackhi_epi64(tmp1, zero); } // Stage7. Left 8x16 only. - l0 = _mm_add_epi16(stp2_0, stp1_15); - l1 = _mm_add_epi16(stp2_1, stp1_14); - l2 = _mm_add_epi16(stp2_2, stp2_13); - l3 = _mm_add_epi16(stp2_3, stp2_12); - l4 = _mm_add_epi16(stp2_4, stp2_11); - l5 = _mm_add_epi16(stp2_5, stp2_10); - l6 = _mm_add_epi16(stp2_6, stp1_9); - l7 = _mm_add_epi16(stp2_7, stp1_8); - l8 = _mm_sub_epi16(stp2_7, stp1_8); - l9 = _mm_sub_epi16(stp2_6, stp1_9); - l10 = _mm_sub_epi16(stp2_5, stp2_10); - l11 = _mm_sub_epi16(stp2_4, stp2_11); - l12 = _mm_sub_epi16(stp2_3, stp2_12); - l13 = _mm_sub_epi16(stp2_2, stp2_13); - l14 = _mm_sub_epi16(stp2_1, stp1_14); - l15 = _mm_sub_epi16(stp2_0, stp1_15); - - // 2-D idct. We do 2 8x16 blocks. 
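The recurring pattern in these hunks, in scalar form: every stage is a rounded 14-bit fixed-point multiply-accumulate, and the rewrite's gain comes from packing two 32-bit results into one register (_mm_packs_epi32(tmp0, tmp2)) instead of padding each with zero. A self-contained sketch, assuming DCT_CONST_BITS is 14 as in vp9/common/vp9_idct.h; dct_mul_round is an illustrative name:

    #include <stdint.h>

    #define DCT_CONST_BITS 14

    /* pair_set_epi16(a, b) interleaves two cosines so that _mm_madd_epi16
     * yields x * a + y * b per 32-bit lane; this models one such lane. */
    static int16_t dct_mul_round(int16_t x, int16_t y, int a, int b) {
      int32_t t = x * a + y * b;                                /* madd  */
      t = (t + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;  /* round */
      if (t > INT16_MAX) t = INT16_MAX;    /* _mm_packs_epi32 saturates  */
      if (t < INT16_MIN) t = INT16_MIN;
      return (int16_t)t;
    }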
+ l[0] = _mm_add_epi16(stp2_0, stp1_15); + l[1] = _mm_add_epi16(stp2_1, stp1_14); + l[2] = _mm_add_epi16(stp2_2, stp2_13); + l[3] = _mm_add_epi16(stp2_3, stp2_12); + l[4] = _mm_add_epi16(stp2_4, stp2_11); + l[5] = _mm_add_epi16(stp2_5, stp2_10); + l[6] = _mm_add_epi16(stp2_6, stp1_9); + l[7] = _mm_add_epi16(stp2_7, stp1_8); + l[8] = _mm_sub_epi16(stp2_7, stp1_8); + l[9] = _mm_sub_epi16(stp2_6, stp1_9); + l[10] = _mm_sub_epi16(stp2_5, stp2_10); + l[11] = _mm_sub_epi16(stp2_4, stp2_11); + l[12] = _mm_sub_epi16(stp2_3, stp2_12); + l[13] = _mm_sub_epi16(stp2_2, stp2_13); + l[14] = _mm_sub_epi16(stp2_1, stp1_14); + l[15] = _mm_sub_epi16(stp2_0, stp1_15); + + // Second 1-D inverse transform, performed per 8x16 block for (i = 0; i < 2; i++) { - if (i == 0) - TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, - in5, in6, in7); - - if (i == 1) - TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, - in4, in5, in6, in7); - - in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; + array_transpose_4X8(l + 8*i, in); - IDCT16_1D + IDCT16_10 // Stage7 - in0 = _mm_add_epi16(stp2_0, stp1_15); - in1 = _mm_add_epi16(stp2_1, stp1_14); - in2 = _mm_add_epi16(stp2_2, stp2_13); - in3 = _mm_add_epi16(stp2_3, stp2_12); - in4 = _mm_add_epi16(stp2_4, stp2_11); - in5 = _mm_add_epi16(stp2_5, stp2_10); - in6 = _mm_add_epi16(stp2_6, stp1_9); - in7 = _mm_add_epi16(stp2_7, stp1_8); - in8 = _mm_sub_epi16(stp2_7, stp1_8); - in9 = _mm_sub_epi16(stp2_6, stp1_9); - in10 = _mm_sub_epi16(stp2_5, stp2_10); - in11 = _mm_sub_epi16(stp2_4, stp2_11); - in12 = _mm_sub_epi16(stp2_3, stp2_12); - in13 = _mm_sub_epi16(stp2_2, stp2_13); - in14 = _mm_sub_epi16(stp2_1, stp1_14); - in15 = _mm_sub_epi16(stp2_0, stp1_15); + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - in8 = _mm_adds_epi16(in8, final_rounding); - in9 = _mm_adds_epi16(in9, final_rounding); - in10 = _mm_adds_epi16(in10, final_rounding); - in11 = _mm_adds_epi16(in11, final_rounding); - in12 = _mm_adds_epi16(in12, final_rounding); - in13 = _mm_adds_epi16(in13, final_rounding); - in14 = _mm_adds_epi16(in14, final_rounding); - in15 = _mm_adds_epi16(in15, final_rounding); - - in0 = _mm_srai_epi16(in0, 6); - in1 = _mm_srai_epi16(in1, 6); - in2 = _mm_srai_epi16(in2, 6); - in3 = _mm_srai_epi16(in3, 6); - in4 = _mm_srai_epi16(in4, 6); - in5 = _mm_srai_epi16(in5, 6); - in6 = _mm_srai_epi16(in6, 6); - in7 = _mm_srai_epi16(in7, 6); - in8 = _mm_srai_epi16(in8, 6); - in9 = _mm_srai_epi16(in9, 6); - in10 = _mm_srai_epi16(in10, 6); - in11 = 
_mm_srai_epi16(in11, 6); - in12 = _mm_srai_epi16(in12, 6); - in13 = _mm_srai_epi16(in13, 6); - in14 = _mm_srai_epi16(in14, 6); - in15 = _mm_srai_epi16(in15, 6); - - RECON_AND_STORE(dest, in0); - RECON_AND_STORE(dest, in1); - RECON_AND_STORE(dest, in2); - RECON_AND_STORE(dest, in3); - RECON_AND_STORE(dest, in4); - RECON_AND_STORE(dest, in5); - RECON_AND_STORE(dest, in6); - RECON_AND_STORE(dest, in7); - RECON_AND_STORE(dest, in8); - RECON_AND_STORE(dest, in9); - RECON_AND_STORE(dest, in10); - RECON_AND_STORE(dest, in11); - RECON_AND_STORE(dest, in12); - RECON_AND_STORE(dest, in13); - RECON_AND_STORE(dest, in14); - RECON_AND_STORE(dest, in15); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); dest += 8 - (stride * 16); } @@ -2785,28 +2814,329 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, input += 8; \ } \ -#define IDCT32_1D \ +#define IDCT32_34 \ +/* Stage1 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ + \ + const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ + \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ + stg1_1, stp1_16, stp1_31); \ + MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ + stg1_7, stp1_19, stp1_28); \ + MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ + stg1_9, stp1_20, 
stp1_27); \ + MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ + stg1_15, stp1_23, stp1_24); \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ + \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ + stg2_1, stp2_8, stp2_15); \ + MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ + stg2_7, stp2_11, stp2_12); \ + \ + stp2_16 = stp1_16; \ + stp2_19 = stp1_19; \ + \ + stp2_20 = stp1_20; \ + stp2_23 = stp1_23; \ + \ + stp2_24 = stp1_24; \ + stp2_27 = stp1_27; \ + \ + stp2_28 = stp1_28; \ + stp2_31 = stp1_31; \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ + \ + MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ + stg3_1, stp1_4, stp1_7); \ + \ + stp1_8 = stp2_8; \ + stp1_11 = stp2_11; \ + stp1_12 = stp2_12; \ + stp1_15 = stp2_15; \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ + stg4_1, stp2_0, stp2_1); \ + \ + stp2_4 = stp1_4; \ + stp2_5 = stp1_4; \ + stp2_6 = stp1_7; \ + stp2_7 = stp1_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ 
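Context for the zero unpacks in IDCT32_34: with at most 34 nonzero coefficients the data sits in the top-left 8x8 of the 32x32 block, so one side of each stage-1/2 butterfly is a zero register, x * c + 0 * c' collapses to a single rounded multiply, and the add/sub pairs whose second operand would be zero become the plain pass-through assignments (stp2_16 = stp1_16; ...). A scalar model of one such half butterfly, with the saturation of the final pack omitted for brevity; half_butterfly is an illustrative name:

    #include <stdint.h>

    #define DCT_CONST_BITS 14

    static void half_butterfly(int16_t x, int c0, int c1,
                               int16_t *out_lo, int16_t *out_hi) {
      const int32_t r = 1 << (DCT_CONST_BITS - 1);
      *out_lo = (int16_t)((x * c0 + r) >> DCT_CONST_BITS);  /* partner row is 0 */
      *out_hi = (int16_t)((x * c1 + r) >> DCT_CONST_BITS);
    }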
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = stp2_0; \ + stp1_1 = stp2_1; \ + stp1_2 = stp2_1; \ + stp1_3 = stp2_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + 
stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} + + +#define IDCT32 \ /* Stage1 */ \ { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ - const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], 
in[31]); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ + \ + const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ + const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ + const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ \ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ @@ -2824,15 +3154,15 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, \ /* Stage2 */ \ { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ + const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ \ MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ @@ -2864,10 +3194,10 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, \ /* Stage3 */ \ { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ \ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ @@ -2911,10 +3241,10 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, \ /* Stage4 */ \ { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ + const 
__m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ \ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ @@ -3171,10 +3501,7 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, - in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, - in24, in25, in26, in27, in28, in29, in30, in31; - __m128i col[128]; + __m128i in[32], col[32]; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, @@ -3186,296 +3513,225 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; - - // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. - for (i = 0; i < 8; i++) { - i32 = (i << 5); - if (i == 0) { - // First 1-D idct: first 8 rows - // Load input data. - LOAD_DQCOEFF(in0, input); - LOAD_DQCOEFF(in8, input); - LOAD_DQCOEFF(in16, input); - LOAD_DQCOEFF(in24, input); - LOAD_DQCOEFF(in1, input); - LOAD_DQCOEFF(in9, input); - LOAD_DQCOEFF(in17, input); - LOAD_DQCOEFF(in25, input); - LOAD_DQCOEFF(in2, input); - LOAD_DQCOEFF(in10, input); - LOAD_DQCOEFF(in18, input); - LOAD_DQCOEFF(in26, input); - LOAD_DQCOEFF(in3, input); - LOAD_DQCOEFF(in11, input); - LOAD_DQCOEFF(in19, input); - LOAD_DQCOEFF(in27, input); - - LOAD_DQCOEFF(in4, input); - LOAD_DQCOEFF(in12, input); - LOAD_DQCOEFF(in20, input); - LOAD_DQCOEFF(in28, input); - LOAD_DQCOEFF(in5, input); - LOAD_DQCOEFF(in13, input); - LOAD_DQCOEFF(in21, input); - LOAD_DQCOEFF(in29, input); - LOAD_DQCOEFF(in6, input); - LOAD_DQCOEFF(in14, input); - LOAD_DQCOEFF(in22, input); - LOAD_DQCOEFF(in30, input); - LOAD_DQCOEFF(in7, input); - LOAD_DQCOEFF(in15, input); - LOAD_DQCOEFF(in23, input); - LOAD_DQCOEFF(in31, input); - - // Transpose 32x8 block to 8x32 block - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, - in18, in19, in20, in21, in22, in23); - TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, - in26, in27, in28, in29, in30, in31); - } else if (i < 4) { - // First 1-D idct: next 24 zero-coeff rows - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = 
_mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] = _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } else { - // Second 1-D idct - j = i - 4; - - // Transpose 32x8 block to 8x32 block - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, - in5, in6, in7); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, - in11, in12, in13, in14, in15); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, - in19, in20, in21, in22, in23); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, - in28, in29, in30, in31); - } - - IDCT32_1D + int i; + // Load input data. + LOAD_DQCOEFF(in[0], input); + LOAD_DQCOEFF(in[8], input); + LOAD_DQCOEFF(in[16], input); + LOAD_DQCOEFF(in[24], input); + LOAD_DQCOEFF(in[1], input); + LOAD_DQCOEFF(in[9], input); + LOAD_DQCOEFF(in[17], input); + LOAD_DQCOEFF(in[25], input); + LOAD_DQCOEFF(in[2], input); + LOAD_DQCOEFF(in[10], input); + LOAD_DQCOEFF(in[18], input); + LOAD_DQCOEFF(in[26], input); + LOAD_DQCOEFF(in[3], input); + LOAD_DQCOEFF(in[11], input); + LOAD_DQCOEFF(in[19], input); + LOAD_DQCOEFF(in[27], input); + + LOAD_DQCOEFF(in[4], input); + LOAD_DQCOEFF(in[12], input); + LOAD_DQCOEFF(in[20], input); + LOAD_DQCOEFF(in[28], input); + LOAD_DQCOEFF(in[5], input); + LOAD_DQCOEFF(in[13], input); + LOAD_DQCOEFF(in[21], input); + LOAD_DQCOEFF(in[29], input); + LOAD_DQCOEFF(in[6], input); + LOAD_DQCOEFF(in[14], input); + LOAD_DQCOEFF(in[22], input); + LOAD_DQCOEFF(in[30], input); + LOAD_DQCOEFF(in[7], input); + LOAD_DQCOEFF(in[15], input); + LOAD_DQCOEFF(in[23], input); + LOAD_DQCOEFF(in[31], input); - // final stage - if (i < 4) { - // 1_D: Store 32 intermediate results for each 8x32 block. 
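The thirty-two vector adds and subtracts in this hunk, in both the deleted col[i32 + k] form and the new col[k] form, are the final butterfly stage of the 1-D IDCT32: output k pairs intermediate k with its mirror 31 - k. A scalar sketch of what the vector code computes, with illustrative names not taken from the source:

#include <stdint.h>

/* Final IDCT32 butterfly stage: out[k] and out[31 - k] are the sum and
 * difference of the stage-7 intermediates stp1[k] and stp1[31 - k]. */
static void idct32_final_stage(const int16_t *stp1, int16_t *out) {
  int k;
  for (k = 0; k < 16; ++k) {
    out[k]      = (int16_t)(stp1[k] + stp1[31 - k]);
    out[31 - k] = (int16_t)(stp1[k] - stp1[31 - k]);
  }
}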
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } else { + array_transpose_8x8(in, in); + array_transpose_8x8(in+8, in+8); + array_transpose_8x8(in+16, in+16); + array_transpose_8x8(in+24, in+24); + + IDCT32 + + // 1_D: Store 32 intermediate results for each 8x32 block. + col[0] = _mm_add_epi16(stp1_0, stp1_31); + col[1] = _mm_add_epi16(stp1_1, stp1_30); + col[2] = _mm_add_epi16(stp1_2, stp1_29); + col[3] = _mm_add_epi16(stp1_3, stp1_28); + col[4] = _mm_add_epi16(stp1_4, stp1_27); + col[5] = _mm_add_epi16(stp1_5, stp1_26); + col[6] = _mm_add_epi16(stp1_6, stp1_25); + col[7] = _mm_add_epi16(stp1_7, stp1_24); + col[8] = _mm_add_epi16(stp1_8, stp1_23); + col[9] = _mm_add_epi16(stp1_9, stp1_22); + col[10] = _mm_add_epi16(stp1_10, stp1_21); + col[11] = _mm_add_epi16(stp1_11, stp1_20); + col[12] = _mm_add_epi16(stp1_12, stp1_19); + col[13] = _mm_add_epi16(stp1_13, stp1_18); + col[14] = _mm_add_epi16(stp1_14, stp1_17); + col[15] = _mm_add_epi16(stp1_15, stp1_16); + col[16] = _mm_sub_epi16(stp1_15, stp1_16); + col[17] = _mm_sub_epi16(stp1_14, stp1_17); + col[18] = _mm_sub_epi16(stp1_13, stp1_18); + col[19] = _mm_sub_epi16(stp1_12, stp1_19); + col[20] = _mm_sub_epi16(stp1_11, stp1_20); + col[21] = _mm_sub_epi16(stp1_10, stp1_21); + col[22] = _mm_sub_epi16(stp1_9, stp1_22); + col[23] = _mm_sub_epi16(stp1_8, stp1_23); + col[24] = _mm_sub_epi16(stp1_7, stp1_24); + col[25] = _mm_sub_epi16(stp1_6, stp1_25); + col[26] = _mm_sub_epi16(stp1_5, stp1_26); + col[27] = _mm_sub_epi16(stp1_4, stp1_27); + col[28] = _mm_sub_epi16(stp1_3, stp1_28); + col[29] = _mm_sub_epi16(stp1_2, stp1_29); + col[30] = _mm_sub_epi16(stp1_1, stp1_30); + col[31] = _mm_sub_epi16(stp1_0, stp1_31); + for (i = 0; i < 4; i++) { const __m128i zero = _mm_setzero_si128(); + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col+i*8, in); + IDCT32_34 // 2_D: Calculate the results and store them to destination. 
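The "2_D" pass that follows rounds each residual with (x + 32) >> 6 and adds it to the prediction, clamped to 8 bits; that is what the final_rounding add (final_rounding is defined earlier in the file, presumably as _mm_set1_epi16(1 << 5)), the arithmetic shift by 6, and RECON_AND_STORE compute eight pixels at a time. A scalar equivalent for one pixel:

#include <stdint.h>

/* Scalar sketch of the final rounding, shift, and reconstruction step.
 * The SSE2 code performs the same arithmetic with _mm_adds_epi16,
 * _mm_srai_epi16(.., 6) and the RECON_AND_STORE macro. */
static uint8_t recon_pixel(int16_t residual, uint8_t pred) {
  const int rounded = (residual + 32) >> 6;  /* ROUND_POWER_OF_TWO(x, 6) */
  const int sum = pred + rounded;
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));  /* clamp to [0, 255] */
}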
- in0 = _mm_add_epi16(stp1_0, stp1_31); - in1 = _mm_add_epi16(stp1_1, stp1_30); - in2 = _mm_add_epi16(stp1_2, stp1_29); - in3 = _mm_add_epi16(stp1_3, stp1_28); - in4 = _mm_add_epi16(stp1_4, stp1_27); - in5 = _mm_add_epi16(stp1_5, stp1_26); - in6 = _mm_add_epi16(stp1_6, stp1_25); - in7 = _mm_add_epi16(stp1_7, stp1_24); - in8 = _mm_add_epi16(stp1_8, stp1_23); - in9 = _mm_add_epi16(stp1_9, stp1_22); - in10 = _mm_add_epi16(stp1_10, stp1_21); - in11 = _mm_add_epi16(stp1_11, stp1_20); - in12 = _mm_add_epi16(stp1_12, stp1_19); - in13 = _mm_add_epi16(stp1_13, stp1_18); - in14 = _mm_add_epi16(stp1_14, stp1_17); - in15 = _mm_add_epi16(stp1_15, stp1_16); - in16 = _mm_sub_epi16(stp1_15, stp1_16); - in17 = _mm_sub_epi16(stp1_14, stp1_17); - in18 = _mm_sub_epi16(stp1_13, stp1_18); - in19 = _mm_sub_epi16(stp1_12, stp1_19); - in20 = _mm_sub_epi16(stp1_11, stp1_20); - in21 = _mm_sub_epi16(stp1_10, stp1_21); - in22 = _mm_sub_epi16(stp1_9, stp1_22); - in23 = _mm_sub_epi16(stp1_8, stp1_23); - in24 = _mm_sub_epi16(stp1_7, stp1_24); - in25 = _mm_sub_epi16(stp1_6, stp1_25); - in26 = _mm_sub_epi16(stp1_5, stp1_26); - in27 = _mm_sub_epi16(stp1_4, stp1_27); - in28 = _mm_sub_epi16(stp1_3, stp1_28); - in29 = _mm_sub_epi16(stp1_2, stp1_29); - in30 = _mm_sub_epi16(stp1_1, stp1_30); - in31 = _mm_sub_epi16(stp1_0, stp1_31); + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - in8 = _mm_adds_epi16(in8, final_rounding); - in9 = _mm_adds_epi16(in9, final_rounding); - in10 = _mm_adds_epi16(in10, final_rounding); - in11 = _mm_adds_epi16(in11, final_rounding); - in12 = _mm_adds_epi16(in12, final_rounding); - in13 = _mm_adds_epi16(in13, final_rounding); - in14 = _mm_adds_epi16(in14, final_rounding); - in15 = _mm_adds_epi16(in15, final_rounding); - in16 = _mm_adds_epi16(in16, final_rounding); - in17 = _mm_adds_epi16(in17, final_rounding); - in18 = 
_mm_adds_epi16(in18, final_rounding); - in19 = _mm_adds_epi16(in19, final_rounding); - in20 = _mm_adds_epi16(in20, final_rounding); - in21 = _mm_adds_epi16(in21, final_rounding); - in22 = _mm_adds_epi16(in22, final_rounding); - in23 = _mm_adds_epi16(in23, final_rounding); - in24 = _mm_adds_epi16(in24, final_rounding); - in25 = _mm_adds_epi16(in25, final_rounding); - in26 = _mm_adds_epi16(in26, final_rounding); - in27 = _mm_adds_epi16(in27, final_rounding); - in28 = _mm_adds_epi16(in28, final_rounding); - in29 = _mm_adds_epi16(in29, final_rounding); - in30 = _mm_adds_epi16(in30, final_rounding); - in31 = _mm_adds_epi16(in31, final_rounding); - - in0 = _mm_srai_epi16(in0, 6); - in1 = _mm_srai_epi16(in1, 6); - in2 = _mm_srai_epi16(in2, 6); - in3 = _mm_srai_epi16(in3, 6); - in4 = _mm_srai_epi16(in4, 6); - in5 = _mm_srai_epi16(in5, 6); - in6 = _mm_srai_epi16(in6, 6); - in7 = _mm_srai_epi16(in7, 6); - in8 = _mm_srai_epi16(in8, 6); - in9 = _mm_srai_epi16(in9, 6); - in10 = _mm_srai_epi16(in10, 6); - in11 = _mm_srai_epi16(in11, 6); - in12 = _mm_srai_epi16(in12, 6); - in13 = _mm_srai_epi16(in13, 6); - in14 = _mm_srai_epi16(in14, 6); - in15 = _mm_srai_epi16(in15, 6); - in16 = _mm_srai_epi16(in16, 6); - in17 = _mm_srai_epi16(in17, 6); - in18 = _mm_srai_epi16(in18, 6); - in19 = _mm_srai_epi16(in19, 6); - in20 = _mm_srai_epi16(in20, 6); - in21 = _mm_srai_epi16(in21, 6); - in22 = _mm_srai_epi16(in22, 6); - in23 = _mm_srai_epi16(in23, 6); - in24 = _mm_srai_epi16(in24, 6); - in25 = _mm_srai_epi16(in25, 6); - in26 = _mm_srai_epi16(in26, 6); - in27 = _mm_srai_epi16(in27, 6); - in28 = _mm_srai_epi16(in28, 6); - in29 = _mm_srai_epi16(in29, 6); - in30 = _mm_srai_epi16(in30, 6); - in31 = _mm_srai_epi16(in31, 6); - - RECON_AND_STORE(dest, in0); - RECON_AND_STORE(dest, in1); - RECON_AND_STORE(dest, in2); - RECON_AND_STORE(dest, in3); - RECON_AND_STORE(dest, in4); - RECON_AND_STORE(dest, in5); - RECON_AND_STORE(dest, in6); - RECON_AND_STORE(dest, in7); - RECON_AND_STORE(dest, in8); - RECON_AND_STORE(dest, in9); - RECON_AND_STORE(dest, in10); - RECON_AND_STORE(dest, in11); - RECON_AND_STORE(dest, in12); - RECON_AND_STORE(dest, in13); - RECON_AND_STORE(dest, in14); - RECON_AND_STORE(dest, in15); - RECON_AND_STORE(dest, in16); - RECON_AND_STORE(dest, in17); - RECON_AND_STORE(dest, in18); - RECON_AND_STORE(dest, in19); - RECON_AND_STORE(dest, in20); - RECON_AND_STORE(dest, in21); - RECON_AND_STORE(dest, in22); - RECON_AND_STORE(dest, in23); - RECON_AND_STORE(dest, in24); - RECON_AND_STORE(dest, in25); - RECON_AND_STORE(dest, in26); - RECON_AND_STORE(dest, in27); - RECON_AND_STORE(dest, in28); - RECON_AND_STORE(dest, in29); - RECON_AND_STORE(dest, in30); - RECON_AND_STORE(dest, in31); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + in[16] = _mm_adds_epi16(in[16], final_rounding); + 
in[17] = _mm_adds_epi16(in[17], final_rounding); + in[18] = _mm_adds_epi16(in[18], final_rounding); + in[19] = _mm_adds_epi16(in[19], final_rounding); + in[20] = _mm_adds_epi16(in[20], final_rounding); + in[21] = _mm_adds_epi16(in[21], final_rounding); + in[22] = _mm_adds_epi16(in[22], final_rounding); + in[23] = _mm_adds_epi16(in[23], final_rounding); + in[24] = _mm_adds_epi16(in[24], final_rounding); + in[25] = _mm_adds_epi16(in[25], final_rounding); + in[26] = _mm_adds_epi16(in[26], final_rounding); + in[27] = _mm_adds_epi16(in[27], final_rounding); + in[28] = _mm_adds_epi16(in[28], final_rounding); + in[29] = _mm_adds_epi16(in[29], final_rounding); + in[30] = _mm_adds_epi16(in[30], final_rounding); + in[31] = _mm_adds_epi16(in[31], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + in[16] = _mm_srai_epi16(in[16], 6); + in[17] = _mm_srai_epi16(in[17], 6); + in[18] = _mm_srai_epi16(in[18], 6); + in[19] = _mm_srai_epi16(in[19], 6); + in[20] = _mm_srai_epi16(in[20], 6); + in[21] = _mm_srai_epi16(in[21], 6); + in[22] = _mm_srai_epi16(in[22], 6); + in[23] = _mm_srai_epi16(in[23], 6); + in[24] = _mm_srai_epi16(in[24], 6); + in[25] = _mm_srai_epi16(in[25], 6); + in[26] = _mm_srai_epi16(in[26], 6); + in[27] = _mm_srai_epi16(in[27], 6); + in[28] = _mm_srai_epi16(in[28], 6); + in[29] = _mm_srai_epi16(in[29], 6); + in[30] = _mm_srai_epi16(in[30], 6); + in[31] = _mm_srai_epi16(in[31], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); + RECON_AND_STORE(dest, in[16]); + RECON_AND_STORE(dest, in[17]); + RECON_AND_STORE(dest, in[18]); + RECON_AND_STORE(dest, in[19]); + RECON_AND_STORE(dest, in[20]); + RECON_AND_STORE(dest, in[21]); + RECON_AND_STORE(dest, in[22]); + RECON_AND_STORE(dest, in[23]); + RECON_AND_STORE(dest, in[24]); + RECON_AND_STORE(dest, in[25]); + RECON_AND_STORE(dest, in[26]); + RECON_AND_STORE(dest, in[27]); + RECON_AND_STORE(dest, in[28]); + RECON_AND_STORE(dest, in[29]); + RECON_AND_STORE(dest, in[30]); + RECON_AND_STORE(dest, in[31]); dest += 8 - (stride * 32); } } -} void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int stride) { @@ -3530,10 +3786,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, - in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, - in24, in25, in26, in27, in28, in29, in30, in31; - __m128i col[128]; + __m128i in[32], col[128], zero_idx[16]; __m128i 
stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, @@ -3546,66 +3799,63 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i, j, i32; - __m128i zero_idx[16]; int zero_flag[2]; - // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. - for (i = 0; i < 8; i++) { + for (i = 0; i < 4; i++) { i32 = (i << 5); - if (i < 4) { // First 1-D idct // Load input data. - LOAD_DQCOEFF(in0, input); - LOAD_DQCOEFF(in8, input); - LOAD_DQCOEFF(in16, input); - LOAD_DQCOEFF(in24, input); - LOAD_DQCOEFF(in1, input); - LOAD_DQCOEFF(in9, input); - LOAD_DQCOEFF(in17, input); - LOAD_DQCOEFF(in25, input); - LOAD_DQCOEFF(in2, input); - LOAD_DQCOEFF(in10, input); - LOAD_DQCOEFF(in18, input); - LOAD_DQCOEFF(in26, input); - LOAD_DQCOEFF(in3, input); - LOAD_DQCOEFF(in11, input); - LOAD_DQCOEFF(in19, input); - LOAD_DQCOEFF(in27, input); - - LOAD_DQCOEFF(in4, input); - LOAD_DQCOEFF(in12, input); - LOAD_DQCOEFF(in20, input); - LOAD_DQCOEFF(in28, input); - LOAD_DQCOEFF(in5, input); - LOAD_DQCOEFF(in13, input); - LOAD_DQCOEFF(in21, input); - LOAD_DQCOEFF(in29, input); - LOAD_DQCOEFF(in6, input); - LOAD_DQCOEFF(in14, input); - LOAD_DQCOEFF(in22, input); - LOAD_DQCOEFF(in30, input); - LOAD_DQCOEFF(in7, input); - LOAD_DQCOEFF(in15, input); - LOAD_DQCOEFF(in23, input); - LOAD_DQCOEFF(in31, input); + LOAD_DQCOEFF(in[0], input); + LOAD_DQCOEFF(in[8], input); + LOAD_DQCOEFF(in[16], input); + LOAD_DQCOEFF(in[24], input); + LOAD_DQCOEFF(in[1], input); + LOAD_DQCOEFF(in[9], input); + LOAD_DQCOEFF(in[17], input); + LOAD_DQCOEFF(in[25], input); + LOAD_DQCOEFF(in[2], input); + LOAD_DQCOEFF(in[10], input); + LOAD_DQCOEFF(in[18], input); + LOAD_DQCOEFF(in[26], input); + LOAD_DQCOEFF(in[3], input); + LOAD_DQCOEFF(in[11], input); + LOAD_DQCOEFF(in[19], input); + LOAD_DQCOEFF(in[27], input); + + LOAD_DQCOEFF(in[4], input); + LOAD_DQCOEFF(in[12], input); + LOAD_DQCOEFF(in[20], input); + LOAD_DQCOEFF(in[28], input); + LOAD_DQCOEFF(in[5], input); + LOAD_DQCOEFF(in[13], input); + LOAD_DQCOEFF(in[21], input); + LOAD_DQCOEFF(in[29], input); + LOAD_DQCOEFF(in[6], input); + LOAD_DQCOEFF(in[14], input); + LOAD_DQCOEFF(in[22], input); + LOAD_DQCOEFF(in[30], input); + LOAD_DQCOEFF(in[7], input); + LOAD_DQCOEFF(in[15], input); + LOAD_DQCOEFF(in[23], input); + LOAD_DQCOEFF(in[31], input); // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in0, in1); - zero_idx[1] = _mm_or_si128(in2, in3); - zero_idx[2] = _mm_or_si128(in4, in5); - zero_idx[3] = _mm_or_si128(in6, in7); - zero_idx[4] = _mm_or_si128(in8, in9); - zero_idx[5] = _mm_or_si128(in10, in11); - zero_idx[6] = _mm_or_si128(in12, in13); - zero_idx[7] = _mm_or_si128(in14, in15); - zero_idx[8] = _mm_or_si128(in16, in17); - zero_idx[9] = _mm_or_si128(in18, in19); - zero_idx[10] = _mm_or_si128(in20, in21); - zero_idx[11] = _mm_or_si128(in22, in23); - zero_idx[12] = _mm_or_si128(in24, in25); - zero_idx[13] = _mm_or_si128(in26, in27); - zero_idx[14] = _mm_or_si128(in28, in29); - zero_idx[15] = _mm_or_si128(in30, in31); + zero_idx[0] = _mm_or_si128(in[0], in[1]); + zero_idx[1] = _mm_or_si128(in[2], in[3]); + zero_idx[2] = _mm_or_si128(in[4], in[5]); + zero_idx[3] = _mm_or_si128(in[6], in[7]); + zero_idx[4] = _mm_or_si128(in[8], in[9]); + zero_idx[5] = _mm_or_si128(in[10], in[11]); + zero_idx[6] = _mm_or_si128(in[12], in[13]); + 
zero_idx[7] = _mm_or_si128(in[14], in[15]); + zero_idx[8] = _mm_or_si128(in[16], in[17]); + zero_idx[9] = _mm_or_si128(in[18], in[19]); + zero_idx[10] = _mm_or_si128(in[20], in[21]); + zero_idx[11] = _mm_or_si128(in[22], in[23]); + zero_idx[12] = _mm_or_si128(in[24], in[25]); + zero_idx[13] = _mm_or_si128(in[26], in[27]); + zero_idx[14] = _mm_or_si128(in[28], in[29]); + zero_idx[15] = _mm_or_si128(in[30], in[31]); zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); @@ -3667,44 +3917,13 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, } // Transpose 32x8 block to 8x32 block - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, - in18, in19, in20, in21, in22, in23); - TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, - in26, in27, in28, in29, in30, in31); - } else { - // Second 1-D idct - j = i - 4; - - // Transpose 32x8 block to 8x32 block - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, - in5, in6, in7); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, - in11, in12, in13, in14, in15); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, - in19, in20, in21, in22, in23); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, - in28, in29, in30, in31); - } + array_transpose_8x8(in, in); + array_transpose_8x8(in+8, in+8); + array_transpose_8x8(in+16, in+16); + array_transpose_8x8(in+24, in+24); - IDCT32_1D + IDCT32 - // final stage - if (i < 4) { // 1_D: Store 32 intermediate results for each 8x32 block. col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); @@ -3738,146 +3957,156 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } else { + } + for (i = 0; i < 4; i++) { const __m128i zero = _mm_setzero_si128(); + // Second 1-D idct + j = i << 3; + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col+j, in); + array_transpose_8x8(col+j+32, in+8); + array_transpose_8x8(col+j+64, in+16); + array_transpose_8x8(col+j+96, in+24); + + IDCT32 // 2_D: Calculate the results and store them to destination. 
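The zero_idx OR-tree above folds all 32 coefficient vectors together so that an entirely zero 8x32 block can skip the transform; the reduction of the accumulated vector to the scalar zero_flag values is elided from this hunk. One way such a check can be completed with SSE2 (a sketch, not the code from the file):

#include <emmintrin.h>

/* Reduce an OR-accumulated vector to a scalar "all zero?" flag: compare
 * every byte against zero and require all sixteen comparisons to hold. */
static int block_is_zero(__m128i acc) {
  const __m128i zero = _mm_setzero_si128();
  return _mm_movemask_epi8(_mm_cmpeq_epi8(acc, zero)) == 0xffff;
}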
- in0 = _mm_add_epi16(stp1_0, stp1_31); - in1 = _mm_add_epi16(stp1_1, stp1_30); - in2 = _mm_add_epi16(stp1_2, stp1_29); - in3 = _mm_add_epi16(stp1_3, stp1_28); - in4 = _mm_add_epi16(stp1_4, stp1_27); - in5 = _mm_add_epi16(stp1_5, stp1_26); - in6 = _mm_add_epi16(stp1_6, stp1_25); - in7 = _mm_add_epi16(stp1_7, stp1_24); - in8 = _mm_add_epi16(stp1_8, stp1_23); - in9 = _mm_add_epi16(stp1_9, stp1_22); - in10 = _mm_add_epi16(stp1_10, stp1_21); - in11 = _mm_add_epi16(stp1_11, stp1_20); - in12 = _mm_add_epi16(stp1_12, stp1_19); - in13 = _mm_add_epi16(stp1_13, stp1_18); - in14 = _mm_add_epi16(stp1_14, stp1_17); - in15 = _mm_add_epi16(stp1_15, stp1_16); - in16 = _mm_sub_epi16(stp1_15, stp1_16); - in17 = _mm_sub_epi16(stp1_14, stp1_17); - in18 = _mm_sub_epi16(stp1_13, stp1_18); - in19 = _mm_sub_epi16(stp1_12, stp1_19); - in20 = _mm_sub_epi16(stp1_11, stp1_20); - in21 = _mm_sub_epi16(stp1_10, stp1_21); - in22 = _mm_sub_epi16(stp1_9, stp1_22); - in23 = _mm_sub_epi16(stp1_8, stp1_23); - in24 = _mm_sub_epi16(stp1_7, stp1_24); - in25 = _mm_sub_epi16(stp1_6, stp1_25); - in26 = _mm_sub_epi16(stp1_5, stp1_26); - in27 = _mm_sub_epi16(stp1_4, stp1_27); - in28 = _mm_sub_epi16(stp1_3, stp1_28); - in29 = _mm_sub_epi16(stp1_2, stp1_29); - in30 = _mm_sub_epi16(stp1_1, stp1_30); - in31 = _mm_sub_epi16(stp1_0, stp1_31); + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - in8 = _mm_adds_epi16(in8, final_rounding); - in9 = _mm_adds_epi16(in9, final_rounding); - in10 = _mm_adds_epi16(in10, final_rounding); - in11 = _mm_adds_epi16(in11, final_rounding); - in12 = _mm_adds_epi16(in12, final_rounding); - in13 = _mm_adds_epi16(in13, final_rounding); - in14 = _mm_adds_epi16(in14, final_rounding); - in15 = _mm_adds_epi16(in15, final_rounding); - in16 = _mm_adds_epi16(in16, final_rounding); - in17 = _mm_adds_epi16(in17, final_rounding); - in18 = 
_mm_adds_epi16(in18, final_rounding); - in19 = _mm_adds_epi16(in19, final_rounding); - in20 = _mm_adds_epi16(in20, final_rounding); - in21 = _mm_adds_epi16(in21, final_rounding); - in22 = _mm_adds_epi16(in22, final_rounding); - in23 = _mm_adds_epi16(in23, final_rounding); - in24 = _mm_adds_epi16(in24, final_rounding); - in25 = _mm_adds_epi16(in25, final_rounding); - in26 = _mm_adds_epi16(in26, final_rounding); - in27 = _mm_adds_epi16(in27, final_rounding); - in28 = _mm_adds_epi16(in28, final_rounding); - in29 = _mm_adds_epi16(in29, final_rounding); - in30 = _mm_adds_epi16(in30, final_rounding); - in31 = _mm_adds_epi16(in31, final_rounding); - - in0 = _mm_srai_epi16(in0, 6); - in1 = _mm_srai_epi16(in1, 6); - in2 = _mm_srai_epi16(in2, 6); - in3 = _mm_srai_epi16(in3, 6); - in4 = _mm_srai_epi16(in4, 6); - in5 = _mm_srai_epi16(in5, 6); - in6 = _mm_srai_epi16(in6, 6); - in7 = _mm_srai_epi16(in7, 6); - in8 = _mm_srai_epi16(in8, 6); - in9 = _mm_srai_epi16(in9, 6); - in10 = _mm_srai_epi16(in10, 6); - in11 = _mm_srai_epi16(in11, 6); - in12 = _mm_srai_epi16(in12, 6); - in13 = _mm_srai_epi16(in13, 6); - in14 = _mm_srai_epi16(in14, 6); - in15 = _mm_srai_epi16(in15, 6); - in16 = _mm_srai_epi16(in16, 6); - in17 = _mm_srai_epi16(in17, 6); - in18 = _mm_srai_epi16(in18, 6); - in19 = _mm_srai_epi16(in19, 6); - in20 = _mm_srai_epi16(in20, 6); - in21 = _mm_srai_epi16(in21, 6); - in22 = _mm_srai_epi16(in22, 6); - in23 = _mm_srai_epi16(in23, 6); - in24 = _mm_srai_epi16(in24, 6); - in25 = _mm_srai_epi16(in25, 6); - in26 = _mm_srai_epi16(in26, 6); - in27 = _mm_srai_epi16(in27, 6); - in28 = _mm_srai_epi16(in28, 6); - in29 = _mm_srai_epi16(in29, 6); - in30 = _mm_srai_epi16(in30, 6); - in31 = _mm_srai_epi16(in31, 6); - - RECON_AND_STORE(dest, in0); - RECON_AND_STORE(dest, in1); - RECON_AND_STORE(dest, in2); - RECON_AND_STORE(dest, in3); - RECON_AND_STORE(dest, in4); - RECON_AND_STORE(dest, in5); - RECON_AND_STORE(dest, in6); - RECON_AND_STORE(dest, in7); - RECON_AND_STORE(dest, in8); - RECON_AND_STORE(dest, in9); - RECON_AND_STORE(dest, in10); - RECON_AND_STORE(dest, in11); - RECON_AND_STORE(dest, in12); - RECON_AND_STORE(dest, in13); - RECON_AND_STORE(dest, in14); - RECON_AND_STORE(dest, in15); - RECON_AND_STORE(dest, in16); - RECON_AND_STORE(dest, in17); - RECON_AND_STORE(dest, in18); - RECON_AND_STORE(dest, in19); - RECON_AND_STORE(dest, in20); - RECON_AND_STORE(dest, in21); - RECON_AND_STORE(dest, in22); - RECON_AND_STORE(dest, in23); - RECON_AND_STORE(dest, in24); - RECON_AND_STORE(dest, in25); - RECON_AND_STORE(dest, in26); - RECON_AND_STORE(dest, in27); - RECON_AND_STORE(dest, in28); - RECON_AND_STORE(dest, in29); - RECON_AND_STORE(dest, in30); - RECON_AND_STORE(dest, in31); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + in[16] = _mm_adds_epi16(in[16], final_rounding); + 
in[17] = _mm_adds_epi16(in[17], final_rounding); + in[18] = _mm_adds_epi16(in[18], final_rounding); + in[19] = _mm_adds_epi16(in[19], final_rounding); + in[20] = _mm_adds_epi16(in[20], final_rounding); + in[21] = _mm_adds_epi16(in[21], final_rounding); + in[22] = _mm_adds_epi16(in[22], final_rounding); + in[23] = _mm_adds_epi16(in[23], final_rounding); + in[24] = _mm_adds_epi16(in[24], final_rounding); + in[25] = _mm_adds_epi16(in[25], final_rounding); + in[26] = _mm_adds_epi16(in[26], final_rounding); + in[27] = _mm_adds_epi16(in[27], final_rounding); + in[28] = _mm_adds_epi16(in[28], final_rounding); + in[29] = _mm_adds_epi16(in[29], final_rounding); + in[30] = _mm_adds_epi16(in[30], final_rounding); + in[31] = _mm_adds_epi16(in[31], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + in[16] = _mm_srai_epi16(in[16], 6); + in[17] = _mm_srai_epi16(in[17], 6); + in[18] = _mm_srai_epi16(in[18], 6); + in[19] = _mm_srai_epi16(in[19], 6); + in[20] = _mm_srai_epi16(in[20], 6); + in[21] = _mm_srai_epi16(in[21], 6); + in[22] = _mm_srai_epi16(in[22], 6); + in[23] = _mm_srai_epi16(in[23], 6); + in[24] = _mm_srai_epi16(in[24], 6); + in[25] = _mm_srai_epi16(in[25], 6); + in[26] = _mm_srai_epi16(in[26], 6); + in[27] = _mm_srai_epi16(in[27], 6); + in[28] = _mm_srai_epi16(in[28], 6); + in[29] = _mm_srai_epi16(in[29], 6); + in[30] = _mm_srai_epi16(in[30], 6); + in[31] = _mm_srai_epi16(in[31], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); + RECON_AND_STORE(dest, in[16]); + RECON_AND_STORE(dest, in[17]); + RECON_AND_STORE(dest, in[18]); + RECON_AND_STORE(dest, in[19]); + RECON_AND_STORE(dest, in[20]); + RECON_AND_STORE(dest, in[21]); + RECON_AND_STORE(dest, in[22]); + RECON_AND_STORE(dest, in[23]); + RECON_AND_STORE(dest, in[24]); + RECON_AND_STORE(dest, in[25]); + RECON_AND_STORE(dest, in[26]); + RECON_AND_STORE(dest, in[27]); + RECON_AND_STORE(dest, in[28]); + RECON_AND_STORE(dest, in[29]); + RECON_AND_STORE(dest, in[30]); + RECON_AND_STORE(dest, in[31]); dest += 8 - (stride * 32); } - } } //NOLINT void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_ssse3.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_ssse3.asm new file mode 100644 index 00000000000..2c1060710cc --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_ssse3.asm @@ -0,0 +1,300 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +%include "third_party/x86inc/x86inc.asm" + +; This file provides SSSE3 version of the inverse transformation. Part +; of the functions are originally derived from the ffmpeg project. +; Note that the current version applies to x86 64-bit only. + +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 +pw_16: times 8 dw 16 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 +%endmacro + +TRANSFORM_COEFFS 6270, 15137 +TRANSFORM_COEFFS 3196, 16069 +TRANSFORM_COEFFS 13623, 9102 + +%macro PAIR_PP_COEFFS 2 +dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 +%endmacro + +%macro PAIR_MP_COEFFS 2 +dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2 +%endmacro + +%macro PAIR_MM_COEFFS 2 +dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 +%endmacro + +PAIR_PP_COEFFS 30274, 12540 +PAIR_PP_COEFFS 6392, 32138 +PAIR_MP_COEFFS 18204, 27246 + +PAIR_PP_COEFFS 12540, 12540 +PAIR_PP_COEFFS 30274, 30274 +PAIR_PP_COEFFS 6392, 6392 +PAIR_PP_COEFFS 32138, 32138 +PAIR_MM_COEFFS 18204, 18204 +PAIR_PP_COEFFS 27246, 27246 + +SECTION .text + +%if ARCH_X86_64 +%macro SUM_SUB 3 + psubw m%3, m%1, m%2 + paddw m%1, m%2 + SWAP %2, %3 +%endmacro + +; butterfly operation +%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 + pmaddwd m%1, m%3, %5 + pmaddwd m%2, m%3, %6 + paddd m%1, %4 + paddd m%2, %4 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 + punpckhwd m%6, m%2, m%1 + MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] + punpcklwd m%2, m%1 + MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +; matrix transpose +%macro INTERLEAVE_2X 4 + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 + SWAP %3, %4 +%endmacro + +%macro TRANSPOSE8X8 9 + INTERLEAVE_2X wd, %1, %2, %9 + INTERLEAVE_2X wd, %3, %4, %9 + INTERLEAVE_2X wd, %5, %6, %9 + INTERLEAVE_2X wd, %7, %8, %9 + + INTERLEAVE_2X dq, %1, %3, %9 + INTERLEAVE_2X dq, %2, %4, %9 + INTERLEAVE_2X dq, %5, %7, %9 + INTERLEAVE_2X dq, %6, %8, %9 + + INTERLEAVE_2X qdq, %1, %5, %9 + INTERLEAVE_2X qdq, %3, %7, %9 + INTERLEAVE_2X qdq, %2, %6, %9 + INTERLEAVE_2X qdq, %4, %8, %9 + + SWAP %2, %5 + SWAP %4, %7 +%endmacro + +%macro IDCT8_1D 0 + SUM_SUB 0, 4, 9 + BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 + pmulhrsw m0, m12 + pmulhrsw m4, m12 + BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 + BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 + + SUM_SUB 1, 5, 9 + SUM_SUB 7, 3, 9 + SUM_SUB 0, 6, 9 + SUM_SUB 4, 2, 9 + SUM_SUB 3, 5, 9 + pmulhrsw m3, m12 + pmulhrsw m5, m12 + + SUM_SUB 0, 7, 9 + SUM_SUB 4, 3, 9 + SUM_SUB 2, 5, 9 + SUM_SUB 6, 1, 9 + + SWAP 3, 6 + SWAP 1, 4 +%endmacro + +; This macro handles 8 pixels per line +%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero + paddw m%1, m11 + paddw m%2, m11 + psraw m%1, 5 + psraw m%2, 5 + + movh m%3, [outputq] + movh m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%3, m%1 + paddw m%4, m%2 + packuswb m%3, m%5 + packuswb m%4, m%5 + movh [outputq], m%3 + movh [outputq + strideq], m%4 +%endmacro + +INIT_XMM ssse3 +; full inverse 8x8 2D-DCT transform +cglobal idct8x8_64_add, 3, 5, 13, input, output, 
stride + mova m8, [pd_8192] + mova m11, [pw_16] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] + + mova m0, [inputq + 0] + mova m1, [inputq + 16] + mova m2, [inputq + 32] + mova m3, [inputq + 48] + mova m4, [inputq + 64] + mova m5, [inputq + 80] + mova m6, [inputq + 96] + mova m7, [inputq + 112] + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + IDCT8_1D + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + IDCT8_1D + + pxor m12, m12 + ADD_STORE_8P_2X 0, 1, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 2, 3, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 4, 5, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 6, 7, 9, 10, 12 + + RET + +; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero +cglobal idct8x8_12_add, 3, 5, 13, input, output, stride + mova m8, [pd_8192] + mova m11, [pw_16] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] + + mova m0, [inputq + 0] + mova m1, [inputq + 16] + mova m2, [inputq + 32] + mova m3, [inputq + 48] + + punpcklwd m0, m1 + punpcklwd m2, m3 + punpckhdq m9, m0, m2 + punpckldq m0, m2 + SWAP 2, 9 + + ; m0 -> [0], [0] + ; m1 -> [1], [1] + ; m2 -> [2], [2] + ; m3 -> [3], [3] + punpckhqdq m10, m0, m0 + punpcklqdq m0, m0 + punpckhqdq m9, m2, m2 + punpcklqdq m2, m2 + SWAP 1, 10 + SWAP 3, 9 + + pmulhrsw m0, m12 + pmulhrsw m2, [dpw_30274_12540] + pmulhrsw m1, [dpw_6392_32138] + pmulhrsw m3, [dpw_m18204_27246] + + SUM_SUB 0, 2, 9 + SUM_SUB 1, 3, 9 + + punpcklqdq m9, m3, m3 + punpckhqdq m5, m3, m9 + + SUM_SUB 3, 5, 9 + punpckhqdq m5, m3 + pmulhrsw m5, m12 + + punpckhqdq m9, m1, m5 + punpcklqdq m1, m5 + SWAP 5, 9 + + SUM_SUB 0, 5, 9 + SUM_SUB 2, 1, 9 + + punpckhqdq m3, m0, m0 + punpckhqdq m4, m1, m1 + punpckhqdq m6, m5, m5 + punpckhqdq m7, m2, m2 + + punpcklwd m0, m3 + punpcklwd m7, m2 + punpcklwd m1, m4 + punpcklwd m6, m5 + + punpckhdq m4, m0, m7 + punpckldq m0, m7 + punpckhdq m10, m1, m6 + punpckldq m5, m1, m6 + + punpckhqdq m1, m0, m5 + punpcklqdq m0, m5 + punpckhqdq m3, m4, m10 + punpcklqdq m2, m4, m10 + + + pmulhrsw m0, m12 + pmulhrsw m6, m2, [dpw_30274_30274] + pmulhrsw m4, m2, [dpw_12540_12540] + + pmulhrsw m7, m1, [dpw_32138_32138] + pmulhrsw m1, [dpw_6392_6392] + pmulhrsw m5, m3, [dpw_m18204_m18204] + pmulhrsw m3, [dpw_27246_27246] + + mova m2, m0 + SUM_SUB 0, 6, 9 + SUM_SUB 2, 4, 9 + SUM_SUB 1, 5, 9 + SUM_SUB 7, 3, 9 + + SUM_SUB 3, 5, 9 + pmulhrsw m3, m12 + pmulhrsw m5, m12 + + SUM_SUB 0, 7, 9 + SUM_SUB 2, 3, 9 + SUM_SUB 4, 5, 9 + SUM_SUB 6, 1, 9 + + SWAP 3, 6 + SWAP 1, 2 + SWAP 2, 4 + + + pxor m12, m12 + ADD_STORE_8P_2X 0, 1, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 2, 3, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 4, 5, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 6, 7, 9, 10, 12 + + RET + +%endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c new file mode 100644 index 00000000000..439c028f29d --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c @@ -0,0 +1,943 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> /* AVX2 */ + +static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + const __m128i thresh = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _thresh[0])); + const __m128i limit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _limit[0])); + const __m128i blimit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _blimit[0])); + + q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), + _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), + _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), + _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), + _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), + _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* 
(vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128( + _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), + _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), + _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), + (__m64 *) (s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), + (__m64 *) (s + 6 * p))); + + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), + _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), + _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), + (__m64 *) (s + 7 * p))); + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), + _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), + _mm_subs_epu8(q0p0, q7p7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = 
_mm_add_epi16(p0_16, + _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, + _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = _mm_add_epi16(eight, + _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16(four, + _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), + 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), + 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), + 4); + flat2_q4p4 = 
_mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), + 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0)); + } +} + +static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + + 
const __m128i thresh = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _thresh[0])); + const __m128i limit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _limit[0])); + const __m128i blimit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _blimit[0])); + + p4 = _mm_loadu_si128((__m128i *) (s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *) (s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *) (s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *) (s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *) (s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *) (s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *) (s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *) (s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *) (s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *) (s + 4 * p)); + + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, + flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, + flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, + flat_q2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = 
+
+    // lp filter
+    {
+        const __m128i t4 = _mm_set1_epi8(4);
+        const __m128i t3 = _mm_set1_epi8(3);
+        const __m128i t80 = _mm_set1_epi8(0x80);
+        const __m128i te0 = _mm_set1_epi8(0xe0);
+        const __m128i t1f = _mm_set1_epi8(0x1f);
+        const __m128i t1 = _mm_set1_epi8(0x1);
+        const __m128i t7f = _mm_set1_epi8(0x7f);
+
+        __m128i ps1 = _mm_xor_si128(p1, t80);
+        __m128i ps0 = _mm_xor_si128(p0, t80);
+        __m128i qs0 = _mm_xor_si128(q0, t80);
+        __m128i qs1 = _mm_xor_si128(q1, t80);
+        __m128i filt;
+        __m128i work_a;
+        __m128i filter1, filter2;
+        __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+                flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
+                flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0,
+                flat_q1, flat_q2;
+
+        filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+        work_a = _mm_subs_epi8(qs0, ps0);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        filt = _mm_and_si128(filt, mask);
+
+        filter1 = _mm_adds_epi8(filt, t4);
+        filter2 = _mm_adds_epi8(filt, t3);
+
+        /* Filter1 >> 3 */
+        work_a = _mm_cmpgt_epi8(zero, filter1);
+        filter1 = _mm_srli_epi16(filter1, 3);
+        work_a = _mm_and_si128(work_a, te0);
+        filter1 = _mm_and_si128(filter1, t1f);
+        filter1 = _mm_or_si128(filter1, work_a);
+        qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+        /* Filter2 >> 3 */
+        work_a = _mm_cmpgt_epi8(zero, filter2);
+        filter2 = _mm_srli_epi16(filter2, 3);
+        work_a = _mm_and_si128(work_a, te0);
+        filter2 = _mm_and_si128(filter2, t1f);
+        filter2 = _mm_or_si128(filter2, work_a);
+        ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+        /* filt >> 1 */
+        filt = _mm_adds_epi8(filter1, t1);
+        work_a = _mm_cmpgt_epi8(zero, filt);
+        filt = _mm_srli_epi16(filt, 1);
+        work_a = _mm_and_si128(work_a, t80);
+        filt = _mm_and_si128(filt, t7f);
+        filt = _mm_or_si128(filt, work_a);
+        filt = _mm_andnot_si128(hev, filt);
+        ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+        qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+        // loopfilter done
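The narrow filter works on pixels re-biased to the signed range: the XOR with t80 flips bit 7, mapping 0..255 onto -128..127 so that saturating signed adds apply, and the te0/t1f masking rebuilds an 8-bit arithmetic shift right by 3 out of the 16-bit logical shift (SSE2 has no byte-wide shift). A one-column scalar sketch of the update, with the saturation collapsed into a single clamp for brevity — the SIMD code saturates at every add, so this is an approximation, not the exact arithmetic:

#include <stdint.h>

static int8_t clamp8(int v) {
  return (int8_t) (v < -128 ? -128 : v > 127 ? 127 : v);
}

// Sketch of the 4-tap update above; hev and mask are 0 or 1 per column.
static void filter4_scalar(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                           int hev, int mask) {
  int8_t ps1 = (int8_t) (*p1 ^ 0x80), ps0 = (int8_t) (*p0 ^ 0x80);
  int8_t qs0 = (int8_t) (*q0 ^ 0x80), qs1 = (int8_t) (*q1 ^ 0x80);
  int8_t filt = mask ? clamp8((hev ? clamp8(ps1 - qs1) : 0) + 3 * (qs0 - ps0))
                     : 0;
  int8_t filter1 = (int8_t) (clamp8(filt + 4) >> 3);  // Filter1 >> 3
  int8_t filter2 = (int8_t) (clamp8(filt + 3) >> 3);  // Filter2 >> 3
  *q0 = (uint8_t) (clamp8(qs0 - filter1) ^ 0x80);
  *p0 = (uint8_t) (clamp8(ps0 + filter2) ^ 0x80);
  filt = hev ? 0 : (int8_t) ((filter1 + 1) >> 1);     // filt >> 1, off on hev
  *q1 = (uint8_t) (clamp8(qs1 - filt) ^ 0x80);
  *p1 = (uint8_t) (clamp8(ps1 + filt) ^ 0x80);
}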
+
+        {
+            __m128i work;
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+                    _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+            flat = _mm_max_epu8(work, flat);
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+                    _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+            flat = _mm_max_epu8(work, flat);
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+                    _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+            flat = _mm_subs_epu8(flat, one);
+            flat = _mm_cmpeq_epi8(flat, zero);
+            flat = _mm_and_si128(flat, mask);
+
+            p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
+            q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+            flat2 = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+                    _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+            p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
+            q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+                    _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+
+            p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
+            q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+                    _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+            flat2 = _mm_subs_epu8(flat2, one);
+            flat2 = _mm_cmpeq_epi8(flat2, zero);
+            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+        }
+
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+        // flat and wide flat calculations
+        {
+            const __m256i eight = _mm256_set1_epi16(8);
+            const __m256i four = _mm256_set1_epi16(4);
+            __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+                    q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+                    p256_0, q256_0;
+            __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+                    pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
+                    res_q;
+
+            p256_7 = _mm256_cvtepu8_epi16(p7);
+            p256_6 = _mm256_cvtepu8_epi16(p6);
+            p256_5 = _mm256_cvtepu8_epi16(p5);
+            p256_4 = _mm256_cvtepu8_epi16(p4);
+            p256_3 = _mm256_cvtepu8_epi16(p3);
+            p256_2 = _mm256_cvtepu8_epi16(p2);
+            p256_1 = _mm256_cvtepu8_epi16(p1);
+            p256_0 = _mm256_cvtepu8_epi16(p0);
+            q256_0 = _mm256_cvtepu8_epi16(q0);
+            q256_1 = _mm256_cvtepu8_epi16(q1);
+            q256_2 = _mm256_cvtepu8_epi16(q2);
+            q256_3 = _mm256_cvtepu8_epi16(q3);
+            q256_4 = _mm256_cvtepu8_epi16(q4);
+            q256_5 = _mm256_cvtepu8_epi16(q5);
+            q256_6 = _mm256_cvtepu8_epi16(q6);
+            q256_7 = _mm256_cvtepu8_epi16(q7);
+
+            pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+                    _mm256_add_epi16(p256_4, p256_3));
+            pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+                    _mm256_add_epi16(q256_4, q256_3));
+
+            pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
+                    _mm256_add_epi16(p256_2, p256_1));
+            pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+            pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
+                    _mm256_add_epi16(q256_2, q256_1));
+            pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+
+            pixelFilter_p = _mm256_add_epi16(eight,
+                    _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+            pixetFilter_p2p1p0 = _mm256_add_epi16(four,
+                    _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(p256_7, p256_0)), 4);
+
+            flat2_p0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(q256_7, q256_0)), 4);
+
+            flat2_q0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_p2p1p0,
+                            _mm256_add_epi16(p256_3, p256_0)), 3);
+
+            flat_p0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_p2p1p0,
+                            _mm256_add_epi16(q256_3, q256_0)), 3);
+
+            flat_q0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+            sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+            sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+            flat2_p1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+            flat2_q1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+
+            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_p2p1p0,
+                            _mm256_add_epi16(sum_p3, p256_1)), 3);
+
+            flat_p1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_q2q1q0,
+                            _mm256_add_epi16(sum_q3, q256_1)), 3);
+
+            flat_q1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+            sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+            flat2_p2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+            flat2_q2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+
+            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_p2p1p0,
+                            _mm256_add_epi16(sum_p3, p256_2)), 3);
+
+            flat_p2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_q2q1q0,
+                            _mm256_add_epi16(sum_q3, q256_2)), 3);
+
+            flat_q2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+            flat2_p3 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+            flat2_q3 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+            flat2_p4 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+            flat2_q4 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+            flat2_p5 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+            flat2_q5 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+            flat2_p6 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+            flat2_q6 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+        }
+
+        // wide flat
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
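The 256-bit section above widens the 16 bytes of each row to 16-bit lanes and keeps two running window sums — pixelFilter_p/_q for the 15-tap wide filter and pixetFilter_p2p1p0/_q2q1q0 for the 7-tap flat filter — so each output tap costs a couple of adds and subtracts instead of a full re-summation; packus plus permute4x64 then squeezes the 16-bit results back to bytes. The direct, non-incremental form of the wide filter for one column looks like this (a sketch built from the standard VP9 taps, not code from the patch):

#include <stdint.h>

#define RPOT(x, n) (((x) + (1 << ((n) - 1))) >> (n))  // round-power-of-two

// pix[0..15] = p7..p0,q0..q7 for one column; op[i]/oq[i] receive the new
// p_i/q_i.  The AVX2 block computes the same 16-sample sums incrementally.
static void wide_filter_column(const uint8_t *pix, uint8_t *op, uint8_t *oq) {
  int i, j;
  for (i = 0; i < 7; ++i) {
    int sum_p = pix[0] * (i + 1) + pix[7 - i];   // (i+1)*p7, and p_i twice
    int sum_q = pix[15] * (i + 1) + pix[8 + i];  // (i+1)*q7, and q_i twice
    for (j = 1; j <= 8 + (6 - i); ++j) sum_p += pix[j];    // p6 .. q(6-i)
    for (j = 14; j >= 1 + i; --j) sum_q += pix[j];         // q6 .. p(6-i)
    op[i] = (uint8_t) RPOT(sum_p, 4);
    oq[i] = (uint8_t) RPOT(sum_q, 4);
  }
}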
+
+        p2 = _mm_andnot_si128(flat, p2);
+        flat_p2 = _mm_and_si128(flat, flat_p2);
+        p2 = _mm_or_si128(flat_p2, p2);
+
+        p1 = _mm_andnot_si128(flat, ps1);
+        flat_p1 = _mm_and_si128(flat, flat_p1);
+        p1 = _mm_or_si128(flat_p1, p1);
+
+        p0 = _mm_andnot_si128(flat, ps0);
+        flat_p0 = _mm_and_si128(flat, flat_p0);
+        p0 = _mm_or_si128(flat_p0, p0);
+
+        q0 = _mm_andnot_si128(flat, qs0);
+        flat_q0 = _mm_and_si128(flat, flat_q0);
+        q0 = _mm_or_si128(flat_q0, q0);
+
+        q1 = _mm_andnot_si128(flat, qs1);
+        flat_q1 = _mm_and_si128(flat, flat_q1);
+        q1 = _mm_or_si128(flat_q1, q1);
+
+        q2 = _mm_andnot_si128(flat, q2);
+        flat_q2 = _mm_and_si128(flat, flat_q2);
+        q2 = _mm_or_si128(flat_q2, q2);
+
+        p6 = _mm_andnot_si128(flat2, p6);
+        flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+        p6 = _mm_or_si128(flat2_p6, p6);
+        _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
+
+        p5 = _mm_andnot_si128(flat2, p5);
+        flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+        p5 = _mm_or_si128(flat2_p5, p5);
+        _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
+
+        p4 = _mm_andnot_si128(flat2, p4);
+        flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+        p4 = _mm_or_si128(flat2_p4, p4);
+        _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
+
+        p3 = _mm_andnot_si128(flat2, p3);
+        flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+        p3 = _mm_or_si128(flat2_p3, p3);
+        _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
+
+        p2 = _mm_andnot_si128(flat2, p2);
+        flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+        p2 = _mm_or_si128(flat2_p2, p2);
+        _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
+
+        p1 = _mm_andnot_si128(flat2, p1);
+        flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+        p1 = _mm_or_si128(flat2_p1, p1);
+        _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
+
+        p0 = _mm_andnot_si128(flat2, p0);
+        flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+        p0 = _mm_or_si128(flat2_p0, p0);
+        _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
+
+        q0 = _mm_andnot_si128(flat2, q0);
+        flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+        q0 = _mm_or_si128(flat2_q0, q0);
+        _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
+
+        q1 = _mm_andnot_si128(flat2, q1);
+        flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+        q1 = _mm_or_si128(flat2_q1, q1);
+        _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
+
+        q2 = _mm_andnot_si128(flat2, q2);
+        flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+        q2 = _mm_or_si128(flat2_q2, q2);
+        _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
+
+        q3 = _mm_andnot_si128(flat2, q3);
+        flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+        q3 = _mm_or_si128(flat2_q3, q3);
+        _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
+
+        q4 = _mm_andnot_si128(flat2, q4);
+        flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+        q4 = _mm_or_si128(flat2_q4, q4);
+        _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
+
+        q5 = _mm_andnot_si128(flat2, q5);
+        flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+        q5 = _mm_or_si128(flat2_q5, q5);
+        _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
+
+        q6 = _mm_andnot_si128(flat2, q6);
+        flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+        q6 = _mm_or_si128(flat2_q6, q6);
+        _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
+    }
+}
+
+void vp9_lpf_horizontal_16_avx2(unsigned char *s, int p,
+        const unsigned char *_blimit, const unsigned char *_limit,
+        const unsigned char *_thresh, int count) {
+    if (count == 1)
+        mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
+    else
+        mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
+}
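Throughout the store-out section the andnot/and/or triple is SSE2's branch-free byte select, result = (mask & a) | (~mask & b); SSE4.1's _mm_blendv_epi8 does the same in one instruction, but the three-op form needs nothing beyond SSE2. As a standalone helper (illustrative, not from the patch):

#include <emmintrin.h>

// Branch-free select: per byte, take a where mask is 0xff, else b.
static __m128i select_bytes(__m128i mask, __m128i a, __m128i b) {
  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}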
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index fa4dd9bcb8d..448ad5af7fc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <emmintrin.h>  /* SSE2 */
+#include <emmintrin.h>  // SSE2
 
 #include "vp9/common/vp9_loopfilter.h"
 #include "vpx_ports/emmintrin_compat.h"
@@ -17,20 +17,14 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
                                             const unsigned char *_blimit,
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh) {
-  __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
+  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i mask, hev, flat, flat2;
   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
   __m128i abs_p1p0;
-  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
-  const unsigned int extended_limit = _limit[0] * 0x01010101u;
-  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
-  const __m128i thresh =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
-  const __m128i limit =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
-  const __m128i blimit =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
 
   q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
   q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
@@ -105,7 +99,7 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -116,11 +110,11 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
     filter2 = _mm_unpacklo_epi8(zero, filter2);
     filter2 = _mm_srai_epi16(filter2, 0xB);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
     qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi16(filter1, t1);
     filt = _mm_srai_epi16(filt, 1);
     filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
@@ -375,32 +369,25 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
                                              const unsigned char *_blimit,
                                              const unsigned char *_limit,
                                              const unsigned char *_thresh) {
-  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
-  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);
-
-  DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16);
 
-  DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
-  DECLARE_ALIGNED(16, unsigned char, aq[8][16]);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16);
 
-  __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
+  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i mask, hev, flat, flat2;
   __m128i p7, p6, p5;
   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
   __m128i q5, q6, q7;
   int i = 0;
-  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
-  const unsigned int extended_limit = _limit[0] * 0x01010101u;
-  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
-  const __m128i thresh =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
-  const __m128i limit =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
-  const __m128i blimit =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
 
   p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
@@ -413,16 +400,16 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
   q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
 
-  _mm_store_si128((__m128i *)ap[4], p4);
-  _mm_store_si128((__m128i *)ap[3], p3);
-  _mm_store_si128((__m128i *)ap[2], p2);
-  _mm_store_si128((__m128i *)ap[1], p1);
-  _mm_store_si128((__m128i *)ap[0], p0);
-  _mm_store_si128((__m128i *)aq[4], q4);
-  _mm_store_si128((__m128i *)aq[3], q3);
-  _mm_store_si128((__m128i *)aq[2], q2);
-  _mm_store_si128((__m128i *)aq[1], q1);
-  _mm_store_si128((__m128i *)aq[0], q0);
+  _mm_store_si128((__m128i *)&ap[4 * 16], p4);
+  _mm_store_si128((__m128i *)&ap[3 * 16], p3);
+  _mm_store_si128((__m128i *)&ap[2 * 16], p2);
+  _mm_store_si128((__m128i *)&ap[1 * 16], p1);
+  _mm_store_si128((__m128i *)&ap[0 * 16], p0);
+  _mm_store_si128((__m128i *)&aq[4 * 16], q4);
+  _mm_store_si128((__m128i *)&aq[3 * 16], q3);
+  _mm_store_si128((__m128i *)&aq[2 * 16], q2);
+  _mm_store_si128((__m128i *)&aq[1 * 16], q1);
+  _mm_store_si128((__m128i *)&aq[0 * 16], q0);
 
   {
@@ -486,13 +473,13 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter1);
     filter1 = _mm_srli_epi16(filter1, 3);
     work_a = _mm_and_si128(work_a, te0);
@@ -500,7 +487,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
     filter1 = _mm_or_si128(filter1, work_a);
     qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
 
-    /* Filter2 >> 3 */
+    // Filter2 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter2);
     filter2 = _mm_srli_epi16(filter2, 3);
     work_a = _mm_and_si128(work_a, te0);
@@ -508,7 +495,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
     filter2 = _mm_or_si128(filter2, work_a);
     ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi8(filter1, t1);
     work_a = _mm_cmpgt_epi8(zero, filt);
     filt = _mm_srli_epi16(filt, 1);
@@ -546,8 +533,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
                                          _mm_subs_epu8(p0, p5)),
                             _mm_or_si128(_mm_subs_epu8(q5, q0),
                                          _mm_subs_epu8(q0, q5)));
-      _mm_store_si128((__m128i *)ap[5], p5);
-      _mm_store_si128((__m128i *)aq[5], q5);
+      _mm_store_si128((__m128i *)&ap[5 * 16], p5);
+      _mm_store_si128((__m128i *)&aq[5 * 16], q5);
       flat2 = _mm_max_epu8(work, flat2);
       p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
       q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
@@ -555,8 +542,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
                                          _mm_subs_epu8(p0, p6)),
                             _mm_or_si128(_mm_subs_epu8(q6, q0),
                                          _mm_subs_epu8(q0, q6)));
-      _mm_store_si128((__m128i *)ap[6], p6);
-      _mm_store_si128((__m128i *)aq[6], q6);
+      _mm_store_si128((__m128i *)&ap[6 * 16], p6);
+      _mm_store_si128((__m128i *)&aq[6 * 16], q6);
       flat2 = _mm_max_epu8(work, flat2);
       p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
@@ -565,8 +552,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
                                          _mm_subs_epu8(p0, p7)),
                             _mm_or_si128(_mm_subs_epu8(q7, q0),
                                          _mm_subs_epu8(q0, q7)));
-      _mm_store_si128((__m128i *)ap[7], p7);
-      _mm_store_si128((__m128i *)aq[7], q7);
+      _mm_store_si128((__m128i *)&ap[7 * 16], p7);
+      _mm_store_si128((__m128i *)&aq[7 * 16], q7);
       flat2 = _mm_max_epu8(work, flat2);
       flat2 = _mm_subs_epu8(flat2, one);
       flat2 = _mm_cmpeq_epi8(flat2, zero);
@@ -586,22 +573,38 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
       __m128i a, b, c;
       unsigned int off = i * 8;
-      p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
-      p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
-      p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
-      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
-      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
-      q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
-      q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
-      q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);
+      p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)),
+                             zero);
+      p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)),
+                             zero);
+      p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)),
+                             zero);
+      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)),
+                             zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)),
+                             zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)),
+                             zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)),
+                             zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)),
+                             zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)),
+                             zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)),
+                             zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)),
+                             zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)),
+                             zero);
+      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)),
+                             zero);
+      q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)),
+                             zero);
+      q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)),
+                             zero);
+      q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)),
+                             zero);
 
       c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
       c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
@@ -610,117 +613,117 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
       a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
       a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
 
-      _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
+      _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8],
                        _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                         , b));
 
       c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       a = _mm_add_epi16(q1, a);
       b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
-      _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
+      _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8],
                        _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                         , b));
 
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       a = _mm_add_epi16(q2, a);
      b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
-      _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
+      _mm_storel_epi64((__m128i *)&flat_op[i * 8],
                        _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                         , b));
 
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       a = _mm_add_epi16(q3, a);
       b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
-      _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
+      _mm_storel_epi64((__m128i *)&flat_oq[i * 8],
                        _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                         , b));
 
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       b = _mm_add_epi16(q3, b);
       b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
-      _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
+      _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8],
                        _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                         , b));
 
       c = _mm_add_epi16(q4, c);
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       b = _mm_add_epi16(q3, b);
       b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
-      _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
+      _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8],
                        _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                         , b));
 
       a = _mm_add_epi16(q5, a);
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       a = _mm_add_epi16(q6, a);
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_op[i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       a = _mm_add_epi16(q7, a);
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_oq[i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       a = _mm_add_epi16(q7, a);
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       a = _mm_add_epi16(q7, a);
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));
 
       a = _mm_add_epi16(q7, a);
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       a = _mm_add_epi16(q7, a);
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       a = _mm_add_epi16(q7, a);
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       a = _mm_add_epi16(q7, a);
       c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
+      _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       temp_flat2 = _mm_srli_si128(temp_flat2, 8);
@@ -730,51 +733,51 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
     // wide flat
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-    work_a = _mm_load_si128((__m128i *)ap[2]);
-    p2 = _mm_load_si128((__m128i *)flat_op[2]);
+    work_a = _mm_load_si128((__m128i *)&ap[2 * 16]);
+    p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
     work_a = _mm_andnot_si128(flat, work_a);
     p2 = _mm_and_si128(flat, p2);
     p2 = _mm_or_si128(work_a, p2);
-    _mm_store_si128((__m128i *)flat_op[2], p2);
+    _mm_store_si128((__m128i *)&flat_op[2 * 16], p2);
 
-    p1 = _mm_load_si128((__m128i *)flat_op[1]);
+    p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
     work_a = _mm_andnot_si128(flat, ps1);
     p1 = _mm_and_si128(flat, p1);
     p1 = _mm_or_si128(work_a, p1);
-    _mm_store_si128((__m128i *)flat_op[1], p1);
+    _mm_store_si128((__m128i *)&flat_op[1 * 16], p1);
 
-    p0 = _mm_load_si128((__m128i *)flat_op[0]);
+    p0 = _mm_load_si128((__m128i *)&flat_op[0]);
     work_a = _mm_andnot_si128(flat, ps0);
     p0 = _mm_and_si128(flat, p0);
     p0 = _mm_or_si128(work_a, p0);
-    _mm_store_si128((__m128i *)flat_op[0], p0);
+    _mm_store_si128((__m128i *)&flat_op[0], p0);
 
-    q0 = _mm_load_si128((__m128i *)flat_oq[0]);
+    q0 = _mm_load_si128((__m128i *)&flat_oq[0]);
     work_a = _mm_andnot_si128(flat, qs0);
     q0 = _mm_and_si128(flat, q0);
     q0 = _mm_or_si128(work_a, q0);
-    _mm_store_si128((__m128i *)flat_oq[0], q0);
+    _mm_store_si128((__m128i *)&flat_oq[0], q0);
 
-    q1 = _mm_load_si128((__m128i *)flat_oq[1]);
+    q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
     work_a = _mm_andnot_si128(flat, qs1);
     q1 = _mm_and_si128(flat, q1);
     q1 = _mm_or_si128(work_a, q1);
-    _mm_store_si128((__m128i *)flat_oq[1], q1);
+    _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1);
 
-    work_a = _mm_load_si128((__m128i *)aq[2]);
-    q2 = _mm_load_si128((__m128i *)flat_oq[2]);
+    work_a = _mm_load_si128((__m128i *)&aq[2 * 16]);
+    q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
     work_a = _mm_andnot_si128(flat, work_a);
     q2 = _mm_and_si128(flat, q2);
     q2 = _mm_or_si128(work_a, q2);
-    _mm_store_si128((__m128i *)flat_oq[2], q2);
+    _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2);
 
     // write out op6 - op3
     {
       unsigned char *dst = (s - 7 * p);
       for (i = 6; i > 2; i--) {
         __m128i flat2_output;
-        work_a = _mm_load_si128((__m128i *)ap[i]);
-        flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
+        work_a = _mm_load_si128((__m128i *)&ap[i * 16]);
+        flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]);
         work_a = _mm_andnot_si128(flat2, work_a);
         flat2_output = _mm_and_si128(flat2, flat2_output);
         work_a = _mm_or_si128(work_a, flat2_output);
@@ -783,43 +786,43 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
       }
     }
 
-    work_a = _mm_load_si128((__m128i *)flat_op[2]);
-    p2 = _mm_load_si128((__m128i *)flat2_op[2]);
+    work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
+    p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]);
     work_a = _mm_andnot_si128(flat2, work_a);
     p2 = _mm_and_si128(flat2, p2);
     p2 = _mm_or_si128(work_a, p2);
     _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
 
-    work_a = _mm_load_si128((__m128i *)flat_op[1]);
-    p1 = _mm_load_si128((__m128i *)flat2_op[1]);
+    work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
+    p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]);
     work_a = _mm_andnot_si128(flat2, work_a);
     p1 = _mm_and_si128(flat2, p1);
     p1 = _mm_or_si128(work_a, p1);
     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
 
-    work_a = _mm_load_si128((__m128i *)flat_op[0]);
-    p0 = _mm_load_si128((__m128i *)flat2_op[0]);
+    work_a = _mm_load_si128((__m128i *)&flat_op[0]);
+    p0 = _mm_load_si128((__m128i *)&flat2_op[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
     p0 = _mm_and_si128(flat2, p0);
     p0 = _mm_or_si128(work_a, p0);
     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
 
-    work_a = _mm_load_si128((__m128i *)flat_oq[0]);
-    q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
+    work_a = _mm_load_si128((__m128i *)&flat_oq[0]);
+    q0 = _mm_load_si128((__m128i *)&flat2_oq[0]);
     work_a = _mm_andnot_si128(flat2, work_a);
     q0 = _mm_and_si128(flat2, q0);
     q0 = _mm_or_si128(work_a, q0);
     _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
 
-    work_a = _mm_load_si128((__m128i *)flat_oq[1]);
-    q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
+    work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
+    q1 = _mm_load_si128((__m128i *)&flat2_oq[16]);
     work_a = _mm_andnot_si128(flat2, work_a);
     q1 = _mm_and_si128(flat2, q1);
     q1 = _mm_or_si128(work_a, q1);
     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
 
-    work_a = _mm_load_si128((__m128i *)flat_oq[2]);
-    q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
+    work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
+    q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]);
     work_a = _mm_andnot_si128(flat2, work_a);
     q2 = _mm_and_si128(flat2, q2);
     q2 = _mm_or_si128(work_a, q2);
@@ -830,8 +833,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
       unsigned char *dst = (s + 3 * p);
       for (i = 3; i < 7; i++) {
         __m128i flat2_output;
-        work_a = _mm_load_si128((__m128i *)aq[i]);
-        flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
+        work_a = _mm_load_si128((__m128i *)&aq[i * 16]);
+        flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]);
         work_a = _mm_andnot_si128(flat2, work_a);
         flat2_output = _mm_and_si128(flat2, flat2_output);
         work_a = _mm_or_si128(work_a, flat2_output);
@@ -842,52 +845,275 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
   }
 }
 
-void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
-                                       int p,
-                                       const unsigned char *_blimit,
-                                       const unsigned char *_limit,
-                                       const unsigned char *_thresh,
-                                       int count) {
+// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
+void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
+                                const unsigned char *_limit,
+                                const unsigned char *_thresh, int count) {
   if (count == 1)
     mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
   else
     mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
 }
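The vp9_lpf_horizontal_8_sse2 rewrite that follows packs a p-row and its matching q-row into a single register (q3p3, q2p2, ...): the p side sits in the low 64 bits and the q side in the high 64 bits, so each instruction filters both sides of the edge at once, and _mm_shuffle_epi32 with immediate 78 (0b01001110) swaps the halves when the opposite pairing is needed. A sketch of the load, with an illustrative helper name not taken from the patch:

#include <emmintrin.h>
#include <stdint.h>

// Build one "qXpX" register: the row at (s + lo * pitch) fills the low
// half, the row at (s + hi * pitch) the high half.
static __m128i fold_rows(const uint8_t *s, int pitch, int lo, int hi) {
  return _mm_unpacklo_epi64(
      _mm_loadl_epi64((const __m128i *) (s + lo * pitch)),
      _mm_loadl_epi64((const __m128i *) (s + hi * pitch)));
}
// e.g. q3p3 = fold_rows(s, p, -4, 3);  matches the code below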
 
-void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
-                                            int p,
-                                            const unsigned char *_blimit,
-                                            const unsigned char *_limit,
-                                            const unsigned char *_thresh,
-                                            int count) {
-  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  __m128i mask, hev, flat;
+void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
+                               const unsigned char *_blimit,
+                               const unsigned char *_limit,
+                               const unsigned char *_thresh, int count) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
   const __m128i zero = _mm_set1_epi16(0);
+  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i mask, hev, flat;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
-  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
-  const unsigned int extended_limit = _limit[0] * 0x01010101u;
-  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
-  const __m128i thresh =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
-  const __m128i limit =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
-  const __m128i blimit =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
+  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
   (void)count;
-  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+
+  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+  p1q1 = _mm_shuffle_epi32(q1p1, 78);
+  p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+  {
+    // filter_mask and hev_mask
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+                            _mm_subs_epu8(q0p0, q1p1));
+    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+                            _mm_subs_epu8(p0q0, q0p0));
+    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+                            _mm_subs_epu8(p1q1, q1p1));
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+    mask = _mm_max_epu8(abs_p1p0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+                                     _mm_subs_epu8(q1p1, q2p2)),
+                        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+                                     _mm_subs_epu8(q2p2, q3p3)));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+
+    // flat_mask4
+
+    flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+                                     _mm_subs_epu8(q0p0, q2p2)),
+                        _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+                                     _mm_subs_epu8(q0p0, q3p3)));
+    flat = _mm_max_epu8(abs_p1p0, flat);
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+  }
+
+  {
+    const __m128i four = _mm_set1_epi16(4);
+    unsigned char *src = s;
+    {
+      __m128i workp_a, workp_b, workp_shft;
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+
+      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op2[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op1[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op0[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq0[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq1[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq2[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+    }
+  }
+  // lp filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                                      t80);
+    const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                                      t80);
+    const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
+                                      t80);
+    const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
+                                      t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    // Filter1 >> 3
+    filter1 = _mm_unpacklo_epi8(zero, filter1);
+    filter1 = _mm_srai_epi16(filter1, 11);
+    filter1 = _mm_packs_epi16(filter1, filter1);
+
+    // Filter2 >> 3
+    filter2 = _mm_unpacklo_epi8(zero, filter2);
+    filter2 = _mm_srai_epi16(filter2, 11);
+    filter2 = _mm_packs_epi16(filter2, zero);
+
+    // filt >> 1
+    filt = _mm_adds_epi8(filter1, t1);
+    filt = _mm_unpacklo_epi8(zero, filt);
+    filt = _mm_srai_epi16(filt, 9);
+    filt = _mm_packs_epi16(filt, zero);
+
+    filt = _mm_andnot_si128(hev, filt);
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q0 = _mm_and_si128(flat, q0);
+    q0 = _mm_or_si128(work_a, q0);
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q1 = _mm_and_si128(flat, q1);
+    q1 = _mm_or_si128(work_a, q1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q2 = _mm_and_si128(flat, q2);
+    q2 = _mm_or_si128(work_a, q2);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p0 = _mm_and_si128(flat, p0);
+    p0 = _mm_or_si128(work_a, p0);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p1 = _mm_and_si128(flat, p1);
+    p1 = _mm_or_si128(work_a, p1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p2 = _mm_and_si128(flat, p2);
+    p2 = _mm_or_si128(work_a, p2);
+
+    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
+    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
+    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+  }
+}
+
+void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
+                                    const uint8_t *_blimit0,
+                                    const uint8_t *_limit0,
+                                    const uint8_t *_thresh0,
+                                    const uint8_t *_blimit1,
+                                    const uint8_t *_limit1,
+                                    const uint8_t *_thresh1) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i blimit =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+                         _mm_load_si128((const __m128i *)_limit1));
+  const __m128i thresh =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
+
+  __m128i mask, hev, flat;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
   {
     const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                           _mm_subs_epu8(p0, p1));
@@ -901,6 +1127,8 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
     __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                     _mm_subs_epu8(q1, p1));
     __m128i work;
+
+    // filter_mask and hev_mask
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
     hev = _mm_subs_epu8(flat, thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
@@ -926,6 +1154,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
     mask = _mm_subs_epu8(mask, limit);
     mask = _mm_cmpeq_epi8(mask, zero);
 
+    // flat_mask4
     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                      _mm_subs_epu8(p0, p2)),
                         _mm_or_si128(_mm_subs_epu8(q2, q0),
@@ -943,7 +1172,9 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
   {
     const __m128i four = _mm_set1_epi16(4);
     unsigned char *src = s;
-    {
+    int i = 0;
+
+    do {
       __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
@@ -958,38 +1189,40 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
       workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op2[0],
+      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op1[0],
+      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op0[0],
+      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq0[0],
+      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq1[0],
+      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq2[0],
+      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
                        _mm_packus_epi16(workp_shft, workp_shft));
-    }
+
+      src += 8;
+    } while (++i < 2);
   }
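Replacing the single block with a do/while that advances src by 8 and runs twice lets the dual filter produce flat outputs for both 8-pixel halves of the 16-byte registers, each pass filling flat_op*/flat_oq* at offset i * 8. The companion change packs the two edges' parameters into one register, low half for edge 0 and high half for edge 1 — for example (a sketch, assuming byte-splatted, 16-byte-aligned threshold vectors as libvpx's loop-filter thresholds are declared):

#include <emmintrin.h>
#include <stdint.h>

// Low 8 bytes from edge 0's threshold vector, high 8 from edge 1's.
static __m128i pack_dual(const uint8_t *t0, const uint8_t *t1) {
  return _mm_unpacklo_epi64(_mm_load_si128((const __m128i *) t0),
                            _mm_load_si128((const __m128i *) t1));
}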
   // lp filter
   {
@@ -1001,13 +1234,13 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
     const __m128i t1 = _mm_set1_epi8(0x1);
     const __m128i t7f = _mm_set1_epi8(0x7f);
 
-    const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
                                       t80);
-    const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
                                       t80);
-    const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
+    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
                                       t80);
-    const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
+    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
                                       t80);
     __m128i filt;
     __m128i work_a;
@@ -1018,27 +1251,27 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter1);
     filter1 = _mm_srli_epi16(filter1, 3);
     work_a = _mm_and_si128(work_a, te0);
     filter1 = _mm_and_si128(filter1, t1f);
     filter1 = _mm_or_si128(filter1, work_a);
 
-    /* Filter2 >> 3 */
+    // Filter2 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter2);
     filter2 = _mm_srli_epi16(filter2, 3);
     work_a = _mm_and_si128(work_a, te0);
     filter2 = _mm_and_si128(filter2, t1f);
     filter2 = _mm_or_si128(filter2, work_a);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi8(filter1, t1);
     work_a = _mm_cmpgt_epi8(zero, filt);
     filt = _mm_srli_epi16(filt, 1);
@@ -1049,47 +1282,185 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
     filt = _mm_andnot_si128(hev, filt);
 
     work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
+    q0 = _mm_load_si128((__m128i *)flat_oq0);
     work_a = _mm_andnot_si128(flat, work_a);
     q0 = _mm_and_si128(flat, q0);
     q0 = _mm_or_si128(work_a, q0);
 
     work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
+    q1 = _mm_load_si128((__m128i *)flat_oq1);
     work_a = _mm_andnot_si128(flat, work_a);
     q1 = _mm_and_si128(flat, q1);
     q1 = _mm_or_si128(work_a, q1);
 
     work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
+    q2 = _mm_load_si128((__m128i *)flat_oq2);
     work_a = _mm_andnot_si128(flat, work_a);
     q2 = _mm_and_si128(flat, q2);
     q2 = _mm_or_si128(work_a, q2);
 
     work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
+    p0 = _mm_load_si128((__m128i *)flat_op0);
     work_a = _mm_andnot_si128(flat, work_a);
     p0 = _mm_and_si128(flat, p0);
     p0 = _mm_or_si128(work_a, p0);
 
     work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
+    p1 = _mm_load_si128((__m128i *)flat_op1);
     work_a = _mm_andnot_si128(flat, work_a);
     p1 = _mm_and_si128(flat, p1);
     p1 = _mm_or_si128(work_a, p1);
 
     work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
+    p2 = _mm_load_si128((__m128i *)flat_op2);
     work_a = _mm_andnot_si128(flat, work_a);
     p2 = _mm_and_si128(flat, p2);
     p2 = _mm_or_si128(work_a, p2);
 
-    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+  }
+}
+
+void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0,
+                                    const unsigned char *_blimit1,
+                                    const unsigned char *_limit1,
+                                    const unsigned char *_thresh1) {
+  const __m128i blimit =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+                         _mm_load_si128((const __m128i *)_limit1));
+  const __m128i thresh =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  __m128i mask, hev, flat;
+
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+
+  // filter_mask and hev_mask
+  {
+    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+                                          _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                                          _mm_subs_epu8(q0, q1));
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+                                    _mm_subs_epu8(q0, p0));
+    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+                                    _mm_subs_epu8(q1, p1));
+    __m128i work;
+
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+    mask = _mm_max_epu8(flat, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
+                                     _mm_subs_epu8(p1, p2)),
+                        _mm_or_si128(_mm_subs_epu8(p3, p2),
+                                     _mm_subs_epu8(p2, p3)));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
+                                     _mm_subs_epu8(q1, q2)),
+                        _mm_or_si128(_mm_subs_epu8(q3, q2),
+                                     _mm_subs_epu8(q2, q3)));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  // filter4
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+
+    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+                                      t80);
+    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+                                      t80);
+    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+                                      t80);
+    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+                                      t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    // Filter1 >> 3
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+
+    // Filter2 >> 3
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+
+    // filt >> 1
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+
+    filt = _mm_andnot_si128(hev, filt);
+
+    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+
+    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
   }
 }
 
@@ -1098,7 +1469,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   __m128i x8, x9, x10, x11, x12, x13, x14, x15;
 
-  /* Read in 16 lines */
+  // Read in 16 lines
   x0 = _mm_loadl_epi64((__m128i *)in0);
   x8 = _mm_loadl_epi64((__m128i *)in1);
   x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
@@ -1136,7 +1507,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
   x14 = _mm_unpacklo_epi32(x12, x13);
   x15 = _mm_unpackhi_epi32(x12, x13);
 
-  /* Store first 4-line result */
+  // Store first 4-line result
   _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
   _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
   _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
@@ -1152,7 +1523,7 @@
   x14 = _mm_unpacklo_epi32(x12, x13);
   x15 = _mm_unpackhi_epi32(x12, x13);
 
-  /* Store second 4-line result */
+  // Store second 4-line result
   _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
   _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
   _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
@@ -1222,61 +1593,124 @@ static INLINE void transpose(unsigned char *src[], int in_p,
   } while (++idx8x8 < num_8x8_to_transpose);
 }
 
-void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
-                                          int p,
-                                          const unsigned char *blimit,
-                                          const unsigned char *limit,
-                                          const unsigned char *thresh,
-                                          int count) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
   unsigned char *src[2];
   unsigned char *dst[2];
-  (void)count;
-  /* Transpose 16x16 */
-  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
-  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);
-
-  /* Loop filtering */
-  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
-                                         thresh, 1);
-  src[0] = t_dst + 3 * 16;
-  src[1] = t_dst + 3 * 16 + 8;
 
+  // Transpose 8x16
+  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
-  dst[0] = s - 5;
-  dst[1] = s - 5 + p * 8;
+  // Loop filtering
+  vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+                                 blimit1, limit1, thresh1);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
 
-  /* Transpose 16x8 */
+  // Transpose back
   transpose(src, 16, dst, p, 2);
 }
 
-void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
-                                     int p,
-                                     const unsigned char *blimit,
-                                     const unsigned char *limit,
-                                     const unsigned char *thresh) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
-  unsigned char *src[4];
-  unsigned char *dst[4];
+void vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
+                             const unsigned char *blimit,
+                             const unsigned char *limit,
+                             const unsigned char *thresh, int count) {
+  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
+  unsigned char *src[1];
+  unsigned char *dst[1];
+  (void)count;
 
+  // Transpose 8x8
+  src[0] = s - 4;
   dst[0] = t_dst;
-  dst[1] = t_dst + 8 * 16;
 
-  src[0] = s - 8;
-  src[1] = s - 8 + 8;
+  transpose(src, p, dst, 8, 1);
+
+  // Loop filtering
+  vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
 
-  /* Transpose 16x16 */
-  transpose(src, p, dst, 16, 2);
+  // Transpose back
+  transpose(src, 8, dst, p, 1);
+}
+
+void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
  unsigned char *src[2];
  unsigned char *dst[2];
 
-  /* Loop filtering */
-  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
-                                    thresh, 1);
+  // Transpose 8x16
+  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
+  // Loop filtering
+  vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+                                 blimit1, limit1, thresh1);
   src[0] = t_dst;
-  src[1] = t_dst + 8 * 16;
+  src[1] = t_dst + 8;
 
-  dst[0] = s - 8;
-  dst[1] = s - 8 + 8;
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
 
+  // Transpose back
   transpose(src, 16, dst, p, 2);
 }
vp9_lpf_vertical_16_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16); + unsigned char *src[2]; + unsigned char *dst[2]; + + src[0] = s - 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; + + // Transpose 16x8 + transpose(src, p, dst, 8, 2); + + // Loop filtering + mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh); + + src[0] = t_dst; + src[1] = t_dst + 8 * 8; + dst[0] = s - 8; + dst[1] = s; + + // Transpose back + transpose(src, 8, dst, p, 2); +} + +void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); + + // Transpose 16x16 + transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); + transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + + // Loop filtering + mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, + thresh); + + // Transpose back + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm index 4ebb51b7727..91055b9f9d4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm @@ -12,7 +12,7 @@ %include "vpx_ports/x86_abi_support.asm" -;void vp9_loop_filter_horizontal_edge_mmx +;void vp9_lpf_horizontal_4_mmx ;( ; unsigned char *src_ptr, ; int src_pixel_step, @@ -21,8 +21,8 @@ ; const char *thresh, ; int count ;) -global sym(vp9_loop_filter_horizontal_edge_mmx) PRIVATE -sym(vp9_loop_filter_horizontal_edge_mmx): +global sym(vp9_lpf_horizontal_4_mmx) PRIVATE +sym(vp9_lpf_horizontal_4_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -224,7 +224,7 @@ sym(vp9_loop_filter_horizontal_edge_mmx): ret -;void vp9_loop_filter_vertical_edge_mmx +;void vp9_lpf_vertical_4_mmx ;( ; unsigned char *src_ptr, ; int src_pixel_step, @@ -233,8 +233,8 @@ sym(vp9_loop_filter_horizontal_edge_mmx): ; const char *thresh, ; int count ;) -global sym(vp9_loop_filter_vertical_edge_mmx) PRIVATE -sym(vp9_loop_filter_vertical_edge_mmx): +global sym(vp9_lpf_vertical_4_mmx) PRIVATE +sym(vp9_lpf_vertical_4_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -527,7 +527,7 @@ sym(vp9_loop_filter_vertical_edge_mmx): pxor mm7, [GLOBAL(t80)] ; unoffset ; mm7 = q1 - ; tranpose and write back + ; transpose and write back ; mm1 = 72 62 52 42 32 22 12 02 ; mm6 = 73 63 53 43 33 23 13 03 ; mm3 = 74 64 54 44 34 24 14 04 diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_postproc_x86.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_postproc_x86.h index 8870215a27b..cab9d34f259 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_postproc_x86.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_postproc_x86.h @@ -12,6 +12,10 @@ #ifndef VP9_COMMON_X86_VP9_POSTPROC_X86_H_ #define VP9_COMMON_X86_VP9_POSTPROC_X86_H_ +#ifdef __cplusplus +extern "C" { +#endif + /* Note: * * This platform is commonly built for runtime CPU detection. 
If you modify @@ -61,4 +65,8 @@ extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt); #endif #endif +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_X86_VP9_POSTPROC_X86_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c new file mode 100644 index 00000000000..b84db970eba --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> +#include "vpx_ports/mem.h" + +// filters for 16_h8 and 16_v8 +DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +#if defined(__clang__) +# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0) +# define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +# else // clang > 3.3, and not 5.0 on macosx. 
+# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +# endif // clang <= 3.3 +#elif defined(__GNUC__) +# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +# define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +# else // gcc > 4.7 +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +# endif // gcc <= 4.6 +#else // !(gcc || clang) +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + unsigned int src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i-=2) { + // load the 2 strides of source + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr-3))); + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm_loadu_si128((__m128i *) + (src_ptr+src_pixels_per_line-3)), 1); + + // filter the source buffer + srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + 
srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // reading 2 rows of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+5))); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm_loadu_si128((__m128i *) + (src_ptr+src_pixels_per_line+5)), 1); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); + + // shift by 7 bits each 16 bit + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contains the first + // convolve result and the second lane contains the second convolve + // result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, + srcRegFilt32b2_1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcRegFilt32b1_1)); + + // save the next 16 bytes + _mm_store_si128((__m128i*)(output_ptr+output_pitch), + _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr+=dst_stride; + } + + // if the number of rows is odd. 
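The tail below exists because the main loop pairs two output rows in one 256-bit register, so heights are consumed two at a time and an odd output_height leaves one row for a 128-bit pass. A minimal sketch of that structure; filter_two_rows_avx2 and filter_one_row_sse are hypothetical stand-ins, not libvpx functions:

    #include <stddef.h>
    #include <stdint.h>

    static void filter_two_rows_avx2(const uint8_t *src, uint8_t *dst) {
      (void)src; (void)dst;  /* stand-in for the 256-bit kernel above */
    }
    static void filter_one_row_sse(const uint8_t *src, uint8_t *dst) {
      (void)src; (void)dst;  /* stand-in for the 128-bit tail below */
    }

    static void filter_block(const uint8_t *src, uint8_t *dst,
                             unsigned int height,
                             ptrdiff_t src_pitch, ptrdiff_t dst_pitch) {
      unsigned int h;
      for (h = height; h > 1; h -= 2) {  /* two rows share one ymm register */
        filter_two_rows_avx2(src, dst);
        src += 2 * src_pitch;
        dst += 2 * dst_pitch;
      }
      if (h > 0)                         /* odd height: one 128-bit tail row */
        filter_one_row_sse(src, dst);
    }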
+ // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt2Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(secondFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt4Reg)); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt2Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(secondFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt4Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(addFilterReg64)); + + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + } +} + +void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + 
unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + unsigned int src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr))); + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch))); + srcReg32b3 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2))); + srcReg32b4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3))); + srcReg32b5 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4))); + srcReg32b6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5))); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6))); + + // have each consecutive loads on the same 256 register + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm256_castsi256_si128(srcReg32b2), 1); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm256_castsi256_si128(srcReg32b3), 1); + srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, + _mm256_castsi256_si128(srcReg32b4), 1); + srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, + _mm256_castsi256_si128(srcReg32b5), 1); + srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, + _mm256_castsi256_si128(srcReg32b6), 1); + srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, + _mm256_castsi256_si128(srcReg32b7), 1); + + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + // save + srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + + for (i = output_height; 
i > 1; i-=2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8); + + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_min_epi16(srcReg32b6, srcReg32b13)); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_max_epi16(srcReg32b6, srcReg32b13)); + + + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcReg32b1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+out_pitch), + _mm256_extractf128_si256(srcReg32b1, 1)); + + output_ptr+=dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + + // merge the last 2 results together + srcRegFilt4 = _mm_unpacklo_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = _mm_unpackhi_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = 
_mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_max_epi16(srcRegFilt5, srcRegFilt7)); + + + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c new file mode 100644 index 00000000000..cf28d8d2b7c --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,490 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
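One detail worth noting before the code: the 4-wide horizontal filter in this file packs two tap pairs per register. After the shufflelo/shufflehi setup, firstFilters holds the (k0,k1) pair in its low half and (k2,k3) in its high half, and the filt1_4_h8 table gathers the matching source windows (offsets 0 and 2 into the 16-byte load from src_ptr - 3), so one pmaddubsw covers half of the 8-tap sum for all four outputs; filt2_4_h8 with secondFilters handles the other half. A scalar model of the lane layout, offered as a sketch:

    #include <stdint.h>

    /* Lane i of each maddubs result holds one tap-pair sum; w[] is the
       16-byte window loaded from src_ptr - 3. The real code folds the
       halves with psrldq plus saturating adds (and a min/max ordering). */
    static void fold_4wide(const uint8_t w[16], const int16_t k[8],
                           int16_t out[4]) {
      int i;
      for (i = 0; i < 4; ++i) {
        int lo01 = k[0] * w[i]     + k[1] * w[i + 1];  /* low half, filt1_4_h8 */
        int hi23 = k[2] * w[i + 2] + k[3] * w[i + 3];  /* high half */
        int lo45 = k[4] * w[i + 4] + k[5] * w[i + 5];  /* low half, filt2_4_h8 */
        int hi67 = k[6] * w[i + 6] + k[7] * w[i + 7];  /* high half */
        out[i] = (int16_t)(lo01 + hi23 + lo45 + hi67);
      }
    }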
+ */ + +#include <tmmintrin.h> +#include "vpx_ports/mem.h" +#include "vpx_ports/emmintrin_compat.h" + +// filters only for the 4_h8 convolution +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +// filters for 8_h8 and 16_h8 +DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, srcReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 =_mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter into the first lane + firstFilters = _mm_shufflelo_epi16(filtersReg, 0); + // duplicate only the third 16 bits in the filter into the first lane + secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); + // duplicate only the second 16 bits in the filter into the second lane + firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); + // duplicate only the fourth 16 bits in the filter into the second lane + secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); + + // loading the local filters + thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); + forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); + srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // extract the high half of the register + srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); + srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + + minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + + // add and saturate all the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bits each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + src_ptr+=src_pixels_per_line; + + // save only 4 bytes + 
*((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; + __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); + srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); + + // add and saturate all the results together + minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + + srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pixels_per_line; + + // save only 8 bytes + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; + __m128i 
filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes. 
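The +5 offset in the load below follows from the 8-tap footprint: output pixel x reads src[x-3]..src[x+4], so the 16-byte load at src_ptr-3 feeds outputs 0..7 and a second load at src_ptr+5 feeds outputs 8..15, overlapping the first by eight bytes. A scalar reference for the same footprint and rounding, as a sketch (the names are illustrative; 64 and 7 match addFilterReg64 and the _mm_srai_epi16 shift):

    #include <stdint.h>

    static void filter_row_8tap_c(const uint8_t *src, uint8_t *dst,
                                  const int16_t f[8], int width) {
      int x, k;
      for (x = 0; x < width; ++x) {
        int sum = 64;                      /* rounding constant */
        for (k = 0; k < 8; ++k)
          sum += f[k] * src[x - 3 + k];    /* taps span src[x-3..x+4] */
        sum >>= 7;
        dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
      }
    }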
+ // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + src_ptr+=src_pixels_per_line; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
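Narrowing the taps to bytes below is safe because every VP9 subpel coefficient fits in int8, and it enables the vertical kernel's core idiom: two source rows are byte-interleaved with punpcklbw, and each pmaddubsw lane then produces ka*rowA[i] + kb*rowB[i] in a single instruction. A scalar model of one lane, hedged as a sketch:

    #include <stdint.h>

    /* One 16-bit lane of punpcklbw + pmaddubsw: a and b are the same pixel
       column in two adjacent rows; ka/kb are one packed tap pair. The
       instruction saturates, but VP9's tap pairs appear to keep the sum
       inside int16 range in practice. */
    static int16_t madd_lane(uint8_t a, uint8_t b, int8_t ka, int8_t kb) {
      return (int16_t)(ka * a + kb * b);
    }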
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + for (i = 0; i < output_height; i++) { + // load the first 8 bytes + srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); + // load the next 8 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]); + srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]); + srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]); + + // merge the result together + srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); + + // load the next 8 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]); + srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]); + srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]); + srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]); + + // merge the result together + srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4); + srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); + + // add and saturate the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pitch; + + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=out_pitch; + } +} + +void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
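The _mm_set1_epi16 constants used with _mm_shuffle_epi8 below are byte-index pairs: 0x100 places packed byte 1 and byte 0, i.e. the (k0,k1) tap pair, in every 16-bit lane, and 0x302, 0x504, 0x706 broadcast the remaining pairs. A scalar model of that broadcast, as a sketch:

    #include <stdint.h>

    /* Models _mm_shuffle_epi8(packed, _mm_set1_epi16(sel)): sel's low and
       high bytes are the two source byte indices repeated in every lane. */
    static void broadcast_tap_pair(const int8_t packed[16], uint16_t sel,
                                   int8_t out[16]) {
      int lane;
      for (lane = 0; lane < 8; ++lane) {
        out[2 * lane]     = packed[sel & 0xff];         /* e.g. k0 for 0x100 */
        out[2 * lane + 1] = packed[(sel >> 8) & 0xff];  /* e.g. k1 */
      }
    }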
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + for (i = 0; i < output_height; i++) { + // load the first 16 bytes + srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr)); + // load the next 16 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)); + srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + + // merge the result together + srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); + srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); + srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + + // load the next 16 bytes in stride of two/three src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)); + + // merge the result together + srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); + srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters); + srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); + + // load the next 16 bytes in stride of four/five src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)); + + // merge the result together + srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); + srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); + srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, + _mm_min_epi16(srcRegFilt4, srcRegFilt7)); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt6, srcRegFilt8)); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, + _mm_max_epi16(srcRegFilt4, srcRegFilt7)); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt6, srcRegFilt8)); + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // 
convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); + + src_ptr+=src_pitch; + + // save 16 bytes convolve result + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + + output_ptr+=out_pitch; + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index 7a5cca056a3..634fa77462f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -11,17 +11,6 @@ %include "vpx_ports/x86_abi_support.asm" -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ - - %macro VERTx4 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr @@ -81,11 +70,14 @@ pmaddubsw xmm4, k4k5 pmaddubsw xmm6, k6k7 + movdqa xmm1, xmm2 paddsw xmm0, xmm6 - paddsw xmm0, xmm2 + pmaxsw xmm2, xmm4 + pminsw xmm4, xmm1 paddsw xmm0, xmm4 - paddsw xmm0, krd + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 @@ -166,10 +158,13 @@ pmaddubsw xmm6, k6k7 paddsw xmm0, xmm6 - paddsw xmm0, xmm2 + movdqa xmm1, xmm2 + pmaxsw xmm2, xmm4 + pminsw xmm4, xmm1 paddsw xmm0, xmm4 - paddsw xmm0, krd + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 @@ -251,10 +246,13 @@ pmaddubsw xmm6, k6k7 paddsw xmm0, xmm6 - paddsw xmm0, xmm2 + movdqa xmm1, xmm2 + pmaxsw xmm2, xmm4 + pminsw xmm4, xmm1 paddsw xmm0, xmm4 - paddsw xmm0, krd + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 %if %1 @@ -538,14 +536,22 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): movdqa %2, %1 pshufb %1, [GLOBAL(shuf_t0t1)] pshufb %2, [GLOBAL(shuf_t2t3)] - pmaddubsw %1, xmm6 - pmaddubsw %2, xmm7 + pmaddubsw %1, k0k1k4k5 + pmaddubsw %2, k2k3k6k7 - paddsw %1, %2 - movdqa %2, %1 + movdqa xmm4, %1 + movdqa xmm5, %2 + psrldq %1, 8 psrldq %2, 8 - paddsw %1, %2 - paddsw %1, xmm5 + movdqa xmm6, xmm5 + + paddsw xmm4, %2 + pmaxsw xmm5, %1 + pminsw %1, xmm6 + paddsw %1, xmm4 + paddsw %1, xmm5 + + paddsw %1, krd psraw %1, 7 packuswb %1, %1 %endm @@ -565,6 +571,10 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7 pshufd xmm5, xmm5, 0 ;rounding + movdqa k0k1k4k5, xmm6 + movdqa k2k3k6k7, xmm7 + movdqa krd, xmm5 + movsxd rax, dword ptr arg(1) ;src_pixels_per_line movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height @@ -631,9 +641,13 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): pmaddubsw %3, k4k5 pmaddubsw %4, k6k7 - paddsw %1, %2 paddsw %1, %4 + movdqa %4, %2 + pmaxsw %2, %3 + pminsw %3, %4 paddsw %1, %3 + paddsw %1, %2 + paddsw %1, krd psraw %1, 7 packuswb %1, %1 @@ -779,12 +793,19 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): pmaddubsw xmm6, k4k5 pmaddubsw xmm7, k6k7 - paddsw xmm0, xmm1 paddsw xmm0, xmm3 + movdqa xmm3, xmm1 + pmaxsw xmm1, xmm2 + pminsw xmm2, xmm3 paddsw xmm0, xmm2 - paddsw xmm4, xmm5 
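Each of these hunks makes the same change: a copy of one middle tap-pair product is taken (movdqa), and the smaller of the two (pminsw) is accumulated before the larger (pmaxsw) rather than adding them in a fixed order. With saturating 16-bit adds the order matters, since a partial sum can clip even when the true total is representable, and the reordering keeps the SIMD result much closer to the wider-precision C reference. A worked single-lane case in C, with illustrative values:

    #include <stdint.h>
    #include <stdio.h>

    static int16_t adds16(int a, int b) {   /* one paddsw lane */
      int s = a + b;
      return (int16_t)(s > 32767 ? 32767 : (s < -32768 ? -32768 : s));
    }

    int main(void) {
      int16_t acc = 30000, p23 = 8000, p45 = -9000;         /* true sum: 29000 */
      int16_t fixed_order = adds16(adds16(acc, p23), p45);  /* clips: 23767 */
      int16_t min_first   = adds16(adds16(acc, p45), p23);  /* correct: 29000 */
      printf("%d vs %d\n", fixed_order, min_first);
      return 0;
    }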
+ paddsw xmm0, xmm1 + paddsw xmm4, xmm7 + movdqa xmm7, xmm5 + pmaxsw xmm5, xmm6 + pminsw xmm6, xmm7 paddsw xmm4, xmm6 + paddsw xmm4, xmm5 paddsw xmm0, krd paddsw xmm4, krd @@ -826,8 +847,16 @@ sym(vp9_filter_block1d4_h8_ssse3): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 3 + %define k0k1k4k5 [rsp + 16 * 0] + %define k2k3k6k7 [rsp + 16 * 1] + %define krd [rsp + 16 * 2] + HORIZx4 0 + add rsp, 16 * 3 + pop rsp ; begin epilog pop rdi pop rsi @@ -932,8 +961,16 @@ sym(vp9_filter_block1d4_h8_avg_ssse3): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 3 + %define k0k1k4k5 [rsp + 16 * 0] + %define k2k3k6k7 [rsp + 16 * 1] + %define krd [rsp + 16 * 2] + HORIZx4 1 + add rsp, 16 * 3 + pop rsp ; begin epilog pop rdi pop rsi diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm new file mode 100644 index 00000000000..d94ccf2e9b7 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm @@ -0,0 +1,448 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklqdq xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + pxor xmm2, xmm2 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + + punpckldq xmm0, xmm1 ;two row in one register + punpcklbw xmm0, xmm2 ;unpack to word + pmullw xmm0, xmm4 ;multiply the filter factors + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + + paddsw xmm0, xmm3 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + + pshuflw xmm6, xmm7, 11111111b ;k3 + pshufhw xmm7, xmm7, 0b ;k4 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + pxor xmm5, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + + pmullw xmm0, xmm6 + pmullw 
xmm1, xmm7 + pmullw xmm2, xmm6 + pmullw xmm3, xmm7 + + paddsw xmm0, xmm1 + paddsw xmm2, xmm3 + + paddsw xmm0, xmm4 ;rounding + paddsw xmm2, xmm4 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vp9_filter_block1d4_v2_sse2) PRIVATE +sym(vp9_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_sse2) PRIVATE +sym(vp9_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_sse2) PRIVATE +sym(vp9_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE +sym(vp9_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE +sym(vp9_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE +sym(vp9_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_sse2) PRIVATE +sym(vp9_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_sse2) PRIVATE +sym(vp9_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_sse2) PRIVATE 
+sym(vp9_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE +sym(vp9_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE +sym(vp9_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE +sym(vp9_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm new file mode 100644 index 00000000000..b5e18fe6d4a --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm @@ -0,0 +1,422 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movq xmm2, rcx ;rounding + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + paddsw xmm0, xmm2 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movq xmm6, rcx ;rounding + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + paddsw xmm0, xmm6 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + paddsw xmm0, xmm6 ;rounding + paddsw xmm2, xmm6 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE +sym(vp9_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE 
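; [editorial note, not part of the upstream patch] The SSSE3 variants fold
; the SSE2 unpack/pmullw/paddsw sequence into a single pmaddubsw: GET_PARAM
; packs the two 16-bit taps to signed bytes (they fit for the fractional
; offsets these routines are called with) and broadcasts the k3,k4 byte
; pair; punpcklbw interleaves src[x] with src[x + step]; pmaddubsw then
; computes, per 16-bit lane, roughly:
;
;   lane = sat16(src[x] * k3 + src[x + step] * k4);
;
; after which the rounding add (+64), psraw 7 and packuswb match the SSE2
; path.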
+sym(vp9_filter_block1d8_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
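
[Editorial appendix, not part of the diff] The two new files implement the same
2-tap bilinear kernel at widths 4, 8 and 16, horizontally (pixel step 1) and
vertically (pixel step = source pitch), each with a plain and an _avg variant.
A self-contained scalar model is sketched below; the function name, prototype
and helper are illustrative assumptions for this note, not upstream
identifiers.

    #include <stdint.h>
    #include <stddef.h>

    /* Saturate to [0, 255] -- the job packuswb performs in the asm. */
    static uint8_t clamp8(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* Scalar sketch of the vp9_filter_block1d{4,8,16}_{h2,v2}[_avg]_*
     * routines: out = (src[x]*k3 + src[x+step]*k4 + 64) >> 7, where
     * filter[3] and filter[4] are the only non-zero taps and sum to 128. */
    static void bilinear_block1d_c(const uint8_t *src, ptrdiff_t src_pitch,
                                   uint8_t *dst, ptrdiff_t dst_pitch,
                                   unsigned height, unsigned width,
                                   const int16_t *filter, /* 8 taps */
                                   ptrdiff_t step,        /* 1 (h2) or src_pitch (v2) */
                                   int average) {         /* 1 for _avg variants */
      const int k3 = filter[3], k4 = filter[4];
      for (unsigned y = 0; y < height; ++y) {
        for (unsigned x = 0; x < width; ++x) {
          const uint8_t out =
              clamp8((src[x] * k3 + src[x + step] * k4 + 64) >> 7);
          /* pavgb rounds as (a + b + 1) >> 1 */
          dst[x] = average ? (uint8_t)((dst[x] + out + 1) >> 1) : out;
        }
        src += src_pitch;
        dst += dst_pitch;
      }
    }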