1 files changed, 446 insertions, 181 deletions
diff --git a/chromium/third_party/ffmpeg/libavcodec/vp9.c b/chromium/third_party/ffmpeg/libavcodec/vp9.c
index a52924c7c90..000e9918a52 100644
--- a/chromium/third_party/ffmpeg/libavcodec/vp9.c
+++ b/chromium/third_party/ffmpeg/libavcodec/vp9.c
@@ -210,19 +210,31 @@ typedef struct VP9Context {
     enum CompPredMode comppredmode;
 
     // contextual (left/above) cache
-    uint8_t left_partition_ctx[8], *above_partition_ctx;
-    uint8_t left_mode_ctx[16], *above_mode_ctx;
+    DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
+    DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
+    DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
+    DECLARE_ALIGNED(8, uint8_t, left_uv_nnz_ctx)[2][8];
+    DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
+    uint8_t *above_partition_ctx;
+    uint8_t *above_mode_ctx;
     // FIXME maybe merge some of the below in a flags field?
-    uint8_t left_y_nnz_ctx[16], *above_y_nnz_ctx;
-    uint8_t left_uv_nnz_ctx[2][8], *above_uv_nnz_ctx[2];
-    uint8_t left_skip_ctx[8], *above_skip_ctx; // 1bit
-    uint8_t left_txfm_ctx[8], *above_txfm_ctx; // 2bit
-    uint8_t left_segpred_ctx[8], *above_segpred_ctx; // 1bit
-    uint8_t left_intra_ctx[8], *above_intra_ctx; // 1bit
-    uint8_t left_comp_ctx[8], *above_comp_ctx; // 1bit
-    uint8_t left_ref_ctx[8], *above_ref_ctx; // 2bit
-    uint8_t left_filter_ctx[8], *above_filter_ctx;
-    VP56mv left_mv_ctx[16][2], (*above_mv_ctx)[2];
+    uint8_t *above_y_nnz_ctx;
+    uint8_t *above_uv_nnz_ctx[2];
+    uint8_t *above_skip_ctx; // 1bit
+    uint8_t *above_txfm_ctx; // 2bit
+    uint8_t *above_segpred_ctx; // 1bit
+    uint8_t *above_intra_ctx; // 1bit
+    uint8_t *above_comp_ctx; // 1bit
+    uint8_t *above_ref_ctx; // 2bit
+    uint8_t *above_filter_ctx;
+    VP56mv (*above_mv_ctx)[2];
 
     // whole-frame cache
     uint8_t *intra_pred_data[3];
@@ -230,9 +242,10 @@ typedef struct VP9Context {
     DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];
 
     // block reconstruction intermediates
+    int block_alloc_using_2pass;
     int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
     uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
-    VP56mv min_mv, max_mv;
+    struct { int x, y; } min_mv, max_mv;
     DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
     DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
 } VP9Context;
@@ -264,7 +277,8 @@ static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
     f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
 
     // retain segmentation map if it doesn't update
-    if (s->segmentation.enabled && !s->segmentation.update_map) {
+    if (s->segmentation.enabled && !s->segmentation.update_map &&
+        !s->intraonly && !s->keyframe) {
         memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
     }
 
@@ -301,7 +315,7 @@ static int update_size(AVCodecContext *ctx, int w, int h)
 
     av_assert0(w > 0 && h > 0);
 
-    if (s->above_partition_ctx && w == ctx->width && h == ctx->height)
+    if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
         return 0;
 
     ctx->width  = w;
@@ -312,32 +326,46 @@ static int update_size(AVCodecContext *ctx, int w, int h)
     s->rows     = (h + 7) >> 3;
 
 #define assign(var, type, n) var = (type) p; p += s->sb_cols * n * sizeof(*var)
-    av_freep(&s->above_partition_ctx);
+    av_freep(&s->intra_pred_data[0]);
     p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
     if (!p)
         return AVERROR(ENOMEM);
+    assign(s->intra_pred_data[0],  uint8_t *,             64);
+    assign(s->intra_pred_data[1],  uint8_t *,             32);
+    assign(s->intra_pred_data[2],  uint8_t *,             32);
+    assign(s->above_y_nnz_ctx,     uint8_t *,             16);
+    assign(s->above_mode_ctx,      uint8_t *,             16);
+    assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
     assign(s->above_partition_ctx, uint8_t *,              8);
     assign(s->above_skip_ctx,      uint8_t *,              8);
     assign(s->above_txfm_ctx,      uint8_t *,              8);
-    assign(s->above_mode_ctx,      uint8_t *,             16);
-    assign(s->above_y_nnz_ctx,     uint8_t *,             16);
     assign(s->above_uv_nnz_ctx[0], uint8_t *,              8);
     assign(s->above_uv_nnz_ctx[1], uint8_t *,              8);
-    assign(s->intra_pred_data[0],  uint8_t *,             64);
-    assign(s->intra_pred_data[1],  uint8_t *,             32);
-    assign(s->intra_pred_data[2],  uint8_t *,             32);
     assign(s->above_segpred_ctx,   uint8_t *,              8);
     assign(s->above_intra_ctx,     uint8_t *,              8);
     assign(s->above_comp_ctx,      uint8_t *,              8);
     assign(s->above_ref_ctx,       uint8_t *,              8);
     assign(s->above_filter_ctx,    uint8_t *,              8);
     assign(s->lflvl,               struct VP9Filter *,     1);
-    assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
 #undef assign
 
+    // these will be re-allocated a little later
+    av_freep(&s->b_base);
+    av_freep(&s->block_base);
+
+    return 0;
+}
+
+static int update_block_buffers(AVCodecContext *ctx)
+{
+    VP9Context *s = ctx->priv_data;
+
+    if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
+        return 0;
+
     av_free(s->b_base);
     av_free(s->block_base);
-    if (ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode) {
+    if (s->uses_2pass) {
         int sbs = s->sb_cols * s->sb_rows;
 
         s->b_base = av_malloc(sizeof(VP9Block) * s->cols * s->rows);
@@ -360,6 +388,7 @@ static int update_size(AVCodecContext *ctx, int w, int h)
         s->uveob_base[0] = s->eob_base + 256;
         s->uveob_base[1] = s->uveob_base[0] + 64;
     }
+    s->block_alloc_using_2pass = s->uses_2pass;
 
     return 0;
 }
@@ -466,7 +495,6 @@ static int decode_frame_header(AVCodecContext *ctx,
     last_invisible    = s->invisible;
     s->invisible      = !get_bits1(&s->gb);
     s->errorres       = get_bits1(&s->gb);
-    // FIXME disable this upon resolution change
     s->use_last_frame_mvs = !s->errorres && !last_invisible;
     if (s->keyframe) {
         if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
@@ -525,6 +553,11 @@ static int decode_frame_header(AVCodecContext *ctx,
                 w = get_bits(&s->gb, 16) + 1;
                 h = get_bits(&s->gb, 16) + 1;
             }
+            // Note that in this code, "CUR_FRAME" is actually before we
+            // have formally allocated a frame, and thus actually represents
+            // the _last_ frame
+            s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
+                                     s->frames[CUR_FRAME].tf.f->height == h;
             if (get_bits1(&s->gb)) // display size
                 skip_bits(&s->gb, 32);
             s->highprecisionmvs = get_bits1(&s->gb);
@@ -588,10 +621,19 @@ static int decode_frame_header(AVCodecContext *ctx,
             for (i = 0; i < 7; i++)
                 s->prob.seg[i] = get_bits1(&s->gb) ?
                                  get_bits(&s->gb, 8) : 255;
-            if ((s->segmentation.temporal = get_bits1(&s->gb)))
+            if ((s->segmentation.temporal = get_bits1(&s->gb))) {
                 for (i = 0; i < 3; i++)
                     s->prob.segpred[i] = get_bits1(&s->gb) ?
                                          get_bits(&s->gb, 8) : 255;
+            }
+        }
+        if ((!s->segmentation.update_map || s->segmentation.temporal) &&
+            (w != s->frames[CUR_FRAME].tf.f->width ||
+             h != s->frames[CUR_FRAME].tf.f->height)) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
+                   s->segmentation.temporal, s->segmentation.update_map);
+            return AVERROR_INVALIDDATA;
         }
 
         if (get_bits1(&s->gb)) {
@@ -1159,7 +1201,7 @@ static void fill_mv(VP9Context *s,
     VP9Block *b = s->b;
 
     if (mode == ZEROMV) {
-        memset(mv, 0, sizeof(*mv) * 2);
+        AV_ZERO64(mv);
     } else {
         int hp;
 
@@ -1226,6 +1268,52 @@ static void fill_mv(VP9Context *s,
     }
 }
 
+static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
+                                       ptrdiff_t stride, int v)
+{
+    switch (w) {
+    case 1:
+        do {
+            *ptr = v;
+            ptr += stride;
+        } while (--h);
+        break;
+    case 2: {
+        int v16 = v * 0x0101;
+        do {
+            AV_WN16A(ptr, v16);
+            ptr += stride;
+        } while (--h);
+        break;
+    }
+    case 4: {
+        uint32_t v32 = v * 0x01010101;
+        do {
+            AV_WN32A(ptr, v32);
+            ptr += stride;
+        } while (--h);
+        break;
+    }
+    case 8: {
+#if HAVE_FAST_64BIT
+        uint64_t v64 = v * 0x0101010101010101ULL;
+        do {
+            AV_WN64A(ptr, v64);
+            ptr += stride;
+        } while (--h);
+#else
+        uint32_t v32 = v * 0x01010101;
+        do {
+            AV_WN32A(ptr,     v32);
+            AV_WN32A(ptr + 4, v32);
+            ptr += stride;
+        } while (--h);
+#endif
+        break;
+    }
+    }
+}
+
 static void decode_mode(AVCodecContext *ctx)
 {
     static const uint8_t left_ctx[N_BS_SIZES] = {
@@ -1245,12 +1333,12 @@ static void decode_mode(AVCodecContext *ctx)
     int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
     int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
     int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
+    int vref, filter_id;
 
     if (!s->segmentation.enabled) {
         b->seg_id = 0;
     } else if (s->keyframe || s->intraonly) {
-        b->seg_id = s->segmentation.update_map ?
-            vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg) : 0;
+        b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
     } else if (!s->segmentation.update_map ||
                (s->segmentation.temporal &&
                 vp56_rac_get_prob_branchy(&s->c,
@@ -1276,11 +1364,10 @@ static void decode_mode(AVCodecContext *ctx)
         memset(&s->above_segpred_ctx[col], 0, w4);
         memset(&s->left_segpred_ctx[row7], 0, h4);
     }
-    if ((s->segmentation.enabled && s->segmentation.update_map) || s->keyframe) {
-        uint8_t *segmap = s->frames[CUR_FRAME].segmentation_map;
-
-        for (y = 0; y < h4; y++)
-            memset(&segmap[(y + row) * 8 * s->sb_cols + col], b->seg_id, w4);
+    if (s->segmentation.enabled &&
+        (s->segmentation.update_map || s->keyframe || s->intraonly)) {
+        setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
+                  w4, h4, 8 * s->sb_cols, b->seg_id);
     }
 
     b->skip = s->segmentation.enabled &&
@@ -1738,9 +1825,10 @@ static void decode_mode(AVCodecContext *ctx)
                 c = 3;
             }
 
-            b->filter = vp8_rac_get_tree(&s->c, vp9_filter_tree,
+            filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
                                          s->prob.p.filter[c]);
-            s->counts.filter[c][b->filter]++;
+            s->counts.filter[c][filter_id]++;
+            b->filter = vp9_filter_lut[filter_id];
         } else {
             b->filter = s->filtermode;
         }
@@ -1797,27 +1885,80 @@ static void decode_mode(AVCodecContext *ctx)
             AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
             AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
         }
+
+        vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
+    }
+
+#if HAVE_FAST_64BIT
+#define SPLAT_CTX(var, val, n) \
+    switch (n) { \
+    case 1:  var = val;                                    break; \
+    case 2:  AV_WN16A(&var, val *             0x0101);     break; \
+    case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
+    case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
+    case 16: { \
+        uint64_t v64 = val * 0x0101010101010101ULL; \
+        AV_WN64A(              &var,     v64); \
+        AV_WN64A(&((uint8_t *) &var)[8], v64); \
+        break; \
+    } \
+    }
+#else
+#define SPLAT_CTX(var, val, n) \
+    switch (n) { \
+    case 1:  var = val;                         break; \
+    case 2:  AV_WN16A(&var, val *     0x0101);  break; \
+    case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
+    case 8: { \
+        uint32_t v32 = val * 0x01010101; \
+        AV_WN32A(              &var,     v32); \
+        AV_WN32A(&((uint8_t *) &var)[4], v32); \
+        break; \
+    } \
+    case 16: { \
+        uint32_t v32 = val * 0x01010101; \
+        AV_WN32A(              &var,      v32); \
+        AV_WN32A(&((uint8_t *) &var)[4],  v32); \
+        AV_WN32A(&((uint8_t *) &var)[8],  v32); \
+        AV_WN32A(&((uint8_t *) &var)[12], v32); \
+        break; \
+    } \
+    }
+#endif
+
+    switch (bwh_tab[1][b->bs][0]) {
+#define SET_CTXS(dir, off, n) \
+    do { \
+        SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
+        SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
+        SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
+        if (!s->keyframe && !s->intraonly) { \
+            SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
+            SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
+            SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
+            if (!b->intra) { \
+                SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
+                if (s->filtermode == FILTER_SWITCHABLE) { \
+                    SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
+                } \
+            } \
+        } \
+    } while (0)
+    case 1: SET_CTXS(above, col, 1); break;
+    case 2: SET_CTXS(above, col, 2); break;
+    case 4: SET_CTXS(above, col, 4); break;
+    case 8: SET_CTXS(above, col, 8); break;
     }
+    switch (bwh_tab[1][b->bs][1]) {
+    case 1: SET_CTXS(left, row7, 1); break;
+    case 2: SET_CTXS(left, row7, 2); break;
+    case 4: SET_CTXS(left, row7, 4); break;
+    case 8: SET_CTXS(left, row7, 8); break;
+    }
+#undef SPLAT_CTX
+#undef SET_CTXS
 
-    // FIXME this can probably be optimized
-    memset(&s->above_skip_ctx[col], b->skip, w4);
-    memset(&s->left_skip_ctx[row7], b->skip, h4);
-    memset(&s->above_txfm_ctx[col], b->tx, w4);
-    memset(&s->left_txfm_ctx[row7], b->tx, h4);
-    memset(&s->above_partition_ctx[col], above_ctx[b->bs], w4);
-    memset(&s->left_partition_ctx[row7], left_ctx[b->bs], h4);
     if (!s->keyframe && !s->intraonly) {
-        memset(&s->above_intra_ctx[col], b->intra, w4);
-        memset(&s->left_intra_ctx[row7], b->intra, h4);
-        memset(&s->above_comp_ctx[col], b->comp, w4);
-        memset(&s->left_comp_ctx[row7], b->comp, h4);
-        memset(&s->above_mode_ctx[col], b->mode[3], w4);
-        memset(&s->left_mode_ctx[row7], b->mode[3], h4);
-        if (s->filtermode == FILTER_SWITCHABLE && !b->intra ) {
-            memset(&s->above_filter_ctx[col], b->filter, w4);
-            memset(&s->left_filter_ctx[row7], b->filter, h4);
-            b->filter = vp9_filter_lut[b->filter];
-        }
         if (b->bs > BS_8x8) {
             int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
 
@@ -1841,14 +1982,6 @@ static void decode_mode(AVCodecContext *ctx)
                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
             }
         }
-
-        if (!b->intra) { // FIXME write 0xff or -1 if intra, so we can use this
-                         // as a direct check in above branches
-            int vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
-
-            memset(&s->above_ref_ctx[col], vref, w4);
-            memset(&s->left_ref_ctx[row7], vref, h4);
-        }
     }
 
     // FIXME kinda ugly
@@ -1878,12 +2011,13 @@ static void decode_mode(AVCodecContext *ctx)
     }
 }
 
-// FIXME remove tx argument, and merge cnt/eob arguments?
-static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
-                           enum TxfmMode tx, unsigned (*cnt)[6][3],
-                           unsigned (*eob)[6][2], uint8_t (*p)[6][11],
-                           int nnz, const int16_t *scan, const int16_t (*nb)[2],
-                           const int16_t *band_counts, const int16_t *qmul)
+// FIXME merge cnt/eob arguments?
+static av_always_inline int
+decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
+                        int is_tx32x32, unsigned (*cnt)[6][3],
+                        unsigned (*eob)[6][2], uint8_t (*p)[6][11],
+                        int nnz, const int16_t *scan, const int16_t (*nb)[2],
+                        const int16_t *band_counts, const int16_t *qmul)
 {
     int i = 0, band = 0, band_left = band_counts[band];
     uint8_t *tp = p[0][nnz];
@@ -1975,7 +2109,7 @@ static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
         }
         if (!--band_left)
             band_left = band_counts[++band];
-        if (tx == TX_32X32) // FIXME slow
+        if (is_tx32x32)
             coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
         else
             coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
@@ -1986,6 +2120,26 @@ static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
     return i;
 }
 
+static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
+                           unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                           uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                           const int16_t (*nb)[2], const int16_t *band_counts,
+                           const int16_t *qmul)
+{
+    return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
+                             unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                             uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                             const int16_t (*nb)[2], const int16_t *band_counts,
+                             const int16_t *qmul)
+{
+    return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
 static void decode_coeffs(AVCodecContext *ctx)
 {
     VP9Context *s = ctx->priv_data;
@@ -1997,8 +2151,7 @@ static void decode_coeffs(AVCodecContext *ctx)
     int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
     int end_x = FFMIN(2 * (s->cols - col), w4);
     int end_y = FFMIN(2 * (s->rows - row), h4);
-    int n, pl, x, y, step1d = 1 << b->tx, step = 1 << (b->tx * 2);
-    int uvstep1d = 1 << b->uvtx, uvstep = 1 << (b->uvtx * 2), res;
+    int n, pl, x, y, res;
     int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
     int tx = 4 * s->lossless + b->tx;
     const int16_t * const *yscans = vp9_scans[tx];
@@ -2016,37 +2169,101 @@ static void decode_coeffs(AVCodecContext *ctx)
     const int16_t *y_band_counts = band_counts[b->tx];
     const int16_t *uv_band_counts = band_counts[b->uvtx];
 
-    /* y tokens */
-    if (b->tx > TX_4X4) { // FIXME slow
-        for (y = 0; y < end_y; y += step1d)
-            for (x = 1; x < step1d; x++)
-                l[y] |= l[y + x];
-        for (x = 0; x < end_x; x += step1d)
-            for (y = 1; y < step1d; y++)
-                a[x] |= a[x + y];
+#define MERGE(la, end, step, rd) \
+    for (n = 0; n < end; n += step) \
+        la[n] = !!rd(&la[n])
+#define MERGE_CTX(step, rd) \
+    do { \
+        MERGE(l, end_y, step, rd); \
+        MERGE(a, end_x, step, rd); \
+    } while (0)
+
+#define DECODE_Y_COEF_LOOP(step, mode_index, v) \
+    for (n = 0, y = 0; y < end_y; y += step) { \
+        for (x = 0; x < end_x; x += step, n += step * step) { \
+            enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
+            res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
+                                     c, e, p, a[x] + l[y], yscans[txtp], \
+                                     ynbs[txtp], y_band_counts, qmul[0]); \
+            a[x] = l[y] = !!res; \
+            if (step >= 4) { \
+                AV_WN16A(&s->eob[n], res); \
+            } else { \
+                s->eob[n] = res; \
+            } \
+        } \
     }
-    for (n = 0, y = 0; y < end_y; y += step1d) {
-        for (x = 0; x < end_x; x += step1d, n += step) {
-            enum TxfmType txtp = vp9_intra_txfm_type[b->mode[b->tx == TX_4X4 &&
-                                                             b->bs > BS_8x8 ?
-                                                             n : 0]];
-            int nnz = a[x] + l[y];
-            res = decode_coeffs_b(&s->c, s->block + 16 * n, 16 * step,
-                                  b->tx, c, e, p, nnz, yscans[txtp],
-                                  ynbs[txtp], y_band_counts, qmul[0]);
-            a[x] = l[y] = !!res;
-            if (b->tx > TX_8X8) {
-                AV_WN16A(&s->eob[n], res);
-            } else {
-                s->eob[n] = res;
-            }
-        }
+
+#define SPLAT(la, end, step, cond) \
+    if (step == 2) { \
+        for (n = 1; n < end; n += step) \
+            la[n] = la[n - 1]; \
+    } else if (step == 4) { \
+        if (cond) { \
+            for (n = 0; n < end; n += step) \
+                AV_WN32A(&la[n], la[n] * 0x01010101); \
+        } else { \
+            for (n = 0; n < end; n += step) \
+                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
+        } \
+    } else /* step == 8 */ { \
+        if (cond) { \
+            if (HAVE_FAST_64BIT) { \
+                for (n = 0; n < end; n += step) \
+                    AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
+            } else { \
+                for (n = 0; n < end; n += step) { \
+                    uint32_t v32 = la[n] * 0x01010101; \
+                    AV_WN32A(&la[n],     v32); \
+                    AV_WN32A(&la[n + 4], v32); \
+                } \
+            } \
+        } else { \
+            for (n = 0; n < end; n += step) \
+                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
+        } \
     }
-    if (b->tx > TX_4X4) { // FIXME slow
-        for (y = 0; y < end_y; y += step1d)
-            memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, step1d - 1));
-        for (x = 0; x < end_x; x += step1d)
-            memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, step1d - 1));
+#define SPLAT_CTX(step) \
+    do { \
+        SPLAT(a, end_x, step, end_x == w4); \
+        SPLAT(l, end_y, step, end_y == h4); \
+    } while (0)
+
+    /* y tokens */
+    switch (b->tx) {
+    case TX_4X4:
+        DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
+        break;
+    case TX_8X8:
+        MERGE_CTX(2, AV_RN16A);
+        DECODE_Y_COEF_LOOP(2, 0,);
+        SPLAT_CTX(2);
+        break;
+    case TX_16X16:
+        MERGE_CTX(4, AV_RN32A);
+        DECODE_Y_COEF_LOOP(4, 0,);
+        SPLAT_CTX(4);
+        break;
+    case TX_32X32:
+        MERGE_CTX(8, AV_RN64A);
+        DECODE_Y_COEF_LOOP(8, 0, 32);
+        SPLAT_CTX(8);
+        break;
+    }
+
+#define DECODE_UV_COEF_LOOP(step) \
+    for (n = 0, y = 0; y < end_y; y += step) { \
+        for (x = 0; x < end_x; x += step, n += step * step) { \
+            res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
+                                  16 * step * step, c, e, p, a[x] + l[y], \
+                                  uvscan, uvnb, uv_band_counts, qmul[1]); \
+            a[x] = l[y] = !!res; \
+            if (step >= 4) { \
+                AV_WN16A(&s->uveob[pl][n], res); \
+            } else { \
+                s->uveob[pl][n] = res; \
+            } \
+        } \
     }
 
     p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
@@ -2059,33 +2276,31 @@ static void decode_coeffs(AVCodecContext *ctx)
     for (pl = 0; pl < 2; pl++) {
         a = &s->above_uv_nnz_ctx[pl][col];
         l = &s->left_uv_nnz_ctx[pl][row & 7];
-        if (b->uvtx > TX_4X4) { // FIXME slow
-            for (y = 0; y < end_y; y += uvstep1d)
-                for (x = 1; x < uvstep1d; x++)
-                    l[y] |= l[y + x];
-            for (x = 0; x < end_x; x += uvstep1d)
-                for (y = 1; y < uvstep1d; y++)
-                    a[x] |= a[x + y];
-        }
-        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
-            for (x = 0; x < end_x; x += uvstep1d, n += uvstep) {
-                int nnz = a[x] + l[y];
-                res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n,
-                                      16 * uvstep, b->uvtx, c, e, p, nnz,
-                                      uvscan, uvnb, uv_band_counts, qmul[1]);
-                a[x] = l[y] = !!res;
-                if (b->uvtx > TX_8X8) {
-                    AV_WN16A(&s->uveob[pl][n], res);
-                } else {
-                    s->uveob[pl][n] = res;
-                }
-            }
-        }
-        if (b->uvtx > TX_4X4) { // FIXME slow
-            for (y = 0; y < end_y; y += uvstep1d)
-                memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, uvstep1d - 1));
-            for (x = 0; x < end_x; x += uvstep1d)
-                memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, uvstep1d - 1));
+        switch (b->uvtx) {
+        case TX_4X4:
+            DECODE_UV_COEF_LOOP(1);
+            break;
+        case TX_8X8:
+            MERGE_CTX(2, AV_RN16A);
+            DECODE_UV_COEF_LOOP(2);
+            SPLAT_CTX(2);
+            break;
+        case TX_16X16:
+            MERGE_CTX(4, AV_RN32A);
+            DECODE_UV_COEF_LOOP(4);
+            SPLAT_CTX(4);
+            break;
+        case TX_32X32:
+            MERGE_CTX(8, AV_RN64A);
+            // a 64x64 (max) uv block can ever only contain 1 tx32x32 block
+            // so there is no need to loop
+            res = decode_coeffs_b32(&s->c, s->uvblock[pl],
+                                    1024, c, e, p, a[0] + l[0],
+                                    uvscan, uvnb, uv_band_counts, qmul[1]);
+            a[0] = l[0] = !!res;
+            AV_WN16A(&s->uveob[pl][0], res);
+            SPLAT_CTX(8);
+            break;
         }
     }
 }
@@ -2211,11 +2426,11 @@ static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **
 
             if (n_px_need <= n_px_have) {
                 for (i = 0; i < n_px_need; i++)
-                    l[i] = dst[i * stride - 1];
+                    l[n_px_need - 1 - i] = dst[i * stride - 1];
             } else {
                 for (i = 0; i < n_px_have; i++)
-                    l[i] = dst[i * stride - 1];
-                memset(&l[i], l[i - 1], n_px_need - n_px_have);
+                    l[n_px_need - 1 - i] = dst[i * stride - 1];
+                memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
             }
         } else {
             memset(l, 129, 4 << tx);
@@ -2237,6 +2452,8 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
     int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
     int uvstep1d = 1 << b->uvtx, p;
     uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
+    LOCAL_ALIGNED_32(uint8_t, a_buf, [64]);
+    LOCAL_ALIGNED_32(uint8_t, l, [32]);
 
     for (n = 0, y = 0; y < end_y; y += step1d) {
         uint8_t *ptr = dst, *ptr_r = dst_r;
@@ -2244,8 +2461,7 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
                                ptr_r += 4 * step1d, n += step) {
             int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
                                y * 2 + x : 0];
-            LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
-            uint8_t *a = &a_buf[16], l[32];
+            uint8_t *a = &a_buf[32];
             enum TxfmType txtp = vp9_intra_txfm_type[mode];
             int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
 
@@ -2276,8 +2492,7 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
             for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
                                    ptr_r += 4 * uvstep1d, n += step) {
                 int mode = b->uvmode;
-                LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
-                uint8_t *a = &a_buf[16], l[32];
+                uint8_t *a = &a_buf[16];
                 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
 
                 mode = check_intra_mode(s, mode, &a, ptr_r,
@@ -2381,48 +2596,53 @@ static void inter_recon(AVCodecContext *ctx)
     VP9Context *s = ctx->priv_data;
     VP9Block *b = s->b;
     int row = s->row, col = s->col;
-    ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]];
-    AVFrame *ref1 = tref1->f;
-    ThreadFrame *tref2 = b->comp ? &s->refs[s->refidx[b->ref[1]]] : NULL;
-    AVFrame *ref2 = b->comp ? tref2->f : NULL;
-    int w = ctx->width, h = ctx->height;
+    ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
+    AVFrame *ref1 = tref1->f, *ref2;
+    int w1 = ref1->width, h1 = ref1->height, w2, h2;
     ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
 
+    if (b->comp) {
+        tref2 = &s->refs[s->refidx[b->ref[1]]];
+        ref2 = tref2->f;
+        w2 = ref2->width;
+        h2 = ref2->height;
+    }
+
     // y inter pred
     if (b->bs > BS_8x8) {
         if (b->bs == BS_8x4) {
             mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 8, 4, w, h);
+                        row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
             mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
                         s->dst[0] + 4 * ls_y, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w, h);
+                        (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
 
             if (b->comp) {
                 mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 8, 4, w, h);
+                            row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
                 mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
                             s->dst[0] + 4 * ls_y, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w, h);
+                            (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
             }
         } else if (b->bs == BS_4x8) {
             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 4, 8, w, h);
+                        row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w, h);
+                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
 
             if (b->comp) {
                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 4, 8, w, h);
+                            row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w, h);
+                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
             }
         } else {
             av_assert2(b->bs == BS_4x4);
@@ -2431,34 +2651,34 @@ static void inter_recon(AVCodecContext *ctx)
             // do a w8 instead of a w4 call
             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 4, 4, w, h);
+                        row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w, h);
+                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
             mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
                         s->dst[0] + 4 * ls_y, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w, h);
+                        (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
             mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
                         s->dst[0] + 4 * ls_y + 4, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w, h);
+                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
 
             if (b->comp) {
                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 4, 4, w, h);
+                            row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w, h);
+                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
                             s->dst[0] + 4 * ls_y, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w, h);
+                            (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
                             s->dst[0] + 4 * ls_y + 4, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w, h);
+                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
             }
         }
     } else {
@@ -2467,12 +2687,12 @@ static void inter_recon(AVCodecContext *ctx)
 
         mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
                     ref1->data[0], ref1->linesize[0], tref1,
-                    row << 3, col << 3, &b->mv[0][0],bw, bh, w, h);
+                    row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1);
 
         if (b->comp)
             mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
                         ref2->data[0], ref2->linesize[0], tref2,
-                        row << 3, col << 3, &b->mv[0][1], bw, bh, w, h);
+                        row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
     }
 
     // uv inter pred
@@ -2481,8 +2701,12 @@ static void inter_recon(AVCodecContext *ctx)
         int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
         VP56mv mvuv;
 
-        w = (w + 1) >> 1;
-        h = (h + 1) >> 1;
+        w1 = (w1 + 1) >> 1;
+        h1 = (h1 + 1) >> 1;
+        if (b->comp) {
+            w2 = (w2 + 1) >> 1;
+            h2 = (h2 + 1) >> 1;
+        }
         if (b->bs > BS_8x8) {
             mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
             mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
@@ -2494,7 +2718,7 @@ static void inter_recon(AVCodecContext *ctx)
                       s->dst[1], s->dst[2], ls_uv,
                       ref1->data[1], ref1->linesize[1],
                       ref1->data[2], ref1->linesize[2], tref1,
-                      row << 2, col << 2, &mvuv, bw, bh, w, h);
+                      row << 2, col << 2, &mvuv, bw, bh, w1, h1);
 
         if (b->comp) {
             if (b->bs > BS_8x8) {
@@ -2507,7 +2731,7 @@ static void inter_recon(AVCodecContext *ctx)
                           s->dst[1], s->dst[2], ls_uv,
                           ref2->data[1], ref2->linesize[1],
                           ref2->data[2], ref2->linesize[2], tref2,
-                          row << 2, col << 2, &mvuv, bw, bh, w, h);
+                          row << 2, col << 2, &mvuv, bw, bh, w2, h2);
         }
     }
 
@@ -2698,7 +2922,7 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
     VP9Context *s = ctx->priv_data;
     VP9Block *b = s->b;
     enum BlockSize bs = bl * 3 + bp;
-    int y, w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
+    int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
     int emu[2];
     AVFrame *f = s->frames[CUR_FRAME].tf.f;
 
@@ -2720,13 +2944,34 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
         if (!b->skip) {
             decode_coeffs(ctx);
         } else {
-            int pl;
+            int row7 = s->row7;
+
+#define SPLAT_ZERO_CTX(v, n) \
+    switch (n) { \
+    case 1:  v = 0;          break; \
+    case 2:  AV_ZERO16(&v);  break; \
+    case 4:  AV_ZERO32(&v);  break; \
+    case 8:  AV_ZERO64(&v);  break; \
+    case 16: AV_ZERO128(&v); break; \
+    }
+#define SPLAT_ZERO_YUV(dir, var, off, n) \
+    do { \
+        SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
+        SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
+        SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
+    } while (0)
 
-            memset(&s->above_y_nnz_ctx[col * 2], 0, w4 * 2);
-            memset(&s->left_y_nnz_ctx[(row & 7) << 1], 0, h4 * 2);
-            for (pl = 0; pl < 2; pl++) {
-                memset(&s->above_uv_nnz_ctx[pl][col], 0, w4);
-                memset(&s->left_uv_nnz_ctx[pl][row & 7], 0, h4);
+            switch (w4) {
+            case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
+            case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
+            case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
+            case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
+            }
+            switch (h4) {
+            case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
+            case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
+            case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
+            case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
             }
         }
         if (s->pass == 1) {
@@ -2746,9 +2991,9 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
     // allows to support emu-edge and so on even if we have large block
     // overhangs
     emu[0] = (col + w4) * 8 > f->linesize[0] ||
-             (row + h4) > s->rows + 2 * !(ctx->flags & CODEC_FLAG_EMU_EDGE);
+             (row + h4) > s->rows;
     emu[1] = (col + w4) * 4 > f->linesize[1] ||
-             (row + h4) > s->rows + 2 * !(ctx->flags & CODEC_FLAG_EMU_EDGE);
+             (row + h4) > s->rows;
     if (emu[0]) {
         s->dst[0] = s->tmp_y;
         s->y_stride = 64;
@@ -2806,12 +3051,11 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
         (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
                                                     [b->mode[3] != ZEROMV]) > 0) {
         int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
-        int skip_inter = !b->intra && b->skip;
+        int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
 
-        for (y = 0; y < h4; y++)
-            memset(&lflvl->level[((row & 7) + y) * 8 + (col & 7)], lvl, w4);
-        mask_edges(lflvl, 0, row & 7, col & 7, x_end, y_end, 0, 0, b->tx, skip_inter);
-        mask_edges(lflvl, 1, row & 7, col & 7, x_end, y_end,
+        setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
+        mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
+        mask_edges(lflvl, 1, row7, col7, x_end, y_end,
                    s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
                    s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
                    b->uvtx, skip_inter);
@@ -3449,6 +3693,13 @@ static void adapt_probs(VP9Context *s)
     }
 }
 
+static void free_buffers(VP9Context *s)
+{
+    av_freep(&s->intra_pred_data[0]);
+    av_freep(&s->b_base);
+    av_freep(&s->block_base);
+}
+
 static av_cold int vp9_decode_free(AVCodecContext *ctx)
 {
     VP9Context *s = ctx->priv_data;
@@ -3467,11 +3718,9 @@ static av_cold int vp9_decode_free(AVCodecContext *ctx)
             ff_thread_release_buffer(ctx, &s->next_refs[i]);
         av_frame_free(&s->next_refs[i].f);
     }
-    av_freep(&s->above_partition_ctx);
+    free_buffers(s);
     av_freep(&s->c_b);
     s->c_b_size = 0;
-    av_freep(&s->b_base);
-    av_freep(&s->block_base);
 
     return 0;
 }
@@ -3544,16 +3793,24 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
     memset(s->above_segpred_ctx, 0, s->cols);
     s->pass = s->uses_2pass =
         ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
+    if ((res = update_block_buffers(ctx)) < 0) {
+        av_log(ctx, AV_LOG_ERROR,
+               "Failed to allocate block buffers\n");
+        return res;
+    }
     if (s->refreshctx && s->parallelmode) {
         int j, k, l, m;
 
-        for (i = 0; i < 4; i++)
+        for (i = 0; i < 4; i++) {
             for (j = 0; j < 2; j++)
                 for (k = 0; k < 2; k++)
                     for (l = 0; l < 6; l++)
                         for (m = 0; m < 6; m++)
                             memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
                                    s->prob.coef[i][j][k][l][m], 3);
+            if (s->txfmmode == i)
+                break;
+        }
         s->prob_ctx[s->framectxid].p = s->prob.p;
         ff_thread_finish_setup(ctx);
     }
@@ -3583,11 +3840,15 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                         data += 4;
                         size -= 4;
                     }
-                    if (tile_size > size)
+                    if (tile_size > size) {
+                        ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
                         return AVERROR_INVALIDDATA;
+                    }
                     ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
-                    if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) // marker bit
+                    if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
+                        ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
                         return AVERROR_INVALIDDATA;
+                    }
                     data += tile_size;
                     size -= tile_size;
                 }
@@ -3758,7 +4019,11 @@ static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecCo
     int i, res;
     VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
 
-    // FIXME scalability, size, etc.
+    // detect size changes in other threads
+    if (s->intra_pred_data[0] &&
+        (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
+        free_buffers(s);
+    }
 
     for (i = 0; i < 2; i++) {
         if (s->frames[i].tf.f->data[0])