author     Jocelyn Turcotte <jocelyn.turcotte@digia.com>  2014-08-08 14:30:41 +0200
committer  Jocelyn Turcotte <jocelyn.turcotte@digia.com>  2014-08-12 13:49:54 +0200
commit     ab0a50979b9eb4dfa3320eff7e187e41efedf7a9 (patch)
tree       498dfb8a97ff3361a9f7486863a52bb4e26bb898 /chromium/third_party/libvpx/source/libvpx/vp8
parent     4ce69f7403811819800e7c5ae1318b2647e778d1 (diff)
Update Chromium to beta version 37.0.2062.68
Change-Id: I188e3b5aff1bec75566014291b654eb19f5bc8ca
Reviewed-by: Andras Becsi <andras.becsi@digia.com>
Diffstat (limited to 'chromium/third_party/libvpx/source/libvpx/vp8')
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm | 8
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm | 8
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm | 8
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm | 8
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/bilinearfilter_arm.h | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/dequantize_arm.c | 17
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/loopfilter_arm.c | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm | 357
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm | 130
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm | 135
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm | 183
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c | 696
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm | 19
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm | 59
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm | 34
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm | 43
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem_neon.c | 59
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm | 54
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c | 42
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm | 131
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c | 142
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm | 34
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c | 25
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm | 3
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.asm | 87
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c | 102
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm | 24
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm | 117
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c | 111
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm | 469
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c | 625
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad16_neon.asm | 207
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad8_neon.asm | 209
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c | 184
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/save_reg_neon.asm | 36
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm | 139
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c | 123
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm | 490
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm | 422
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm | 473
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm | 524
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c | 1752
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.asm | 276
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c | 323
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm | 8
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm | 23
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm | 9
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/reconintra_arm.c | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/coefupdateprobs.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/common.h | 17
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/default_coef_probs.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/entropymv.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/extend.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/filter.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/findnearmv.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/header.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/invtrans.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/modecont.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/mv.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h | 10
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/onyxc_int.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/onyxd.h | 6
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c | 11
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/pragmas.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/quant_common.h | 13
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra4x4.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl | 561
-rwxr-xr-x  chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.sh | 542
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/swapyv12buffer.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/systemdependent.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/treecoder.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/vp8_entropymodedata.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/x86/filter_x86.h | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/x86/loopfilter_mmx.asm | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/x86/loopfilter_sse2.asm | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/x86/postproc_mmx.asm | 7
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/x86/postproc_sse2.asm | 7
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/x86/recon_sse2.asm | 79
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/dboolhuff.c | 7
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/dboolhuff.h | 25
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c (renamed from chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodframe.c) | 29
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.h | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/decoderthreading.h | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.h | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/ec_types.h | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.h | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c | 27
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h | 16
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/treereader.h | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/denoising_neon.c | 161
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.asm | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm | 9
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h | 13
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/boolhuff.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct_value_cost.h | 13
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct_value_tokens.h | 13
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/defaultcoefcounts.h | 13
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c | 71
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h | 11
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeintra.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemv.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c | 16
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/lookahead.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/mr_dissim.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/mr_dissim.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c | 160
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h | 15
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/psnr.c | 31
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/psnr.h | 17
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c | 24
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.h | 11
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c | 9
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.h | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/treewriter.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c | 33
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_ssse3.asm | 138
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_ssse3.c | 114
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk | 60
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c | 10
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c | 242
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk | 16
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/vp8dx.mk | 2
155 files changed, 6171 insertions, 6053 deletions
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.h
index ea93c252280..93e99d76b1d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.h
@@ -9,15 +9,23 @@
*/
-#ifndef __INC_ALLOCCOMMON_H
-#define __INC_ALLOCCOMMON_H
+#ifndef VP8_COMMON_ALLOCCOMMON_H_
+#define VP8_COMMON_ALLOCCOMMON_H_
#include "onyxc_int.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
void vp8_create_common(VP8_COMMON *oci);
void vp8_remove_common(VP8_COMMON *oci);
void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);
int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);
void vp8_setup_version(VP8_COMMON *oci);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_ALLOCCOMMON_H_
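
[Editor's note] The guard rewrite above repeats across all the vp8 headers in this patch. As a minimal sketch (the header and function names here are hypothetical, not from the patch), the pattern each header converges on is:

/* Hypothetical header illustrating the new guard style. */
#ifndef VP8_COMMON_EXAMPLE_H_
#define VP8_COMMON_EXAMPLE_H_

#ifdef __cplusplus
extern "C" {
#endif

void vp8_example_function(void);  /* keeps C linkage for C++ callers */

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // VP8_COMMON_EXAMPLE_H_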
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
index dc84c30daf5..39919579f80 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
@@ -53,7 +53,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; substract negative differences from sum
+ subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -77,7 +77,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -101,7 +101,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -127,7 +127,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
index adc353d2006..915ee499309 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
@@ -51,7 +51,7 @@ loop
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
- sub r4, r4, r7 ; substract negative differences from sum
+ sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
@@ -77,7 +77,7 @@ loop
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
- sub r4, r4, r7 ; substract negative differences from sum
+ sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
index dd2ce685c8b..3668dc517a9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -58,7 +58,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; substract negative differences from sum
+ subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -89,7 +89,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -120,7 +120,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -153,7 +153,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
index f972d9b5bac..b4e0959d1b3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -69,7 +69,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; substract negative differences from sum
+ subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -111,7 +111,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -153,7 +153,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -195,7 +195,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
index f5da9c09eed..10863e2ec80 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -59,7 +59,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; substract negative differences from sum
+ subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -90,7 +90,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -121,7 +121,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -154,7 +154,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
+ sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
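
[Editor's note] The comment fixes above all land in the ARMv6 variance kernels. What those loops accumulate is sketched below in scalar C; the helper name and flat loop are hypothetical (the real kernels process four pixels per instruction), but the sum/sse bookkeeping matches the assembly comments.

/* Scalar sketch of the sum and sum-of-squared-differences accumulation. */
static unsigned int variance_ref(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 int w, int h, unsigned int *sse) {
  int r, c, sum = 0;
  unsigned int sq = 0;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int diff = src[c] - ref[c];
      sum += diff;        /* positive differences added, negative subtracted */
      sq += diff * diff;  /* squared differences accumulate into sse */
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = sq;
  /* variance = sse - sum^2 / N */
  return sq - (unsigned int)(((long long)sum * sum) / (w * h));
}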
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/bilinearfilter_arm.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/bilinearfilter_arm.h
index b7155d3f0a5..6b84e6f3b55 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/bilinearfilter_arm.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/bilinearfilter_arm.h
@@ -9,8 +9,12 @@
*/
-#ifndef BILINEARFILTER_ARM_H
-#define BILINEARFILTER_ARM_H
+#ifndef VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
+#define VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
extern void vp8_filter_block2d_bil_first_pass_armv6
(
@@ -32,4 +36,8 @@ extern void vp8_filter_block2d_bil_second_pass_armv6
const short *vp8_filter
);
-#endif /* BILINEARFILTER_ARM_H */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/dequantize_arm.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/dequantize_arm.c
index 70e72aa4774..1f8157f0b11 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/dequantize_arm.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/dequantize_arm.c
@@ -12,26 +12,9 @@
#include "vpx_config.h"
#include "vp8/common/blockd.h"
-#if HAVE_NEON
-extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
-#endif
-
#if HAVE_MEDIA
extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
-#endif
-
-#if HAVE_NEON
-
-void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
-{
- short *DQ = d->dqcoeff;
- short *Q = d->qcoeff;
-
- vp8_dequantize_b_loop_neon(Q, DQC, DQ);
-}
-#endif
-#if HAVE_MEDIA
void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
{
short *DQ = d->dqcoeff;
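
[Editor's note] The hunk above drops the NEON wrapper for the dequantize loop (replaced by intrinsics elsewhere in this patch) and keeps only the ARMv6 path. For orientation, the operation both wrappers feed is a plain element-wise multiply, sketched here with a hypothetical helper name:

/* Scalar sketch of vp8's dequantize loop: DQ[i] = Q[i] * DQC[i]. */
static void dequantize_b_ref(const short *Q, const short *DQC, short *DQ) {
  int i;
  for (i = 0; i < 16; ++i)  /* 16 coefficients per 4x4 block */
    DQ[i] = Q[i] * DQC[i];
}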
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/loopfilter_arm.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/loopfilter_arm.c
index 3bdc9675e71..f37ca636edc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/loopfilter_arm.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/loopfilter_arm.c
@@ -25,20 +25,24 @@ extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
#endif
-#if HAVE_NEON
+#if HAVE_NEON_ASM || HAVE_NEON
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh);
typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh,
unsigned char *v);
+#endif
+#if HAVE_NEON_ASM
extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
-extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
-extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
-
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+#endif
+
+#if HAVE_NEON
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
#endif
@@ -146,7 +150,9 @@ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign
if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
+#endif
+#if HAVE_NEON_ASM
/* Horizontal B Filtering */
void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
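
[Editor's note] The loopfilter_arm.c hunks split the NEON declarations into two groups: kernels still implemented in assembly stay behind HAVE_NEON_ASM, while the macroblock filters, now ported to intrinsics, build under plain HAVE_NEON. A condensed sketch of the resulting gating (declarations taken from the diff, comments added):

#if HAVE_NEON_ASM  /* kernels still provided by .asm files */
extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
#endif
#if HAVE_NEON      /* kernels now provided by C intrinsics */
extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
#endif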
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
deleted file mode 100644
index e392786d43d..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
+++ /dev/null
@@ -1,357 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_bilinear_predict16x16_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; r4 unsigned char *dst_ptr,
-; stack(r5) int dst_pitch
-
-|vp8_bilinear_predict16x16_neon| PROC
- push {r4-r5, lr}
-
- adr r12, bifilter16_coeff
- ldr r4, [sp, #12] ;load parameters from stack
- ldr r5, [sp, #16] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_bfilter16x16_only
-
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
-
- vld1.s32 {d31}, [r2] ;load first_pass filter
-
- beq firstpass_bfilter16x16_only
-
- sub sp, sp, #272 ;reserve space on stack for temporary storage
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- mov lr, sp
- vld1.u8 {d5, d6, d7}, [r0], r1
-
- mov r2, #3 ;loop counter
- vld1.u8 {d8, d9, d10}, [r0], r1
-
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- vdup.8 d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (17x16)
-filt_blk2d_fp16x16_loop_neon
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d5, d0
- vmull.u8 q10, d6, d0
- vmull.u8 q11, d8, d0
- vmull.u8 q12, d9, d0
- vmull.u8 q13, d11, d0
- vmull.u8 q14, d12, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
- vext.8 d11, d11, d12, #1
-
- vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1])
- vmlal.u8 q9, d5, d1
- vmlal.u8 q11, d8, d1
- vmlal.u8 q13, d11, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
- vext.8 d12, d12, d13, #1
-
- vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1])
- vmlal.u8 q10, d6, d1
- vmlal.u8 q12, d9, d1
- vmlal.u8 q14, d12, d1
-
- subs r2, r2, #1
-
- vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d15, q8, #7
- vqrshrn.u16 d16, q9, #7
- vqrshrn.u16 d17, q10, #7
- vqrshrn.u16 d18, q11, #7
- vqrshrn.u16 d19, q12, #7
- vqrshrn.u16 d20, q13, #7
-
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- vqrshrn.u16 d21, q14, #7
- vld1.u8 {d5, d6, d7}, [r0], r1
-
- vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
- vld1.u8 {d8, d9, d10}, [r0], r1
- vst1.u8 {d18, d19, d20, d21}, [lr]!
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- bne filt_blk2d_fp16x16_loop_neon
-
-;First-pass filtering for rest 5 lines
- vld1.u8 {d14, d15, d16}, [r0], r1
-
- vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0])
- vmull.u8 q10, d3, d0
- vmull.u8 q11, d5, d0
- vmull.u8 q12, d6, d0
- vmull.u8 q13, d8, d0
- vmull.u8 q14, d9, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
-
- vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp8_filter[1])
- vmlal.u8 q11, d5, d1
- vmlal.u8 q13, d8, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
-
- vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp8_filter[1])
- vmlal.u8 q12, d6, d1
- vmlal.u8 q14, d9, d1
-
- vmull.u8 q1, d11, d0
- vmull.u8 q2, d12, d0
- vmull.u8 q3, d14, d0
- vmull.u8 q4, d15, d0
-
- vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
- vext.8 d14, d14, d15, #1
-
- vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp8_filter[1])
- vmlal.u8 q3, d14, d1
-
- vext.8 d12, d12, d13, #1
- vext.8 d15, d15, d16, #1
-
- vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp8_filter[1])
- vmlal.u8 q4, d15, d1
-
- vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d11, q10, #7
- vqrshrn.u16 d12, q11, #7
- vqrshrn.u16 d13, q12, #7
- vqrshrn.u16 d14, q13, #7
- vqrshrn.u16 d15, q14, #7
- vqrshrn.u16 d16, q1, #7
- vqrshrn.u16 d17, q2, #7
- vqrshrn.u16 d18, q3, #7
- vqrshrn.u16 d19, q4, #7
-
- vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
- vst1.u8 {d14, d15, d16, d17}, [lr]!
- vst1.u8 {d18, d19}, [lr]!
-
-;Second pass: 16x16
-;secondpass_filter
- add r3, r12, r3, lsl #3
- sub lr, lr, #272
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
-
- vld1.u8 {d22, d23}, [lr]! ;load src data
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
- mov r12, #4 ;loop counter
-
-filt_blk2d_sp16x16_loop_neon
- vld1.u8 {d24, d25}, [lr]!
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
- vld1.u8 {d26, d27}, [lr]!
- vmull.u8 q2, d23, d0
- vld1.u8 {d28, d29}, [lr]!
- vmull.u8 q3, d24, d0
- vld1.u8 {d30, d31}, [lr]!
-
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
- vmlal.u8 q2, d25, d1
- vmlal.u8 q3, d26, d1
- vmlal.u8 q4, d27, d1
- vmlal.u8 q5, d28, d1
- vmlal.u8 q6, d29, d1
- vmlal.u8 q7, d30, d1
- vmlal.u8 q8, d31, d1
-
- subs r12, r12, #1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2, d3}, [r4], r5 ;store result
- vst1.u8 {d4, d5}, [r4], r5
- vst1.u8 {d6, d7}, [r4], r5
- vmov q11, q15
- vst1.u8 {d8, d9}, [r4], r5
-
- bne filt_blk2d_sp16x16_loop_neon
-
- add sp, sp, #272
-
- pop {r4-r5,pc}
-
-;--------------------
-firstpass_bfilter16x16_only
- mov r2, #4 ;loop counter
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vdup.8 d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (16x16)
-filt_blk2d_fpo16x16_loop_neon
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- vld1.u8 {d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10}, [r0], r1
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d5, d0
- vmull.u8 q10, d6, d0
- vmull.u8 q11, d8, d0
- vmull.u8 q12, d9, d0
- vmull.u8 q13, d11, d0
- vmull.u8 q14, d12, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
- vext.8 d11, d11, d12, #1
-
- vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1])
- vmlal.u8 q9, d5, d1
- vmlal.u8 q11, d8, d1
- vmlal.u8 q13, d11, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
- vext.8 d12, d12, d13, #1
-
- vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1])
- vmlal.u8 q10, d6, d1
- vmlal.u8 q12, d9, d1
- vmlal.u8 q14, d12, d1
-
- subs r2, r2, #1
-
- vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d15, q8, #7
- vqrshrn.u16 d16, q9, #7
- vqrshrn.u16 d17, q10, #7
- vqrshrn.u16 d18, q11, #7
- vqrshrn.u16 d19, q12, #7
- vqrshrn.u16 d20, q13, #7
- vst1.u8 {d14, d15}, [r4], r5 ;store result
- vqrshrn.u16 d21, q14, #7
-
- vst1.u8 {d16, d17}, [r4], r5
- vst1.u8 {d18, d19}, [r4], r5
- vst1.u8 {d20, d21}, [r4], r5
-
- bne filt_blk2d_fpo16x16_loop_neon
- pop {r4-r5,pc}
-
-;---------------------
-secondpass_bfilter16x16_only
-;Second pass: 16x16
-;secondpass_filter
- add r3, r12, r3, lsl #3
- mov r12, #4 ;loop counter
- vld1.u32 {d31}, [r3] ;load second_pass filter
- vld1.u8 {d22, d23}, [r0], r1 ;load src data
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
-filt_blk2d_spo16x16_loop_neon
- vld1.u8 {d24, d25}, [r0], r1
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
- vld1.u8 {d26, d27}, [r0], r1
- vmull.u8 q2, d23, d0
- vld1.u8 {d28, d29}, [r0], r1
- vmull.u8 q3, d24, d0
- vld1.u8 {d30, d31}, [r0], r1
-
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
- vmlal.u8 q2, d25, d1
- vmlal.u8 q3, d26, d1
- vmlal.u8 q4, d27, d1
- vmlal.u8 q5, d28, d1
- vmlal.u8 q6, d29, d1
- vmlal.u8 q7, d30, d1
- vmlal.u8 q8, d31, d1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2, d3}, [r4], r5 ;store result
- subs r12, r12, #1
- vst1.u8 {d4, d5}, [r4], r5
- vmov q11, q15
- vst1.u8 {d6, d7}, [r4], r5
- vst1.u8 {d8, d9}, [r4], r5
-
- bne filt_blk2d_spo16x16_loop_neon
- pop {r4-r5,pc}
-
- ENDP
-
-;-----------------
-
-bifilter16_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
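
[Editor's note] This file and the three below are the assembly bilinear predictors removed in favor of the intrinsics file added afterwards. Each output sample they produce is a two-tap blend whose coefficients come in pairs (128 - offset*16, offset*16) from the bifilter*_coeff tables, followed by the rounding shift seen in the vqrshrn.u16 #7 instructions. A scalar sketch of one tap (hypothetical helper):

/* One bilinear tap: round to nearest, then shift by the 7-bit filter scale. */
static unsigned char bilinear_tap(unsigned char a, unsigned char b,
                                  int f0, int f1) {  /* f0 + f1 == 128 */
  return (unsigned char)((a * f0 + b * f1 + 64) >> 7);
}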
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
deleted file mode 100644
index 0ac62436f97..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
+++ /dev/null
@@ -1,130 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_bilinear_predict4x4_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; r4 unsigned char *dst_ptr,
-; stack(lr) int dst_pitch
-
-|vp8_bilinear_predict4x4_neon| PROC
- push {r4, lr}
-
- adr r12, bifilter4_coeff
- ldr r4, [sp, #8] ;load parameters from stack
- ldr lr, [sp, #12] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (5x4)
- vld1.u8 {d2}, [r0], r1 ;load src data
- add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes)
-
- vld1.u8 {d3}, [r0], r1
- vld1.u32 {d31}, [r2] ;first_pass filter
-
- vld1.u8 {d4}, [r0], r1
- vdup.8 d0, d31[0] ;first_pass filter (d0-d1)
- vld1.u8 {d5}, [r0], r1
- vdup.8 d1, d31[4]
- vld1.u8 {d6}, [r0], r1
-
- vshr.u64 q4, q1, #8 ;construct src_ptr[1]
- vshr.u64 q5, q2, #8
- vshr.u64 d12, d6, #8
-
- vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0])
- vzip.32 d4, d5
- vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1])
- vzip.32 d10, d11
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
- vmull.u8 q8, d4, d0
- vmull.u8 q9, d6, d0
-
- vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1])
- vmlal.u8 q8, d10, d1
- vmlal.u8 q9, d12, d1
-
- vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d29, q8, #7
- vqrshrn.u16 d30, q9, #7
-
-;Second pass: 4x4
-secondpass_filter
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- beq skip_secondpass_filter
-
- add r3, r12, r3, lsl #3 ;calculate Vfilter location
- vld1.u32 {d31}, [r3] ;load second_pass filter
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5)
- vdup.8 d1, d31[4]
-
- vmull.u8 q1, d28, d0
- vmull.u8 q2, d29, d0
-
- vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step]
- vext.8 d27, d29, d30, #4
-
- vmlal.u8 q1, d26, d1
- vmlal.u8 q2, d27, d1
-
- add r0, r4, lr
- add r1, r0, lr
- add r2, r1, lr
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
-
- vst1.32 {d2[0]}, [r4] ;store result
- vst1.32 {d2[1]}, [r0]
- vst1.32 {d3[0]}, [r1]
- vst1.32 {d3[1]}, [r2]
-
- pop {r4, pc}
-
-;--------------------
-skip_firstpass_filter
-
- vld1.32 {d28[0]}, [r0], r1 ;load src data
- vld1.32 {d28[1]}, [r0], r1
- vld1.32 {d29[0]}, [r0], r1
- vld1.32 {d29[1]}, [r0], r1
- vld1.32 {d30[0]}, [r0], r1
-
- b secondpass_filter
-
-;---------------------
-skip_secondpass_filter
- vst1.32 {d28[0]}, [r4], lr ;store result
- vst1.32 {d28[1]}, [r4], lr
- vst1.32 {d29[0]}, [r4], lr
- vst1.32 {d29[1]}, [r4], lr
-
- pop {r4, pc}
-
- ENDP
-
-;-----------------
-
-bifilter4_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
deleted file mode 100644
index 41f5c45ffe7..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
+++ /dev/null
@@ -1,135 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_bilinear_predict8x4_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; r4 unsigned char *dst_ptr,
-; stack(lr) int dst_pitch
-
-|vp8_bilinear_predict8x4_neon| PROC
- push {r4, lr}
-
- adr r12, bifilter8x4_coeff
- ldr r4, [sp, #8] ;load parameters from stack
- ldr lr, [sp, #12] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (5x8)
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vld1.u32 {d31}, [r2] ;load first_pass filter
- vld1.u8 {q2}, [r0], r1
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {q3}, [r0], r1
- vdup.8 d1, d31[4]
- vld1.u8 {q4}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
- vld1.u8 {q5}, [r0], r1
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
- vmull.u8 q10, d10, d0
-
- vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
- vext.8 d11, d10, d11, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
- vmlal.u8 q10, d11, d1
-
- vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d23, q7, #7
- vqrshrn.u16 d24, q8, #7
- vqrshrn.u16 d25, q9, #7
- vqrshrn.u16 d26, q10, #7
-
-;Second pass: 4x8
-secondpass_filter
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- beq skip_secondpass_filter
-
- add r3, r12, r3, lsl #3
- add r0, r4, lr
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
- add r1, r0, lr
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
- vmull.u8 q2, d23, d0
- vmull.u8 q3, d24, d0
- vmull.u8 q4, d25, d0
-
- vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
- vmlal.u8 q2, d24, d1
- vmlal.u8 q3, d25, d1
- vmlal.u8 q4, d26, d1
-
- add r2, r1, lr
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
-
- vst1.u8 {d2}, [r4] ;store result
- vst1.u8 {d3}, [r0]
- vst1.u8 {d4}, [r1]
- vst1.u8 {d5}, [r2]
-
- pop {r4, pc}
-
-;--------------------
-skip_firstpass_filter
- vld1.u8 {d22}, [r0], r1 ;load src data
- vld1.u8 {d23}, [r0], r1
- vld1.u8 {d24}, [r0], r1
- vld1.u8 {d25}, [r0], r1
- vld1.u8 {d26}, [r0], r1
-
- b secondpass_filter
-
-;---------------------
-skip_secondpass_filter
- vst1.u8 {d22}, [r4], lr ;store result
- vst1.u8 {d23}, [r4], lr
- vst1.u8 {d24}, [r4], lr
- vst1.u8 {d25}, [r4], lr
-
- pop {r4, pc}
-
- ENDP
-
-;-----------------
-
-bifilter8x4_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
deleted file mode 100644
index c4711bc4d4a..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
+++ /dev/null
@@ -1,183 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_bilinear_predict8x8_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; r4 unsigned char *dst_ptr,
-; stack(lr) int dst_pitch
-
-|vp8_bilinear_predict8x8_neon| PROC
- push {r4, lr}
-
- adr r12, bifilter8_coeff
- ldr r4, [sp, #8] ;load parameters from stack
- ldr lr, [sp, #12] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (9x8)
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vld1.u32 {d31}, [r2] ;load first_pass filter
- vld1.u8 {q2}, [r0], r1
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {q3}, [r0], r1
- vdup.8 d1, d31[4]
- vld1.u8 {q4}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
-
- vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
- vld1.u8 {q2}, [r0], r1
- vqrshrn.u16 d23, q7, #7
- vld1.u8 {q3}, [r0], r1
- vqrshrn.u16 d24, q8, #7
- vld1.u8 {q4}, [r0], r1
- vqrshrn.u16 d25, q9, #7
-
- ;first_pass filtering on the rest 5-line data
- vld1.u8 {q5}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
- vmull.u8 q10, d10, d0
-
- vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
- vext.8 d11, d10, d11, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
- vmlal.u8 q10, d11, d1
-
- vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d27, q7, #7
- vqrshrn.u16 d28, q8, #7
- vqrshrn.u16 d29, q9, #7
- vqrshrn.u16 d30, q10, #7
-
-;Second pass: 8x8
-secondpass_filter
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- beq skip_secondpass_filter
-
- add r3, r12, r3, lsl #3
- add r0, r4, lr
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
- add r1, r0, lr
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
- vmull.u8 q2, d23, d0
- vmull.u8 q3, d24, d0
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
- vmlal.u8 q2, d24, d1
- vmlal.u8 q3, d25, d1
- vmlal.u8 q4, d26, d1
- vmlal.u8 q5, d27, d1
- vmlal.u8 q6, d28, d1
- vmlal.u8 q7, d29, d1
- vmlal.u8 q8, d30, d1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2}, [r4] ;store result
- vst1.u8 {d3}, [r0]
- vst1.u8 {d4}, [r1], lr
- vst1.u8 {d5}, [r1], lr
- vst1.u8 {d6}, [r1], lr
- vst1.u8 {d7}, [r1], lr
- vst1.u8 {d8}, [r1], lr
- vst1.u8 {d9}, [r1], lr
-
- pop {r4, pc}
-
-;--------------------
-skip_firstpass_filter
- vld1.u8 {d22}, [r0], r1 ;load src data
- vld1.u8 {d23}, [r0], r1
- vld1.u8 {d24}, [r0], r1
- vld1.u8 {d25}, [r0], r1
- vld1.u8 {d26}, [r0], r1
- vld1.u8 {d27}, [r0], r1
- vld1.u8 {d28}, [r0], r1
- vld1.u8 {d29}, [r0], r1
- vld1.u8 {d30}, [r0], r1
-
- b secondpass_filter
-
-;---------------------
-skip_secondpass_filter
- vst1.u8 {d22}, [r4], lr ;store result
- vst1.u8 {d23}, [r4], lr
- vst1.u8 {d24}, [r4], lr
- vst1.u8 {d25}, [r4], lr
- vst1.u8 {d26}, [r4], lr
- vst1.u8 {d27}, [r4], lr
- vst1.u8 {d28}, [r4], lr
- vst1.u8 {d29}, [r4], lr
-
- pop {r4, pc}
-
- ENDP
-
-;-----------------
-
-bifilter8_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
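
[Editor's note] The replacement for the four deleted files follows. The conversion from assembly to intrinsics is essentially mechanical: each vmull.u8 / vmlal.u8 / vqrshrn.u16 #7 triple becomes the corresponding arm_neon.h calls, as in this sketch (hypothetical helper built from standard intrinsics):

#include <arm_neon.h>

/* Widening multiply-accumulate and rounding narrow, one 8-pixel row. */
static uint8x8_t bilinear_row_neon(uint8x8_t s0, uint8x8_t s1,
                                   uint8x8_t f0, uint8x8_t f1) {
  uint16x8_t acc = vmull_u8(s0, f0);   /* src[0] * filter[0], u8 -> u16 */
  acc = vmlal_u8(acc, s1, f1);         /* += src[1] * filter[1] */
  return vqrshrn_n_u16(acc, 7);        /* (acc + 64) >> 7, saturate to u8 */
}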
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
new file mode 100644
index 00000000000..e1c3c2be7df
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const uint16_t bifilter4_coeff[8][2] = {
+ {128, 0},
+ {112, 16},
+ { 96, 32},
+ { 80, 48},
+ { 64, 64},
+ { 48, 80},
+ { 32, 96},
+ { 16, 112}
+};
+
+void vp8_bilinear_predict4x4_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8;
+ uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8;
+ uint32x2_t d28u32, d29u32, d30u32;
+ uint8x16_t q1u8, q2u8;
+ uint16x8_t q1u16, q2u16;
+ uint16x8_t q7u16, q8u16, q9u16;
+ uint64x2_t q4u64, q5u64;
+ uint64x1_t d12u64;
+ uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2;
+
+ if (xoffset == 0) { // skip_1stpass_filter
+ d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0);
+ src_ptr += src_pixels_per_line;
+ d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1);
+ src_ptr += src_pixels_per_line;
+ d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0);
+ src_ptr += src_pixels_per_line;
+ d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1);
+ src_ptr += src_pixels_per_line;
+ d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0);
+ d28u8 = vreinterpret_u8_u32(d28u32);
+ d29u8 = vreinterpret_u8_u32(d29u32);
+ d30u8 = vreinterpret_u8_u32(d30u32);
+ } else {
+ d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d6u8 = vld1_u8(src_ptr);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+
+ d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+
+ q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
+ q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
+ d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)),
+ vreinterpret_u32_u8(vget_high_u8(q1u8)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)),
+ vreinterpret_u32_u8(vget_high_u8(q2u8)));
+ d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)),
+ vreinterpret_u32_u64(vget_high_u64(q4u64)));
+ d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),
+ vreinterpret_u32_u64(vget_high_u64(q5u64)));
+
+ q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
+ q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
+ q9u16 = vmull_u8(d6u8, d0u8);
+
+ q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8);
+ q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8);
+ q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8);
+
+ d28u8 = vqrshrn_n_u16(q7u16, 7);
+ d29u8 = vqrshrn_n_u16(q8u16, 7);
+ d30u8 = vqrshrn_n_u16(q9u16, 7);
+ }
+
+ // secondpass_filter
+ if (yoffset == 0) { // skip_2ndpass_filter
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1);
+ } else {
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q1u16 = vmull_u8(d28u8, d0u8);
+ q2u16 = vmull_u8(d29u8, d0u8);
+
+ d26u8 = vext_u8(d28u8, d29u8, 4);
+ d27u8 = vext_u8(d29u8, d30u8, 4);
+
+ q1u16 = vmlal_u8(q1u16, d26u8, d1u8);
+ q2u16 = vmlal_u8(q2u16, d27u8, d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
+ }
+ return;
+}
+
+void vp8_bilinear_predict8x4_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8;
+ uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8;
+ uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16;
+ uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+
+ if (xoffset == 0) { // skip_1stpass_filter
+ d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d26u8 = vld1_u8(src_ptr);
+ } else {
+ q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q5u8 = vld1q_u8(src_ptr);
+
+ d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+
+ q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+
+ d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+ d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+ d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+ q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+
+ d22u8 = vqrshrn_n_u16(q6u16, 7);
+ d23u8 = vqrshrn_n_u16(q7u16, 7);
+ d24u8 = vqrshrn_n_u16(q8u16, 7);
+ d25u8 = vqrshrn_n_u16(q9u16, 7);
+ d26u8 = vqrshrn_n_u16(q10u16, 7);
+ }
+
+ // secondpass_filter
+ if (yoffset == 0) { // skip_2ndpass_filter
+ vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d25u8);
+ } else {
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q1u16 = vmull_u8(d22u8, d0u8);
+ q2u16 = vmull_u8(d23u8, d0u8);
+ q3u16 = vmull_u8(d24u8, d0u8);
+ q4u16 = vmull_u8(d25u8, d0u8);
+
+ q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
+ q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+
+ vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d5u8);
+ }
+ return;
+}
+
+void vp8_bilinear_predict8x8_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8;
+ uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8;
+ uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16;
+ uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+
+ if (xoffset == 0) { // skip_1stpass_filter
+ d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d30u8 = vld1_u8(src_ptr);
+ } else {
+ q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+
+ d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+
+ q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+
+ d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+ d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+ d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+
+ q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+
+ d22u8 = vqrshrn_n_u16(q6u16, 7);
+ d23u8 = vqrshrn_n_u16(q7u16, 7);
+ d24u8 = vqrshrn_n_u16(q8u16, 7);
+ d25u8 = vqrshrn_n_u16(q9u16, 7);
+
+        // first-pass filtering on the remaining 5 lines of data
+ q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q5u8 = vld1q_u8(src_ptr);
+
+ q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+
+ d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+ d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+ d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+ q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+
+ d26u8 = vqrshrn_n_u16(q6u16, 7);
+ d27u8 = vqrshrn_n_u16(q7u16, 7);
+ d28u8 = vqrshrn_n_u16(q8u16, 7);
+ d29u8 = vqrshrn_n_u16(q9u16, 7);
+ d30u8 = vqrshrn_n_u16(q10u16, 7);
+ }
+
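+    // d22u8-d30u8 hold the nine first-pass rows (8 outputs + 1) for the
+    // vertical filter below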
+ // secondpass_filter
+ if (yoffset == 0) { // skip_2ndpass_filter
+ vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d29u8);
+ } else {
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q1u16 = vmull_u8(d22u8, d0u8);
+ q2u16 = vmull_u8(d23u8, d0u8);
+ q3u16 = vmull_u8(d24u8, d0u8);
+ q4u16 = vmull_u8(d25u8, d0u8);
+ q5u16 = vmull_u8(d26u8, d0u8);
+ q6u16 = vmull_u8(d27u8, d0u8);
+ q7u16 = vmull_u8(d28u8, d0u8);
+ q8u16 = vmull_u8(d29u8, d0u8);
+
+ q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
+ q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+ q5u16 = vmlal_u8(q5u16, d27u8, d1u8);
+ q6u16 = vmlal_u8(q6u16, d28u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d29u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d30u8, d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d9u8);
+ }
+ return;
+}
+
+void vp8_bilinear_predict16x16_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ int i;
+ unsigned char tmp[272];
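+    // 17 rows x 16 columns of first-pass output; the extra row feeds the
+    // second (vertical) pass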
+ unsigned char *tmpp;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
+ uint8x8_t d19u8, d20u8, d21u8;
+ uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+ uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
+ uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
+
+ if (xoffset == 0) { // secondpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q11u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ for (i = 4; i > 0; i--) {
+ q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
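+            // carry the last loaded row forward: it is the top row of the
+            // next group of four output rows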
+ q11u8 = q15u8;
+
+ vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
+ }
+ return;
+ }
+
+ if (yoffset == 0) { // firstpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+        for (i = 4; i > 0; i--) {
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+            q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch;
+ }
+ return;
+ }
+
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+    // First pass: output_height lines x output_width columns (17x16);
+    // each row reads 17 source pixels for the two-tap filter
+ tmpp = tmp;
+ for (i = 3; i > 0; i--) {
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16;
+ }
+
+    // First-pass filtering for the remaining 5 lines
+ d14u8 = vld1_u8(src_ptr);
+ d15u8 = vld1_u8(src_ptr + 8);
+ d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+ q9u16 = vmull_u8(d2u8, d0u8);
+ q10u16 = vmull_u8(d3u8, d0u8);
+ q11u16 = vmull_u8(d5u8, d0u8);
+ q12u16 = vmull_u8(d6u8, d0u8);
+ q13u16 = vmull_u8(d8u8, d0u8);
+ q14u16 = vmull_u8(d9u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+
+ q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+
+ q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
+
+ q1u16 = vmull_u8(d11u8, d0u8);
+ q2u16 = vmull_u8(d12u8, d0u8);
+ q3u16 = vmull_u8(d14u8, d0u8);
+ q4u16 = vmull_u8(d15u8, d0u8);
+
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+ d14u8 = vext_u8(d14u8, d15u8, 1);
+
+ q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
+
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+ d15u8 = vext_u8(d15u8, d16u8, 1);
+
+ q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
+
+ d10u8 = vqrshrn_n_u16(q9u16, 7);
+ d11u8 = vqrshrn_n_u16(q10u16, 7);
+ d12u8 = vqrshrn_n_u16(q11u16, 7);
+ d13u8 = vqrshrn_n_u16(q12u16, 7);
+ d14u8 = vqrshrn_n_u16(q13u16, 7);
+ d15u8 = vqrshrn_n_u16(q14u16, 7);
+ d16u8 = vqrshrn_n_u16(q1u16, 7);
+ d17u8 = vqrshrn_n_u16(q2u16, 7);
+ d18u8 = vqrshrn_n_u16(q3u16, 7);
+ d19u8 = vqrshrn_n_u16(q4u16, 7);
+
+ q5u8 = vcombine_u8(d10u8, d11u8);
+ q6u8 = vcombine_u8(d12u8, d13u8);
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+
+ vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8);
+
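+    // tmp now holds all 17 first-pass rows; filter them vertically into dst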
+ // secondpass_filter
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ tmpp = tmp;
+ q11u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ for (i = 4; i > 0; i--) {
+ q12u8 = vld1q_u8(tmpp); tmpp += 16;
+ q13u8 = vld1q_u8(tmpp); tmpp += 16;
+ q14u8 = vld1q_u8(tmpp); tmpp += 16;
+ q15u8 = vld1q_u8(tmpp); tmpp += 16;
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
+ q11u8 = q15u8;
+
+ vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
+ }
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
index e3ea91fe6c0..a8730aa04ef 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
@@ -26,6 +26,7 @@
|vp8_build_intra_predictors_mby_neon_func| PROC
push {r4-r8, lr}
+ vpush {d8-d15}
cmp r3, #0
beq case_dc_pred
@@ -37,8 +38,8 @@
beq case_tm_pred
case_dc_pred
- ldr r4, [sp, #24] ; Up
- ldr r5, [sp, #28] ; Left
+ ldr r4, [sp, #88] ; Up
+ ldr r5, [sp, #92] ; Left
; Default the DC average to 128
mov r12, #128
@@ -143,6 +144,7 @@ skip_dc_pred_up_left
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
+ vpop {d8-d15}
pop {r4-r8,pc}
case_v_pred
; Copy down above row
@@ -165,6 +167,7 @@ case_v_pred
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
+ vpop {d8-d15}
pop {r4-r8,pc}
case_h_pred
@@ -224,6 +227,7 @@ case_h_pred
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
+ vpop {d8-d15}
pop {r4-r8,pc}
case_tm_pred
@@ -293,6 +297,7 @@ case_tm_pred_loop
subs r12, r12, #1
bne case_tm_pred_loop
+ vpop {d8-d15}
pop {r4-r8,pc}
ENDP
@@ -307,6 +312,7 @@ case_tm_pred_loop
|vp8_build_intra_predictors_mby_s_neon_func| PROC
push {r4-r8, lr}
+ vpush {d8-d15}
mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
@@ -320,8 +326,8 @@ case_tm_pred_loop
beq case_tm_pred_s
case_dc_pred_s
- ldr r4, [sp, #24] ; Up
- ldr r5, [sp, #28] ; Left
+ ldr r4, [sp, #88] ; Up
+ ldr r5, [sp, #92] ; Left
; Default the DC average to 128
mov r12, #128
@@ -426,6 +432,7 @@ skip_dc_pred_up_left_s
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
+ vpop {d8-d15}
pop {r4-r8,pc}
case_v_pred_s
; Copy down above row
@@ -448,6 +455,8 @@ case_v_pred_s
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
+
+ vpop {d8-d15}
pop {r4-r8,pc}
case_h_pred_s
@@ -507,6 +516,7 @@ case_h_pred_s
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
+ vpop {d8-d15}
pop {r4-r8,pc}
case_tm_pred_s
@@ -576,6 +586,7 @@ case_tm_pred_loop_s
subs r12, r12, #1
bne case_tm_pred_loop_s
+ vpop {d8-d15}
pop {r4-r8,pc}
ENDP
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm
deleted file mode 100644
index bda4b965442..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm
+++ /dev/null
@@ -1,59 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_copy_mem16x16_neon|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem16x16_neon| PROC
-
- vld1.u8 {q0}, [r0], r1
- vld1.u8 {q1}, [r0], r1
- vld1.u8 {q2}, [r0], r1
- vst1.u8 {q0}, [r2], r3
- vld1.u8 {q3}, [r0], r1
- vst1.u8 {q1}, [r2], r3
- vld1.u8 {q4}, [r0], r1
- vst1.u8 {q2}, [r2], r3
- vld1.u8 {q5}, [r0], r1
- vst1.u8 {q3}, [r2], r3
- vld1.u8 {q6}, [r0], r1
- vst1.u8 {q4}, [r2], r3
- vld1.u8 {q7}, [r0], r1
- vst1.u8 {q5}, [r2], r3
- vld1.u8 {q8}, [r0], r1
- vst1.u8 {q6}, [r2], r3
- vld1.u8 {q9}, [r0], r1
- vst1.u8 {q7}, [r2], r3
- vld1.u8 {q10}, [r0], r1
- vst1.u8 {q8}, [r2], r3
- vld1.u8 {q11}, [r0], r1
- vst1.u8 {q9}, [r2], r3
- vld1.u8 {q12}, [r0], r1
- vst1.u8 {q10}, [r2], r3
- vld1.u8 {q13}, [r0], r1
- vst1.u8 {q11}, [r2], r3
- vld1.u8 {q14}, [r0], r1
- vst1.u8 {q12}, [r2], r3
- vld1.u8 {q15}, [r0], r1
- vst1.u8 {q13}, [r2], r3
- vst1.u8 {q14}, [r2], r3
- vst1.u8 {q15}, [r2], r3
-
- mov pc, lr
-
- ENDP ; |vp8_copy_mem16x16_neon|
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm
deleted file mode 100644
index 35c0f6708a5..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_copy_mem8x4_neon|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem8x4_neon| PROC
- vld1.u8 {d0}, [r0], r1
- vld1.u8 {d1}, [r0], r1
- vst1.u8 {d0}, [r2], r3
- vld1.u8 {d2}, [r0], r1
- vst1.u8 {d1}, [r2], r3
- vld1.u8 {d3}, [r0], r1
- vst1.u8 {d2}, [r2], r3
- vst1.u8 {d3}, [r2], r3
-
- mov pc, lr
-
- ENDP ; |vp8_copy_mem8x4_neon|
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm
deleted file mode 100644
index 1f5b9411bb5..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm
+++ /dev/null
@@ -1,43 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_copy_mem8x8_neon|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem8x8_neon| PROC
-
- vld1.u8 {d0}, [r0], r1
- vld1.u8 {d1}, [r0], r1
- vst1.u8 {d0}, [r2], r3
- vld1.u8 {d2}, [r0], r1
- vst1.u8 {d1}, [r2], r3
- vld1.u8 {d3}, [r0], r1
- vst1.u8 {d2}, [r2], r3
- vld1.u8 {d4}, [r0], r1
- vst1.u8 {d3}, [r2], r3
- vld1.u8 {d5}, [r0], r1
- vst1.u8 {d4}, [r2], r3
- vld1.u8 {d6}, [r0], r1
- vst1.u8 {d5}, [r2], r3
- vld1.u8 {d7}, [r0], r1
- vst1.u8 {d6}, [r2], r3
- vst1.u8 {d7}, [r2], r3
-
- mov pc, lr
-
- ENDP ; |vp8_copy_mem8x8_neon|
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem_neon.c
new file mode 100644
index 00000000000..deced115c14
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_copy_mem8x4_neon(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ uint8x8_t vtmp;
+ int r;
+
+ for (r = 0; r < 4; r++) {
+ vtmp = vld1_u8(src);
+ vst1_u8(dst, vtmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp8_copy_mem8x8_neon(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ uint8x8_t vtmp;
+ int r;
+
+ for (r = 0; r < 8; r++) {
+ vtmp = vld1_u8(src);
+ vst1_u8(dst, vtmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp8_copy_mem16x16_neon(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ int r;
+ uint8x16_t qtmp;
+
+ for (r = 0; r < 16; r++) {
+ qtmp = vld1q_u8(src);
+ vst1q_u8(dst, qtmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm
deleted file mode 100644
index 79ff02c6940..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm
+++ /dev/null
@@ -1,54 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dc_only_idct_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
-; int pred_stride, unsigned char *dst_ptr,
-; int dst_stride)
-
-; r0 input_dc
-; r1 pred_ptr
-; r2 pred_stride
-; r3 dst_ptr
-; sp dst_stride
-
-|vp8_dc_only_idct_add_neon| PROC
- add r0, r0, #4
- asr r0, r0, #3
- ldr r12, [sp]
- vdup.16 q0, r0
-
- vld1.32 {d2[0]}, [r1], r2
- vld1.32 {d2[1]}, [r1], r2
- vld1.32 {d4[0]}, [r1], r2
- vld1.32 {d4[1]}, [r1]
-
- vaddw.u8 q1, q0, d2
- vaddw.u8 q2, q0, d4
-
- vqmovun.s16 d2, q1
- vqmovun.s16 d4, q2
-
- vst1.32 {d2[0]}, [r3], r12
- vst1.32 {d2[1]}, [r3], r12
- vst1.32 {d4[0]}, [r3], r12
- vst1.32 {d4[1]}, [r3]
-
- bx lr
-
- ENDP
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c
new file mode 100644
index 00000000000..ad5f41d7dee
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_dc_only_idct_add_neon(
+ int16_t input_dc,
+ unsigned char *pred_ptr,
+ int pred_stride,
+ unsigned char *dst_ptr,
+ int dst_stride) {
+ int i;
+ uint16_t a1 = ((input_dc + 4) >> 3);
+ uint32x2_t d2u32 = vdup_n_u32(0);
+ uint8x8_t d2u8;
+ uint16x8_t q1u16;
+ uint16x8_t qAdd;
+
+ qAdd = vdupq_n_u16(a1);
+
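+  // each iteration widens two 4-pixel predictor rows to 16 bits, adds a1,
+  // and saturates back to bytes; a negative DC wraps in the uint16_t a1 but
+  // is recovered when the sums are reinterpreted as signed for vqmovun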
+ for (i = 0; i < 2; i++) {
+ d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);
+ pred_ptr += pred_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
+ pred_ptr += pred_stride;
+
+ q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
+ dst_ptr += dst_stride;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
+ dst_ptr += dst_stride;
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm
deleted file mode 100644
index 602cce67697..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm
+++ /dev/null
@@ -1,131 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequant_idct_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_idct_add_neon(short *input, short *dq,
-; unsigned char *dest, int stride)
-; r0 short *input,
-; r1 short *dq,
-; r2 unsigned char *dest
-; r3 int stride
-
-|vp8_dequant_idct_add_neon| PROC
- vld1.16 {q3, q4}, [r0]
- vld1.16 {q5, q6}, [r1]
-
- add r1, r2, r3 ; r1 = dest + stride
- lsl r3, #1 ; 2x stride
-
- vld1.32 {d14[0]}, [r2], r3
- vld1.32 {d14[1]}, [r1], r3
- vld1.32 {d15[0]}, [r2]
- vld1.32 {d15[1]}, [r1]
-
- adr r12, cospi8sqrt2minus1 ; pointer to the first constant
-
- vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
- vmul.i16 q2, q4, q6
-
-;|short_idct4x4llm_neon| PROC
- vld1.16 {d0}, [r12]
- vswp d3, d4 ;q2(vp[4] vp[12])
-
- vqdmulh.s16 q3, q2, d0[2]
- vqdmulh.s16 q4, q2, d0[0]
-
- vqadd.s16 d12, d2, d3 ;a1
- vqsub.s16 d13, d2, d3 ;b1
-
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vqadd.s16 q3, q3, q2
- vqadd.s16 q4, q4, q2
-
- vqsub.s16 d10, d6, d9 ;c1
- vqadd.s16 d11, d7, d8 ;d1
-
- vqadd.s16 d2, d12, d11
- vqadd.s16 d3, d13, d10
- vqsub.s16 d4, d13, d10
- vqsub.s16 d5, d12, d11
-
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
-; memset(input, 0, 32) -- 32bytes
- vmov.i16 q14, #0
-
- vswp d3, d4
- vqdmulh.s16 q3, q2, d0[2]
- vqdmulh.s16 q4, q2, d0[0]
-
- vqadd.s16 d12, d2, d3 ;a1
- vqsub.s16 d13, d2, d3 ;b1
-
- vmov q15, q14
-
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vqadd.s16 q3, q3, q2
- vqadd.s16 q4, q4, q2
-
- vqsub.s16 d10, d6, d9 ;c1
- vqadd.s16 d11, d7, d8 ;d1
-
- vqadd.s16 d2, d12, d11
- vqadd.s16 d3, d13, d10
- vqsub.s16 d4, d13, d10
- vqsub.s16 d5, d12, d11
-
- vst1.16 {q14, q15}, [r0]
-
- vrshr.s16 d2, d2, #3
- vrshr.s16 d3, d3, #3
- vrshr.s16 d4, d4, #3
- vrshr.s16 d5, d5, #3
-
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
- vaddw.u8 q1, q1, d14
- vaddw.u8 q2, q2, d15
-
- sub r2, r2, r3
- sub r1, r1, r3
-
- vqmovun.s16 d0, q1
- vqmovun.s16 d1, q2
-
- vst1.32 {d0[0]}, [r2], r3
- vst1.32 {d0[1]}, [r1], r3
- vst1.32 {d1[0]}, [r2]
- vst1.32 {d1[1]}, [r1]
-
- bx lr
-
- ENDP ; |vp8_dequant_idct_add_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b4e7b
-sinpi8sqrt2 DCD 0x8a8c8a8c
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
new file mode 100644
index 00000000000..58e11922c76
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 35468;
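+// Note: 35468 does not fit in int16_t and wraps to 35468 - 65536; the
+// vshrq_n_s16(, 1) / vqaddq_s16(, q2) sequence below compensates, so the
+// sin term still approximates x * 35468 / 65536.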
+
+void vp8_dequant_idct_add_neon(
+ int16_t *input,
+ int16_t *dq,
+ unsigned char *dst,
+ int stride) {
+ unsigned char *dst0;
+ int32x2_t d14, d15;
+ int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+ int16x8_t q1, q2, q3, q4, q5, q6;
+ int16x8_t qEmpty = vdupq_n_s16(0);
+ int32x2x2_t d2tmp0, d2tmp1;
+ int16x4x2_t d2tmp2, d2tmp3;
+
+ d14 = d15 = vdup_n_s32(0);
+
+  // load the input coefficients and clear them for the next block
+ q3 = vld1q_s16(input);
+ vst1q_s16(input, qEmpty);
+ input += 8;
+ q4 = vld1q_s16(input);
+ vst1q_s16(input, qEmpty);
+
+ // load dq
+ q5 = vld1q_s16(dq);
+ dq += 8;
+ q6 = vld1q_s16(dq);
+
+  // load the 4x4 predictor block from dst as four 32-bit lanes
+ dst0 = dst;
+ d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
+ dst0 += stride;
+ d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
+ dst0 += stride;
+ d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
+ dst0 += stride;
+ d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
+
+ q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
+ vreinterpretq_u16_s16(q5)));
+ q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
+ vreinterpretq_u16_s16(q6)));
+
+ d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
+ d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
+
+ q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
+
+ q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+ q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+ q3 = vshrq_n_s16(q3, 1);
+ q4 = vshrq_n_s16(q4, 1);
+
+ q3 = vqaddq_s16(q3, q2);
+ q4 = vqaddq_s16(q4, q2);
+
+ d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+ d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+ vreinterpret_s16_s32(d2tmp1.val[0]));
+ d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+ vreinterpret_s16_s32(d2tmp1.val[1]));
+
+  // second pass: repeat the transform on the transposed data (columns)
+ q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
+
+ q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+ q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+ d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
+ d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
+
+ q3 = vshrq_n_s16(q3, 1);
+ q4 = vshrq_n_s16(q4, 1);
+
+ q3 = vqaddq_s16(q3, q2);
+ q4 = vqaddq_s16(q4, q2);
+
+ d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+ d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ d2 = vrshr_n_s16(d2, 3);
+ d3 = vrshr_n_s16(d3, 3);
+ d4 = vrshr_n_s16(d4, 3);
+ d5 = vrshr_n_s16(d5, 3);
+
+ d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+ vreinterpret_s16_s32(d2tmp1.val[0]));
+ d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+ vreinterpret_s16_s32(d2tmp1.val[1]));
+
+ q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
+ q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
+
+ q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
+ vreinterpret_u8_s32(d14)));
+ q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
+ vreinterpret_u8_s32(d15)));
+
+ d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
+ d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
+
+ dst0 = dst;
+ vst1_lane_s32((int32_t *)dst0, d14, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d14, 1);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d15, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d15, 1);
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm
deleted file mode 100644
index c8e0c31f29c..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequantize_b_loop_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 short *Q,
-; r1 short *DQC
-; r2 short *DQ
-|vp8_dequantize_b_loop_neon| PROC
- vld1.16 {q0, q1}, [r0]
- vld1.16 {q2, q3}, [r1]
-
- vmul.i16 q4, q0, q2
- vmul.i16 q5, q1, q3
-
- vst1.16 {q4, q5}, [r2]
-
- bx lr
-
- ENDP
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
new file mode 100644
index 00000000000..54e709dd3c3
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/common/blockd.h"
+
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
+ int16x8x2_t qQ, qDQC, qDQ;
+
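+  // the de-interleaving vld2q/vst2q pattern does not affect the result:
+  // the multiply is element-wise, so this simply moves 16 shorts at a time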
+ qQ = vld2q_s16(d->qcoeff);
+ qDQC = vld2q_s16(DQC);
+
+ qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
+ qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
+
+ vst2q_s16(d->dqcoeff, qDQ);
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
index 6c29c55860d..3a3921081c4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
@@ -22,6 +22,7 @@
; r3 stride
|idct_dequant_0_2x_neon| PROC
push {r4, r5}
+ vpush {d8-d15}
add r12, r2, #4
vld1.32 {d2[0]}, [r2], r3
@@ -72,6 +73,7 @@
vst1.32 {d4[1]}, [r2]
vst1.32 {d10[1]}, [r0]
+ vpop {d8-d15}
pop {r4, r5}
bx lr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
index d5dce63f6bd..8da0fa0b7ea 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
@@ -22,6 +22,8 @@
; r2 *dst
; r3 stride
|idct_dequant_full_2x_neon| PROC
+ vpush {d8-d15}
+
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
vld1.16 {q2, q3}, [r0] ; l q
add r0, r0, #32
@@ -184,6 +186,7 @@
vst1.32 {d3[0]}, [r2]
vst1.32 {d3[1]}, [r1]
+ vpop {d8-d15}
bx lr
ENDP ; |idct_dequant_full_2x_neon|
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.asm
deleted file mode 100644
index e8ea2a61976..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.asm
+++ /dev/null
@@ -1,87 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
- EXPORT |vp8_short_inv_walsh4x4_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY ; name this block of code
-
-;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
-|vp8_short_inv_walsh4x4_neon| PROC
-
- ; read in all four lines of values: d0->d3
- vld1.i16 {q0-q1}, [r0@128]
-
- ; first for loop
- vadd.s16 d4, d0, d3 ;a = [0] + [12]
- vadd.s16 d6, d1, d2 ;b = [4] + [8]
- vsub.s16 d5, d0, d3 ;d = [0] - [12]
- vsub.s16 d7, d1, d2 ;c = [4] - [8]
-
- vadd.s16 q0, q2, q3 ; a+b d+c
- vsub.s16 q1, q2, q3 ; a-b d-c
-
- vtrn.32 d0, d2 ;d0: 0 1 8 9
- ;d2: 2 3 10 11
- vtrn.32 d1, d3 ;d1: 4 5 12 13
- ;d3: 6 7 14 15
-
- vtrn.16 d0, d1 ;d0: 0 4 8 12
- ;d1: 1 5 9 13
- vtrn.16 d2, d3 ;d2: 2 6 10 14
- ;d3: 3 7 11 15
-
- ; second for loop
-
- vadd.s16 d4, d0, d3 ;a = [0] + [3]
- vadd.s16 d6, d1, d2 ;b = [1] + [2]
- vsub.s16 d5, d0, d3 ;d = [0] - [3]
- vsub.s16 d7, d1, d2 ;c = [1] - [2]
-
- vmov.i16 q8, #3
-
- vadd.s16 q0, q2, q3 ; a+b d+c
- vsub.s16 q1, q2, q3 ; a-b d-c
-
- vadd.i16 q0, q0, q8 ;e/f += 3
- vadd.i16 q1, q1, q8 ;g/h += 3
-
- vshr.s16 q0, q0, #3 ;e/f >> 3
- vshr.s16 q1, q1, #3 ;g/h >> 3
-
- mov r2, #64
- add r3, r1, #32
-
- vst1.i16 d0[0], [r1],r2
- vst1.i16 d1[0], [r3],r2
- vst1.i16 d2[0], [r1],r2
- vst1.i16 d3[0], [r3],r2
-
- vst1.i16 d0[1], [r1],r2
- vst1.i16 d1[1], [r3],r2
- vst1.i16 d2[1], [r1],r2
- vst1.i16 d3[1], [r3],r2
-
- vst1.i16 d0[2], [r1],r2
- vst1.i16 d1[2], [r3],r2
- vst1.i16 d2[2], [r1],r2
- vst1.i16 d3[2], [r3],r2
-
- vst1.i16 d0[3], [r1],r2
- vst1.i16 d1[3], [r3],r2
- vst1.i16 d2[3], [r1]
- vst1.i16 d3[3], [r3]
-
- bx lr
- ENDP ; |vp8_short_inv_walsh4x4_neon|
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c
new file mode 100644
index 00000000000..6ea9dd712aa
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_inv_walsh4x4_neon(
+ int16_t *input,
+ int16_t *mb_dqcoeff) {
+ int16x8_t q0s16, q1s16, q2s16, q3s16;
+ int16x4_t d4s16, d5s16, d6s16, d7s16;
+ int16x4x2_t v2tmp0, v2tmp1;
+ int32x2x2_t v2tmp2, v2tmp3;
+ int16x8_t qAdd3;
+
+ q0s16 = vld1q_s16(input);
+ q1s16 = vld1q_s16(input + 8);
+
+  // 1st for loop: a = [0]+[12], b = [4]+[8], d = [0]-[12], c = [4]-[8];
+  // rows become a+b, d+c, a-b, d-c
+ d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+ d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+ d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+ d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ q0s16 = vaddq_s16(q2s16, q3s16);
+ q1s16 = vsubq_s16(q2s16, q3s16);
+
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
+ vreinterpret_s32_s16(vget_low_s16(q1s16)));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
+ vreinterpret_s32_s16(vget_high_s16(q1s16)));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
+ vreinterpret_s16_s32(v2tmp3.val[0]));
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
+ vreinterpret_s16_s32(v2tmp3.val[1]));
+
+  // 2nd for loop: a = [0]+[3], b = [1]+[2], d = [0]-[3], c = [1]-[2];
+  // results are (x + 3) >> 3
+ d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ qAdd3 = vdupq_n_s16(3);
+
+ q0s16 = vaddq_s16(q2s16, q3s16);
+ q1s16 = vsubq_s16(q2s16, q3s16);
+
+ q0s16 = vaddq_s16(q0s16, qAdd3);
+ q1s16 = vaddq_s16(q1s16, qAdd3);
+
+ q0s16 = vshrq_n_s16(q0s16, 3);
+ q1s16 = vshrq_n_s16(q1s16, 3);
+
+  // store: each value is the DC coefficient of one 4x4 block, and the
+  // blocks' dqcoeff arrays are 16 shorts apart, hence the stride of 16
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
+ mb_dqcoeff += 16;
+
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
+ mb_dqcoeff += 16;
+
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
+ mb_dqcoeff += 16;
+
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
+ mb_dqcoeff += 16;
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
index e44be0a1e34..c4f09c7753b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
@@ -24,10 +24,12 @@
; sp unsigned char thresh,
|vp8_loop_filter_horizontal_edge_y_neon| PROC
push {lr}
+ vpush {d8-d15}
+
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
- ldr r3, [sp, #4] ; load thresh
+ ldr r3, [sp, #68] ; load thresh
add r12, r2, r1
add r1, r1, r1
@@ -52,6 +54,7 @@
vst1.u8 {q7}, [r2@128], r1 ; store oq0
vst1.u8 {q8}, [r12@128], r1 ; store oq1
+ vpop {d8-d15}
pop {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
@@ -64,10 +67,12 @@
; sp+4 unsigned char *v
|vp8_loop_filter_horizontal_edge_uv_neon| PROC
push {lr}
+ vpush {d8-d15}
+
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
- ldr r12, [sp, #4] ; load thresh
- ldr r2, [sp, #8] ; load v ptr
+ ldr r12, [sp, #68] ; load thresh
+ ldr r2, [sp, #72] ; load v ptr
vdup.u8 q2, r12 ; duplicate thresh
sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
@@ -104,6 +109,7 @@
vst1.u8 {d16}, [r0@64] ; store u oq1
vst1.u8 {d17}, [r2@64] ; store v oq1
+ vpop {d8-d15}
pop {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
@@ -120,11 +126,13 @@
|vp8_loop_filter_vertical_edge_y_neon| PROC
push {lr}
+ vpush {d8-d15}
+
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
sub r2, r0, #4 ; src ptr down by 4 columns
add r1, r1, r1
- ldr r3, [sp, #4] ; load thresh
+ ldr r3, [sp, #68] ; load thresh
add r12, r2, r1, asr #1
vld1.u8 {d6}, [r2], r1
@@ -194,6 +202,7 @@
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
+ vpop {d8-d15}
pop {pc}
ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
@@ -210,9 +219,11 @@
; sp+4 unsigned char *v
|vp8_loop_filter_vertical_edge_uv_neon| PROC
push {lr}
+ vpush {d8-d15}
+
vdup.u8 q0, r2 ; duplicate blimit
sub r12, r0, #4 ; move u pointer down by 4 columns
- ldr r2, [sp, #8] ; load v ptr
+ ldr r2, [sp, #72] ; load v ptr
vdup.u8 q1, r3 ; duplicate limit
sub r3, r2, #4 ; move v pointer down by 4 columns
@@ -233,7 +244,7 @@
vld1.u8 {d20}, [r12]
vld1.u8 {d21}, [r3]
- ldr r12, [sp, #4] ; load thresh
+ ldr r12, [sp, #68] ; load thresh
;transpose to 8x16 matrix
vtrn.32 q3, q7
@@ -281,6 +292,7 @@
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
+ vpop {d8-d15}
pop {pc}
ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
deleted file mode 100644
index adf848b9c34..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ /dev/null
@@ -1,117 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
- EXPORT |vp8_loop_filter_bhs_neon|
- EXPORT |vp8_loop_filter_mbhs_neon|
- ARM
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *s, PRESERVE
-; r1 int p, PRESERVE
-; q1 limit, PRESERVE
-
-|vp8_loop_filter_simple_horizontal_edge_neon| PROC
-
- sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
-
- vld1.u8 {q7}, [r0@128], r1 ; q0
- vld1.u8 {q5}, [r3@128], r1 ; p0
- vld1.u8 {q8}, [r0@128] ; q1
- vld1.u8 {q6}, [r3@128] ; p1
-
- vabd.u8 q15, q6, q7 ; abs(p0 - q0)
- vabd.u8 q14, q5, q8 ; abs(p1 - q1)
-
- vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
- vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
- vmov.u8 q0, #0x80 ; 0x80
- vmov.s16 q13, #3
- vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
- veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
- veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
- veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
- veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
-
- vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
-
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q3, d15, d13
-
- vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
-
- vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
- vmul.s16 q3, q3, q13
-
- vmov.u8 q10, #0x03 ; 0x03
- vmov.u8 q9, #0x04 ; 0x04
-
- vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q3, q3, d9
-
- vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d9, q3
-
- vand q14, q4, q15 ; vp8_filter &= mask
-
- vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
- vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q4, q3, #3 ; Filter1 >>= 3
-
- sub r0, r0, r1
-
- ;calculate output
- vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
- vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1)
-
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q7, q10, q0 ; *oq0 = u^0x80
-
- vst1.u8 {q6}, [r3@128] ; store op0
- vst1.u8 {q7}, [r0@128] ; store oq0
-
- bx lr
- ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
-
-; r0 unsigned char *y
-; r1 int ystride
-; r2 const unsigned char *blimit
-
-|vp8_loop_filter_bhs_neon| PROC
- push {r4, lr}
- ldrb r3, [r2] ; load blim from mem
- vdup.s8 q1, r3 ; duplicate blim
-
- add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
- bl vp8_loop_filter_simple_horizontal_edge_neon
- ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
- add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride
- bl vp8_loop_filter_simple_horizontal_edge_neon
- add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
- pop {r4, lr}
- b vp8_loop_filter_simple_horizontal_edge_neon
- ENDP ;|vp8_loop_filter_bhs_neon|
-
-; r0 unsigned char *y
-; r1 int ystride
-; r2 const unsigned char *blimit
-
-|vp8_loop_filter_mbhs_neon| PROC
- ldrb r3, [r2] ; load blim from mem
- vdup.s8 q1, r3 ; duplicate mblim
- b vp8_loop_filter_simple_horizontal_edge_neon
- ENDP ;|vp8_loop_filter_bhs_neon|
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
new file mode 100644
index 00000000000..b25686ffb88
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit) {
+ uint8_t *sp;
+ uint8x16_t qblimit, q0u8;
+ uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
+ int16x8_t q2s16, q3s16, q13s16;
+ int8x8_t d8s8, d9s8;
+ int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
+
+ qblimit = vdupq_n_u8(*blimit);
+
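+  // load p1, p0, q0, q1 (q5u8..q8u8)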
+ sp = s - (p << 1);
+ q5u8 = vld1q_u8(sp);
+ sp += p;
+ q6u8 = vld1q_u8(sp);
+ sp += p;
+ q7u8 = vld1q_u8(sp);
+ sp += p;
+ q8u8 = vld1q_u8(sp);
+
+ q15u8 = vabdq_u8(q6u8, q7u8);
+ q14u8 = vabdq_u8(q5u8, q8u8);
+
+ q15u8 = vqaddq_u8(q15u8, q15u8);
+ q14u8 = vshrq_n_u8(q14u8, 1);
+ q0u8 = vdupq_n_u8(0x80);
+ q13s16 = vdupq_n_s16(3);
+ q15u8 = vqaddq_u8(q15u8, q14u8);
+
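+  // bias to a signed representation: ps1, ps0, qs0, qs1 = x ^ 0x80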
+ q5u8 = veorq_u8(q5u8, q0u8);
+ q6u8 = veorq_u8(q6u8, q0u8);
+ q7u8 = veorq_u8(q7u8, q0u8);
+ q8u8 = veorq_u8(q8u8, q0u8);
+
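+  // filter mask: abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit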
+ q15u8 = vcgeq_u8(qblimit, q15u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
+ vget_low_s8(vreinterpretq_s8_u8(q6u8)));
+ q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
+ vget_high_s8(vreinterpretq_s8_u8(q6u8)));
+
+ q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
+ vreinterpretq_s8_u8(q8u8));
+
+ q2s16 = vmulq_s16(q2s16, q13s16);
+ q3s16 = vmulq_s16(q3s16, q13s16);
+
+ q10u8 = vdupq_n_u8(3);
+ q9u8 = vdupq_n_u8(4);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
+ q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
+
+ d8s8 = vqmovn_s16(q2s16);
+ d9s8 = vqmovn_s16(q3s16);
+ q4s8 = vcombine_s8(d8s8, d9s8);
+
+ q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
+
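+  // Filter2 = clamp(vp8_filter + 3) >> 3, Filter1 = clamp(vp8_filter + 4) >> 3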
+ q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
+ q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q3s8 = vshrq_n_s8(q3s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
+
+ q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+ vst1q_u8(s, q7u8);
+ s -= p;
+ vst1q_u8(s, q6u8);
+ return;
+}
+
+void vp8_loop_filter_bhs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
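+  // filter the three horizontal block edges inside the macroblock
+  // (rows 4, 8 and 12)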
+ y_ptr += y_stride * 4;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += y_stride * 4;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += y_stride * 4;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
+
+void vp8_loop_filter_mbhs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
index e690df2f7de..78d13c895aa 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -9,7 +9,6 @@
;
- ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
EXPORT |vp8_loop_filter_bvs_neon|
EXPORT |vp8_loop_filter_mbvs_neon|
ARM
@@ -22,6 +21,8 @@
; q1 limit, PRESERVE
|vp8_loop_filter_simple_vertical_edge_neon| PROC
+ vpush {d8-d15}
+
sub r0, r0, #2 ; move src pointer down by 2 columns
add r12, r1, r1
add r3, r0, r1
@@ -120,6 +121,7 @@
vst2.8 {d14[6], d15[6]}, [r0], r12
vst2.8 {d14[7], d15[7]}, [r3]
+ vpop {d8-d15}
bx lr
ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm
deleted file mode 100644
index f41c156df8b..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ /dev/null
@@ -1,469 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
- EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
- EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
- EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-; const unsigned char *blimit,
-; const unsigned char *limit,
-; const unsigned char *thresh)
-; r0 unsigned char *src,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
- push {lr}
- add r1, r1, r1 ; double stride
- ldr r12, [sp, #4] ; load thresh
- sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
- vdup.u8 q2, r12 ; thresh
- add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
-
- vld1.u8 {q3}, [r0@128], r1 ; p3
- vld1.u8 {q4}, [r12@128], r1 ; p2
- vld1.u8 {q5}, [r0@128], r1 ; p1
- vld1.u8 {q6}, [r12@128], r1 ; p0
- vld1.u8 {q7}, [r0@128], r1 ; q0
- vld1.u8 {q8}, [r12@128], r1 ; q1
- vld1.u8 {q9}, [r0@128], r1 ; q2
- vld1.u8 {q10}, [r12@128], r1 ; q3
-
- bl vp8_mbloop_filter_neon
-
- sub r12, r12, r1, lsl #2
- add r0, r12, r1, lsr #1
-
- vst1.u8 {q4}, [r12@128],r1 ; store op2
- vst1.u8 {q5}, [r0@128],r1 ; store op1
- vst1.u8 {q6}, [r12@128], r1 ; store op0
- vst1.u8 {q7}, [r0@128],r1 ; store oq0
- vst1.u8 {q8}, [r12@128] ; store oq1
- vst1.u8 {q9}, [r0@128] ; store oq2
-
- pop {pc}
- ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
-
-; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
-; const unsigned char *blimit,
-; const unsigned char *limit,
-; const unsigned char *thresh,
-; unsigned char *v)
-; r0 unsigned char *u,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-; sp+4 unsigned char *v
-
-|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
- push {lr}
- ldr r12, [sp, #4] ; load thresh
- sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
- vdup.u8 q2, r12 ; thresh
- ldr r12, [sp, #8] ; load v ptr
- sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
-
- vld1.u8 {d6}, [r0@64], r1 ; p3
- vld1.u8 {d7}, [r12@64], r1 ; p3
- vld1.u8 {d8}, [r0@64], r1 ; p2
- vld1.u8 {d9}, [r12@64], r1 ; p2
- vld1.u8 {d10}, [r0@64], r1 ; p1
- vld1.u8 {d11}, [r12@64], r1 ; p1
- vld1.u8 {d12}, [r0@64], r1 ; p0
- vld1.u8 {d13}, [r12@64], r1 ; p0
- vld1.u8 {d14}, [r0@64], r1 ; q0
- vld1.u8 {d15}, [r12@64], r1 ; q0
- vld1.u8 {d16}, [r0@64], r1 ; q1
- vld1.u8 {d17}, [r12@64], r1 ; q1
- vld1.u8 {d18}, [r0@64], r1 ; q2
- vld1.u8 {d19}, [r12@64], r1 ; q2
- vld1.u8 {d20}, [r0@64], r1 ; q3
- vld1.u8 {d21}, [r12@64], r1 ; q3
-
- bl vp8_mbloop_filter_neon
-
- sub r0, r0, r1, lsl #3
- sub r12, r12, r1, lsl #3
-
- add r0, r0, r1
- add r12, r12, r1
-
- vst1.u8 {d8}, [r0@64], r1 ; store u op2
- vst1.u8 {d9}, [r12@64], r1 ; store v op2
- vst1.u8 {d10}, [r0@64], r1 ; store u op1
- vst1.u8 {d11}, [r12@64], r1 ; store v op1
- vst1.u8 {d12}, [r0@64], r1 ; store u op0
- vst1.u8 {d13}, [r12@64], r1 ; store v op0
- vst1.u8 {d14}, [r0@64], r1 ; store u oq0
- vst1.u8 {d15}, [r12@64], r1 ; store v oq0
- vst1.u8 {d16}, [r0@64], r1 ; store u oq1
- vst1.u8 {d17}, [r12@64], r1 ; store v oq1
- vst1.u8 {d18}, [r0@64], r1 ; store u oq2
- vst1.u8 {d19}, [r12@64], r1 ; store v oq2
-
- pop {pc}
- ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
-
-; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-; const unsigned char *blimit,
-; const unsigned char *limit,
-; const unsigned char *thresh)
-; r0 unsigned char *src,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-|vp8_mbloop_filter_vertical_edge_y_neon| PROC
- push {lr}
- ldr r12, [sp, #4] ; load thresh
- sub r0, r0, #4 ; move src pointer down by 4 columns
- vdup.s8 q2, r12 ; thresh
- add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
-
- vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
- vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
- vld1.u8 {d8}, [r0], r1
- vld1.u8 {d9}, [r12], r1
- vld1.u8 {d10}, [r0], r1
- vld1.u8 {d11}, [r12], r1
- vld1.u8 {d12}, [r0], r1
- vld1.u8 {d13}, [r12], r1
- vld1.u8 {d14}, [r0], r1
- vld1.u8 {d15}, [r12], r1
- vld1.u8 {d16}, [r0], r1
- vld1.u8 {d17}, [r12], r1
- vld1.u8 {d18}, [r0], r1
- vld1.u8 {d19}, [r12], r1
- vld1.u8 {d20}, [r0], r1
- vld1.u8 {d21}, [r12], r1
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- sub r0, r0, r1, lsl #3
-
- bl vp8_mbloop_filter_neon
-
- sub r12, r12, r1, lsl #3
-
- ;transpose to 16x8 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- ;store op2, op1, op0, oq0, oq1, oq2
- vst1.8 {d6}, [r0], r1
- vst1.8 {d7}, [r12], r1
- vst1.8 {d8}, [r0], r1
- vst1.8 {d9}, [r12], r1
- vst1.8 {d10}, [r0], r1
- vst1.8 {d11}, [r12], r1
- vst1.8 {d12}, [r0], r1
- vst1.8 {d13}, [r12], r1
- vst1.8 {d14}, [r0], r1
- vst1.8 {d15}, [r12], r1
- vst1.8 {d16}, [r0], r1
- vst1.8 {d17}, [r12], r1
- vst1.8 {d18}, [r0], r1
- vst1.8 {d19}, [r12], r1
- vst1.8 {d20}, [r0]
- vst1.8 {d21}, [r12]
-
- pop {pc}
- ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
-
-; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-; const unsigned char *blimit,
-; const unsigned char *limit,
-; const unsigned char *thresh,
-; unsigned char *v)
-; r0 unsigned char *u,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-; sp+4 unsigned char *v
-|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
- push {lr}
- ldr r12, [sp, #4] ; load thresh
- sub r0, r0, #4 ; move u pointer down by 4 columns
- vdup.u8 q2, r12 ; thresh
- ldr r12, [sp, #8] ; load v ptr
- sub r12, r12, #4 ; move v pointer down by 4 columns
-
- vld1.u8 {d6}, [r0], r1 ;load u data
- vld1.u8 {d7}, [r12], r1 ;load v data
- vld1.u8 {d8}, [r0], r1
- vld1.u8 {d9}, [r12], r1
- vld1.u8 {d10}, [r0], r1
- vld1.u8 {d11}, [r12], r1
- vld1.u8 {d12}, [r0], r1
- vld1.u8 {d13}, [r12], r1
- vld1.u8 {d14}, [r0], r1
- vld1.u8 {d15}, [r12], r1
- vld1.u8 {d16}, [r0], r1
- vld1.u8 {d17}, [r12], r1
- vld1.u8 {d18}, [r0], r1
- vld1.u8 {d19}, [r12], r1
- vld1.u8 {d20}, [r0], r1
- vld1.u8 {d21}, [r12], r1
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- sub r0, r0, r1, lsl #3
-
- bl vp8_mbloop_filter_neon
-
- sub r12, r12, r1, lsl #3
-
- ;transpose to 16x8 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- ;store op2, op1, op0, oq0, oq1, oq2
- vst1.8 {d6}, [r0], r1
- vst1.8 {d7}, [r12], r1
- vst1.8 {d8}, [r0], r1
- vst1.8 {d9}, [r12], r1
- vst1.8 {d10}, [r0], r1
- vst1.8 {d11}, [r12], r1
- vst1.8 {d12}, [r0], r1
- vst1.8 {d13}, [r12], r1
- vst1.8 {d14}, [r0], r1
- vst1.8 {d15}, [r12], r1
- vst1.8 {d16}, [r0], r1
- vst1.8 {d17}, [r12], r1
- vst1.8 {d18}, [r0], r1
- vst1.8 {d19}, [r12], r1
- vst1.8 {d20}, [r0]
- vst1.8 {d21}, [r12]
-
- pop {pc}
- ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
-
-; void vp8_mbloop_filter_neon()
-; This is a helper function for the macroblock loopfilters. The individual
-; functions do the necessary load, transpose (if necessary), preserve (if
-; necessary) and store.
-
-; r0,r1 PRESERVE
-; r2 mblimit
-; r3 limit
-
-; q2 thresh
-; q3 p3 PRESERVE
-; q4 p2
-; q5 p1
-; q6 p0
-; q7 q0
-; q8 q1
-; q9 q2
-; q10 q3 PRESERVE
-
-|vp8_mbloop_filter_neon| PROC
-
- ; vp8_filter_mask
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q1, q9, q8 ; abs(q2 - q1)
- vabd.u8 q0, q10, q9 ; abs(q3 - q2)
-
- vmax.u8 q11, q11, q12
- vmax.u8 q12, q13, q14
- vmax.u8 q1, q1, q0
- vmax.u8 q15, q11, q12
-
- vabd.u8 q12, q6, q7 ; abs(p0 - q0)
-
- ; vp8_hevmask
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
- vmax.u8 q15, q15, q1
-
- vdup.u8 q1, r3 ; limit
- vdup.u8 q2, r2 ; mblimit
-
- vmov.u8 q0, #0x80 ; 0x80
-
- vcge.u8 q15, q1, q15
-
- vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
- vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
- vmov.u16 q11, #3 ; #3
-
- ; vp8_filter
- ; convert to signed
- veor q7, q7, q0 ; qs0
- vshr.u8 q1, q1, #1 ; a = a / 2
- veor q6, q6, q0 ; ps0
- veor q5, q5, q0 ; ps1
-
- vqadd.u8 q12, q12, q1 ; a = b + a
-
- veor q8, q8, q0 ; qs1
- veor q4, q4, q0 ; ps2
- veor q9, q9, q0 ; qs2
-
- vorr q14, q13, q14 ; vp8_hevmask
-
- vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
-
- vsubl.s8 q2, d14, d12 ; qs0 - ps0
- vsubl.s8 q13, d15, d13
-
- vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
-
- vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
-
- vand q15, q15, q12 ; vp8_filter_mask
-
- vmul.i16 q13, q13, q11
-
- vmov.u8 q12, #3 ; #3
-
- vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q13, q13, d3
-
- vmov.u8 q11, #4 ; #4
-
- ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d2, q2
- vqmovn.s16 d3, q13
-
- vand q1, q1, q15 ; vp8_filter &= mask
-
- vmov.u16 q15, #63 ; #63
-
- vand q13, q1, q14 ; Filter2 &= hev
-
- vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
- vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
-
- vmov q0, q15
-
- vshr.s8 q2, q2, #3 ; Filter1 >>= 3
- vshr.s8 q13, q13, #3 ; Filter2 >>= 3
-
- vmov q11, q15
- vmov q12, q15
-
- vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
-
- vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
-
- vbic q1, q1, q14 ; vp8_filter &= ~hev
-
- ; roughly 1/7th difference across boundary
- ; roughly 2/7th difference across boundary
- ; roughly 3/7th difference across boundary
-
- vmov.u8 d5, #9 ; #9
- vmov.u8 d4, #18 ; #18
-
- vmov q13, q15
- vmov q14, q15
-
- vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
- vmlal.s8 q11, d3, d5
- vmov.u8 d5, #27 ; #27
- vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
- vmlal.s8 q13, d3, d4
- vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
- vmlal.s8 q15, d3, d5
-
- vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
- vqshrn.s16 d1, q11, #7
- vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
- vqshrn.s16 d25, q13, #7
- vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
- vqshrn.s16 d29, q15, #7
-
- vmov.u8 q1, #0x80 ; 0x80
-
- vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
- vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
- vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
- vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
- vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
- vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
-
- veor q9, q11, q1 ; *oq2 = s^0x80
- veor q4, q0, q1 ; *op2 = s^0x80
- veor q8, q13, q1 ; *oq1 = s^0x80
- veor q5, q12, q1 ; *op1 = s^0x80
- veor q7, q15, q1 ; *oq0 = s^0x80
- veor q6, q14, q1 ; *op0 = s^0x80
-
- bx lr
- ENDP ; |vp8_mbloop_filter_neon|
-
-;-----------------
-
- END
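
Both the assembly removed here and the C replacement below end the macroblock filter the same way: after the hev-masked Filter1/Filter2 adjustment of q0 and p0, the remaining filter value is spread across three pixel pairs in roughly 3/7, 2/7 and 1/7 proportions, which is what the 27/18/9 multiplies and the >>7 with the +63 rounding bias compute. A scalar sketch of those outer-tap updates (hypothetical names, not part of the diff):

static signed char clamp_s8(int v) {
    return (signed char)(v > 127 ? 127 : (v < -128 ? -128 : v));
}

/* w is vp8_filter with the high-edge-variance lanes masked off; all pixels
 * are in the signed (XOR 0x80) domain. */
static void mb_outer_taps_scalar(signed char w,
                                 signed char *ps2, signed char *ps1, signed char *ps0,
                                 signed char *qs0, signed char *qs1, signed char *qs2) {
    signed char u;
    u = clamp_s8((63 + w * 27) >> 7);  /* roughly 3/7 of the difference */
    *qs0 = clamp_s8(*qs0 - u);
    *ps0 = clamp_s8(*ps0 + u);
    u = clamp_s8((63 + w * 18) >> 7);  /* roughly 2/7 */
    *qs1 = clamp_s8(*qs1 - u);
    *ps1 = clamp_s8(*ps1 + u);
    u = clamp_s8((63 + w * 9) >> 7);   /* roughly 1/7 */
    *qs2 = clamp_s8(*qs2 - u);
    *ps2 = clamp_s8(*ps2 + u);
}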
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
new file mode 100644
index 00000000000..5351f4be665
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_mbloop_filter_neon(
+ uint8x16_t qblimit, // mblimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q4r, // p2
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r, // q1
+ uint8x16_t *q9r) { // q2
+ uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
+ uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
+ int8x16_t q0s8, q12s8, q14s8, q15s8;
+ int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
+
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q1u8 = vabdq_u8(q9, q8);
+ q0u8 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q1u8 = vmaxq_u8(q1u8, q0u8);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q12u8 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q1u8);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ q1u8 = vabdq_u8(q5, q8);
+ q12u8 = vqaddq_u8(q12u8, q12u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q0u8 = vdupq_n_u8(0x80);
+ q9 = veorq_u8(q9, q0u8);
+ q8 = veorq_u8(q8, q0u8);
+ q7 = veorq_u8(q7, q0u8);
+ q6 = veorq_u8(q6, q0u8);
+ q5 = veorq_u8(q5, q0u8);
+ q4 = veorq_u8(q4, q0u8);
+
+ q1u8 = vshrq_n_u8(q1u8, 1);
+ q12u8 = vqaddq_u8(q12u8, q1u8);
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+ q12u8 = vcgeq_u8(qblimit, q12u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+ vreinterpretq_s8_u8(q8));
+
+ q11s16 = vdupq_n_s16(3);
+ q2s16 = vmulq_s16(q2s16, q11s16);
+ q13s16 = vmulq_s16(q13s16, q11s16);
+
+ q15u8 = vandq_u8(q15u8, q12u8);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
+
+ q12u8 = vdupq_n_u8(3);
+ q11u8 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2 = vqmovn_s16(q2s16);
+ d3 = vqmovn_s16(q13s16);
+ q1s8 = vcombine_s8(d2, d3);
+ q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
+ q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
+ q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q13s8 = vshrq_n_s8(q13s8, 3);
+
+ q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
+ q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);
+
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
+ d5 = vdup_n_s8(9);
+ d4 = vdup_n_s8(18);
+
+ q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5);
+ q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
+ d5 = vdup_n_s8(27);
+ q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4);
+ q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
+ q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5);
+ q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
+
+ d0 = vqshrn_n_s16(q0s16, 7);
+ d1 = vqshrn_n_s16(q11s16, 7);
+ d24 = vqshrn_n_s16(q12s16, 7);
+ d25 = vqshrn_n_s16(q13s16, 7);
+ d28 = vqshrn_n_s16(q14s16, 7);
+ d29 = vqshrn_n_s16(q15s16, 7);
+
+ q0s8 = vcombine_s8(d0, d1);
+ q12s8 = vcombine_s8(d24, d25);
+ q14s8 = vcombine_s8(d28, d29);
+
+ q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
+ q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
+ q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
+ q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
+ q15s8 = vqsubq_s8(q7s8, q14s8);
+ q14s8 = vqaddq_s8(q6s8, q14s8);
+
+ q1u8 = vdupq_n_u8(0x80);
+ *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
+ *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
+ return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ src -= (pitch << 2);
+
+ q3 = vld1q_u8(src);
+ src += pitch;
+ q4 = vld1q_u8(src);
+ src += pitch;
+ q5 = vld1q_u8(src);
+ src += pitch;
+ q6 = vld1q_u8(src);
+ src += pitch;
+ q7 = vld1q_u8(src);
+ src += pitch;
+ q8 = vld1q_u8(src);
+ src += pitch;
+ q9 = vld1q_u8(src);
+ src += pitch;
+ q10 = vld1q_u8(src);
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q4, &q5, &q6, &q7, &q8, &q9);
+
+ src -= (pitch * 6);
+ vst1q_u8(src, q4);
+ src += pitch;
+ vst1q_u8(src, q5);
+ src += pitch;
+ vst1q_u8(src, q6);
+ src += pitch;
+ vst1q_u8(src, q7);
+ src += pitch;
+ vst1q_u8(src, q8);
+ src += pitch;
+ vst1q_u8(src, q9);
+ return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ u -= (pitch << 2);
+ v -= (pitch << 2);
+
+ d6 = vld1_u8(u);
+ u += pitch;
+ d7 = vld1_u8(v);
+ v += pitch;
+ d8 = vld1_u8(u);
+ u += pitch;
+ d9 = vld1_u8(v);
+ v += pitch;
+ d10 = vld1_u8(u);
+ u += pitch;
+ d11 = vld1_u8(v);
+ v += pitch;
+ d12 = vld1_u8(u);
+ u += pitch;
+ d13 = vld1_u8(v);
+ v += pitch;
+ d14 = vld1_u8(u);
+ u += pitch;
+ d15 = vld1_u8(v);
+ v += pitch;
+ d16 = vld1_u8(u);
+ u += pitch;
+ d17 = vld1_u8(v);
+ v += pitch;
+ d18 = vld1_u8(u);
+ u += pitch;
+ d19 = vld1_u8(v);
+ v += pitch;
+ d20 = vld1_u8(u);
+ d21 = vld1_u8(v);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q4, &q5, &q6, &q7, &q8, &q9);
+
+ u -= (pitch * 6);
+ v -= (pitch * 6);
+ vst1_u8(u, vget_low_u8(q4));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q4));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q5));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q5));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q6));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q6));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q7));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q7));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q8));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q8));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q9));
+ vst1_u8(v, vget_high_u8(q9));
+ return;
+}
+
+void vp8_mbloop_filter_vertical_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ unsigned char *s1, *s2;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ s1 = src - 4;
+ s2 = s1 + 8 * pitch;
+ d6 = vld1_u8(s1);
+ s1 += pitch;
+ d7 = vld1_u8(s2);
+ s2 += pitch;
+ d8 = vld1_u8(s1);
+ s1 += pitch;
+ d9 = vld1_u8(s2);
+ s2 += pitch;
+ d10 = vld1_u8(s1);
+ s1 += pitch;
+ d11 = vld1_u8(s2);
+ s2 += pitch;
+ d12 = vld1_u8(s1);
+ s1 += pitch;
+ d13 = vld1_u8(s2);
+ s2 += pitch;
+ d14 = vld1_u8(s1);
+ s1 += pitch;
+ d15 = vld1_u8(s2);
+ s2 += pitch;
+ d16 = vld1_u8(s1);
+ s1 += pitch;
+ d17 = vld1_u8(s2);
+ s2 += pitch;
+ d18 = vld1_u8(s1);
+ s1 += pitch;
+ d19 = vld1_u8(s2);
+ s2 += pitch;
+ d20 = vld1_u8(s1);
+ d21 = vld1_u8(s2);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q4, &q5, &q6, &q7, &q8, &q9);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ s1 -= 7 * pitch;
+ s2 -= 7 * pitch;
+
+ vst1_u8(s1, vget_low_u8(q3));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q3));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q4));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q4));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q5));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q5));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q6));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q6));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q7));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q7));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q8));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q8));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q9));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q9));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q10));
+ vst1_u8(s2, vget_high_u8(q10));
+ return;
+}
+
+void vp8_mbloop_filter_vertical_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ unsigned char *us, *ud;
+ unsigned char *vs, *vd;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ us = u - 4;
+ vs = v - 4;
+ d6 = vld1_u8(us);
+ us += pitch;
+ d7 = vld1_u8(vs);
+ vs += pitch;
+ d8 = vld1_u8(us);
+ us += pitch;
+ d9 = vld1_u8(vs);
+ vs += pitch;
+ d10 = vld1_u8(us);
+ us += pitch;
+ d11 = vld1_u8(vs);
+ vs += pitch;
+ d12 = vld1_u8(us);
+ us += pitch;
+ d13 = vld1_u8(vs);
+ vs += pitch;
+ d14 = vld1_u8(us);
+ us += pitch;
+ d15 = vld1_u8(vs);
+ vs += pitch;
+ d16 = vld1_u8(us);
+ us += pitch;
+ d17 = vld1_u8(vs);
+ vs += pitch;
+ d18 = vld1_u8(us);
+ us += pitch;
+ d19 = vld1_u8(vs);
+ vs += pitch;
+ d20 = vld1_u8(us);
+ d21 = vld1_u8(vs);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q4, &q5, &q6, &q7, &q8, &q9);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ ud = u - 4;
+ vst1_u8(ud, vget_low_u8(q3));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q4));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q5));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q6));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q7));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q8));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q9));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q10));
+
+ vd = v - 4;
+ vst1_u8(vd, vget_high_u8(q3));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q4));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q5));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q6));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q7));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q8));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q9));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q10));
+ return;
+}
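
The vertical-edge paths above load 16 rows of 8 pixels, run three rounds of vtrnq_u32/vtrnq_u16/vtrnq_u8 so the filtered edge lies along the vector lanes, and transpose back before storing. A plain scalar transpose is a convenient cross-check for that register shuffle (a test-side sketch, not part of the diff):

/* Reference 16x8 -> 8x16 byte transpose; comparing its output with the
 * q3..q10 lanes after the vtrn rounds validates the shuffle. */
static void transpose_16x8_scalar(const unsigned char src[16][8],
                                  unsigned char dst[8][16]) {
    int r, c;
    for (r = 0; r < 16; r++)
        for (c = 0; c < 8; c++)
            dst[c][r] = src[r][c];
}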
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad16_neon.asm
deleted file mode 100644
index d7c590e15a2..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad16_neon.asm
+++ /dev/null
@@ -1,207 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sad16x16_neon|
- EXPORT |vp8_sad16x8_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int src_stride
-; r2 unsigned char *ref_ptr
-; r3 int ref_stride
-|vp8_sad16x16_neon| PROC
-;;
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabdl.u8 q12, d0, d8
- vabdl.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
-;;
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabal.u8 q12, d0, d8
- vabal.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
-;;
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabal.u8 q12, d0, d8
- vabal.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
-;;
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabal.u8 q12, d0, d8
- vabal.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0]
- vld1.8 {q7}, [r2]
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vadd.u16 q0, q12, q13
-
- vpaddl.u16 q1, q0
- vpaddl.u32 q0, q1
-
- vadd.u32 d0, d0, d1
-
- vmov.32 r0, d0[0]
-
- bx lr
-
- ENDP
-
-;==============================
-;unsigned int vp8_sad16x8_c(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-|vp8_sad16x8_neon| PROC
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabdl.u8 q12, d0, d8
- vabdl.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabal.u8 q12, d0, d8
- vabal.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vadd.u16 q0, q12, q13
-
- vpaddl.u16 q1, q0
- vpaddl.u32 q0, q1
-
- vadd.u32 d0, d0, d1
-
- vmov.32 r0, d0[0]
-
- bx lr
-
- ENDP
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad8_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad8_neon.asm
deleted file mode 100644
index 23ba6df93a4..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad8_neon.asm
+++ /dev/null
@@ -1,209 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sad8x8_neon|
- EXPORT |vp8_sad8x16_neon|
- EXPORT |vp8_sad4x4_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; unsigned int vp8_sad8x8_c(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-
-|vp8_sad8x8_neon| PROC
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabdl.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vabal.u8 q12, d6, d14
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabal.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q12, d6, d14
-
- vpaddl.u16 q1, q12
- vpaddl.u32 q0, q1
- vadd.u32 d0, d0, d1
-
- vmov.32 r0, d0[0]
-
- bx lr
-
- ENDP
-
-;============================
-;unsigned int vp8_sad8x16_c(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-
-|vp8_sad8x16_neon| PROC
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabdl.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vabal.u8 q12, d6, d14
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabal.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vabal.u8 q12, d6, d14
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabal.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vabal.u8 q12, d6, d14
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabal.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q12, d6, d14
-
- vpaddl.u16 q1, q12
- vpaddl.u32 q0, q1
- vadd.u32 d0, d0, d1
-
- vmov.32 r0, d0[0]
-
- bx lr
-
- ENDP
-
-;===========================
-;unsigned int vp8_sad4x4_c(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-
-|vp8_sad4x4_neon| PROC
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabdl.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q12, d6, d14
-
- vpaddl.u16 d1, d24
- vpaddl.u32 d0, d1
- vmov.32 r0, d0[0]
-
- bx lr
-
- ENDP
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c
new file mode 100644
index 00000000000..6595ac0519b
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+unsigned int vp8_sad8x8_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 7; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
+unsigned int vp8_sad8x16_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 15; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
+unsigned int vp8_sad4x4_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x2_t d1;
+ uint64x1_t d3;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 3; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ d1 = vpaddl_u16(vget_low_u16(q12));
+ d3 = vpaddl_u32(d1);
+
+ return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+}
+
+unsigned int vp8_sad16x16_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x16_t q0, q4;
+ uint16x8_t q12, q13;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+ for (i = 0; i < 15; i++) {
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+ }
+
+ q12 = vaddq_u16(q12, q13);
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
+unsigned int vp8_sad16x8_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x16_t q0, q4;
+ uint16x8_t q12, q13;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+ for (i = 0; i < 7; i++) {
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+ }
+
+ q12 = vaddq_u16(q12, q13);
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
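
Each function above keeps one or two uint16x8_t accumulators fed by vabdl/vabal and folds them down only once, through the vpaddl chain, after the row loop. A scalar reference is handy for spot-checking those reductions (a sketch, not part of the diff; pass the width/height that matches the function name):

static unsigned int sad_scalar(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               int width, int height) {
    unsigned int sad = 0;
    int r, c;
    for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++) {
            int d = src[c] - ref[c];
            sad += (unsigned int)(d < 0 ? -d : d);  /* absolute difference */
        }
        src += src_stride;
        ref += ref_stride;
    }
    return sad;
}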
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/save_reg_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/save_reg_neon.asm
deleted file mode 100644
index fd7002e7a9e..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/save_reg_neon.asm
+++ /dev/null
@@ -1,36 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_push_neon|
- EXPORT |vp8_pop_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|vp8_push_neon| PROC
- vst1.i64 {d8, d9, d10, d11}, [r0]!
- vst1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
- ENDP
-
-|vp8_pop_neon| PROC
- vld1.i64 {d8, d9, d10, d11}, [r0]!
- vld1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
- ENDP
-
- END
-
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm
deleted file mode 100644
index 67d2ab0150d..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ /dev/null
@@ -1,139 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_short_idct4x4llm_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;*************************************************************
-;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
-; unsigned char *dst, int stride)
-;r0 short * input
-;r1 unsigned char *pred
-;r2 int pitch
-;r3 unsigned char *dst
-;sp int stride
-;*************************************************************
-
-; static const int cospi8sqrt2minus1=20091;
-; static const int sinpi8sqrt2 =35468;
-; static const int rounding = 0;
-
-; Optimization note: The resulting data from dequantization are signed
-; 13-bit values in the range [-4096, 4095]. This allows the use of the
-; NEON "vqdmulh" instruction, since the product won't go out of range
-; (13+16+1=30 bits < 32 bits). This instruction returns the high half
-; of the multiplication, which is what the IDCT needs.
-
-|vp8_short_idct4x4llm_neon| PROC
- adr r12, idct_coeff
- vld1.16 {q1, q2}, [r0]
- vld1.16 {d0}, [r12]
-
- vswp d3, d4 ;q2(vp[4] vp[12])
- ldr r0, [sp] ; stride
-
- vqdmulh.s16 q3, q2, d0[2]
- vqdmulh.s16 q4, q2, d0[0]
-
- vqadd.s16 d12, d2, d3 ;a1
- vqsub.s16 d13, d2, d3 ;b1
-
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
- vqadd.s16 q4, q4, q2
-
- ;d6 - c1:temp1
- ;d7 - d1:temp2
- ;d8 - d1:temp1
- ;d9 - c1:temp2
-
- vqsub.s16 d10, d6, d9 ;c1
- vqadd.s16 d11, d7, d8 ;d1
-
- vqadd.s16 d2, d12, d11
- vqadd.s16 d3, d13, d10
- vqsub.s16 d4, d13, d10
- vqsub.s16 d5, d12, d11
-
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
- vswp d3, d4
-
- vqdmulh.s16 q3, q2, d0[2]
- vqdmulh.s16 q4, q2, d0[0]
-
- vqadd.s16 d12, d2, d3 ;a1
- vqsub.s16 d13, d2, d3 ;b1
-
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
- vqadd.s16 q4, q4, q2
-
- vqsub.s16 d10, d6, d9 ;c1
- vqadd.s16 d11, d7, d8 ;d1
-
- vqadd.s16 d2, d12, d11
- vqadd.s16 d3, d13, d10
- vqsub.s16 d4, d13, d10
- vqsub.s16 d5, d12, d11
-
- vrshr.s16 d2, d2, #3
- vrshr.s16 d3, d3, #3
- vrshr.s16 d4, d4, #3
- vrshr.s16 d5, d5, #3
-
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
- ; load prediction data
- vld1.32 d6[0], [r1], r2
- vld1.32 d6[1], [r1], r2
- vld1.32 d7[0], [r1], r2
- vld1.32 d7[1], [r1], r2
-
- ; add prediction and residual
- vaddw.u8 q1, q1, d6
- vaddw.u8 q2, q2, d7
-
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q2
-
- ; store to destination
- vst1.32 d1[0], [r3], r0
- vst1.32 d1[1], [r3], r0
- vst1.32 d2[0], [r3], r0
- vst1.32 d2[1], [r3], r0
-
- bx lr
-
- ENDP
-
-;-----------------
-
-idct_coeff
- DCD 0x4e7b4e7b, 0x8a8c8a8c
-
-;20091, 20091, 35468, 35468
-
- END
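
The optimization note in the file removed above carries over to the C replacement below. sinpi8sqrt2 = 35468 does not fit in int16_t and wraps to -30068, and vqdmulh returns (2*x*k) >> 16; shifting that right by one and adding x back recovers (x * 35468) >> 16 exactly, because x + ((x * -30068) >> 16) == (x * 65536 + x * -30068) >> 16 == (x * 35468) >> 16 under arithmetic shifts. A scalar sketch of the identity (hypothetical helper, not part of the diff):

static short mul_hi_35468(short x) {
    int k = -30068;                   /* (int16_t)35468 after wraparound */
    int dmulh = (2 * x * k) >> 16;    /* what vqdmulh returns; 13-bit inputs
                                         keep 2*x*k well inside 32 bits, so
                                         the saturating doubling never clips */
    return (short)(x + (dmulh >> 1)); /* vshr #1, then the vqadd with x */
}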
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
new file mode 100644
index 00000000000..373afa6ed35
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 35468;
+
+void vp8_short_idct4x4llm_neon(
+ int16_t *input,
+ unsigned char *pred_ptr,
+ int pred_stride,
+ unsigned char *dst_ptr,
+ int dst_stride) {
+ int i;
+ uint32x2_t d6u32 = vdup_n_u32(0);
+ uint8x8_t d1u8;
+ int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+ uint16x8_t q1u16;
+ int16x8_t q1s16, q2s16, q3s16, q4s16;
+ int32x2x2_t v2tmp0, v2tmp1;
+ int16x4x2_t v2tmp2, v2tmp3;
+
+ d2 = vld1_s16(input);
+ d3 = vld1_s16(input + 4);
+ d4 = vld1_s16(input + 8);
+ d5 = vld1_s16(input + 12);
+
+ // 1st for loop
+ q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here
+ q2s16 = vcombine_s16(d3, d5);
+
+ q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+ q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+ d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
+ d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
+
+ q3s16 = vshrq_n_s16(q3s16, 1);
+ q4s16 = vshrq_n_s16(q4s16, 1);
+
+ q3s16 = vqaddq_s16(q3s16, q2s16);
+ q4s16 = vqaddq_s16(q4s16, q2s16);
+
+ d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
+ d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+ vreinterpret_s16_s32(v2tmp1.val[0]));
+ v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+ vreinterpret_s16_s32(v2tmp1.val[1]));
+
+ // 2nd for loop
+ q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
+ q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
+
+ q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+ q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+ d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
+ d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
+
+ q3s16 = vshrq_n_s16(q3s16, 1);
+ q4s16 = vshrq_n_s16(q4s16, 1);
+
+ q3s16 = vqaddq_s16(q3s16, q2s16);
+ q4s16 = vqaddq_s16(q4s16, q2s16);
+
+ d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
+ d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ d2 = vrshr_n_s16(d2, 3);
+ d3 = vrshr_n_s16(d3, 3);
+ d4 = vrshr_n_s16(d4, 3);
+ d5 = vrshr_n_s16(d5, 3);
+
+ v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+ vreinterpret_s16_s32(v2tmp1.val[0]));
+ v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+ vreinterpret_s16_s32(v2tmp1.val[1]));
+
+ q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
+ q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
+
+ // dc_only_idct_add
+ for (i = 0; i < 2; i++, q1s16 = q2s16) {
+ d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
+ pred_ptr += pred_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
+ pred_ptr += pred_stride;
+
+ q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
+ vreinterpret_u8_u32(d6u32));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
+ dst_ptr += dst_stride;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
+ dst_ptr += dst_stride;
+ }
+ return;
+}
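
The loop at the end of vp8_short_idct4x4llm_neon widens each group of four predicted bytes with vaddw_u8, adds them to the residual rows, and narrows with the saturating vqmovun_s16. Per pixel that is a clamped add (a scalar sketch, not part of the diff):

static unsigned char reconstruct_pixel(short residual, unsigned char pred) {
    int v = residual + pred;                                  /* vaddw_u8 */
    return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* vqmovun_s16 */
}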
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm
deleted file mode 100644
index 9fdafd3609e..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm
+++ /dev/null
@@ -1,490 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sixtap_predict16x16_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter16_coeff
- DCD 0, 0, 128, 0, 0, 0, 0, 0
- DCD 0, -6, 123, 12, -1, 0, 0, 0
- DCD 2, -11, 108, 36, -8, 1, 0, 0
- DCD 0, -9, 93, 50, -6, 0, 0, 0
- DCD 3, -16, 77, 77, -16, 3, 0, 0
- DCD 0, -6, 50, 93, -9, 0, 0, 0
- DCD 1, -8, 36, 108, -11, 2, 0, 0
- DCD 0, -1, 12, 123, -6, 0, 0, 0
-
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; r4 unsigned char *dst_ptr,
-; stack(r5) int dst_pitch
-
-;Note: To take advantage of the 8-bit multiplication instructions in NEON, first apply
-; abs() to the filter coeffs to make them u8, then use vmlsl for the negative coeffs.
-; After multiplication the result can be negative, so it is treated as s16. Since the
-; result can also be a large positive number (> 2^15-1) that would be misread as a
-; negative one, the filter coeffs are applied in the order 0, 1, 4, 5, 2, which keeps
-; the running sum in s16 range. Finally, the result of the 3rd filter coeff is added
-; with a saturating add. The same applies to the other filter functions.
-
-|vp8_sixtap_predict16x16_neon| PROC
- push {r4-r5, lr}
-
- adr r12, filter16_coeff
- ldr r4, [sp, #12] ;load parameters from stack
- ldr r5, [sp, #16] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_filter16x16_only
-
- add r2, r12, r2, lsl #5 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
-
- vld1.s32 {q14, q15}, [r2] ;load first_pass filter
-
- beq firstpass_filter16x16_only
-
- sub sp, sp, #336 ;reserve space on stack for temporary storage
- mov lr, sp
-
- vabs.s32 q12, q14
- vabs.s32 q13, q15
-
- mov r2, #7 ;loop counter
- sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
- sub r0, r0, r1, lsl #1
-
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vdup.8 d1, d24[4]
- vdup.8 d2, d25[0]
- vdup.8 d3, d25[4]
- vdup.8 d4, d26[0]
- vdup.8 d5, d26[4]
-
-;First Pass: output_height lines x output_width columns (21x16)
-filt_blk2d_fp16x16_loop_neon
- vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
- vld1.u8 {d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q9, d7, d0
- vmull.u8 q10, d9, d0
- vmull.u8 q11, d10, d0
- vmull.u8 q12, d12, d0
- vmull.u8 q13, d13, d0
-
- vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d29, d9, d10, #1
- vext.8 d30, d12, d13, #1
-
- vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q10, d29, d1
- vmlsl.u8 q12, d30, d1
-
- vext.8 d28, d7, d8, #1
- vext.8 d29, d10, d11, #1
- vext.8 d30, d13, d14, #1
-
- vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q11, d29, d1
- vmlsl.u8 q13, d30, d1
-
- vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d29, d9, d10, #4
- vext.8 d30, d12, d13, #4
-
- vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q10, d29, d4
- vmlsl.u8 q12, d30, d4
-
- vext.8 d28, d7, d8, #4
- vext.8 d29, d10, d11, #4
- vext.8 d30, d13, d14, #4
-
- vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q11, d29, d4
- vmlsl.u8 q13, d30, d4
-
- vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d29, d9, d10, #5
- vext.8 d30, d12, d13, #5
-
- vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q10, d29, d5
- vmlal.u8 q12, d30, d5
-
- vext.8 d28, d7, d8, #5
- vext.8 d29, d10, d11, #5
- vext.8 d30, d13, d14, #5
-
- vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q11, d29, d5
- vmlal.u8 q13, d30, d5
-
- vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d29, d9, d10, #2
- vext.8 d30, d12, d13, #2
-
- vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q10, d29, d2
- vmlal.u8 q12, d30, d2
-
- vext.8 d28, d7, d8, #2
- vext.8 d29, d10, d11, #2
- vext.8 d30, d13, d14, #2
-
- vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q11, d29, d2
- vmlal.u8 q13, d30, d2
-
- vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d29, d9, d10, #3
- vext.8 d30, d12, d13, #3
-
- vext.8 d15, d7, d8, #3
- vext.8 d31, d10, d11, #3
- vext.8 d6, d13, d14, #3
-
- vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q5, d29, d3
- vmull.u8 q6, d30, d3
-
- vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q10, q5
- vqadd.s16 q12, q6
-
- vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q7, d31, d3
- vmull.u8 q3, d6, d3
-
- subs r2, r2, #1
-
- vqadd.s16 q9, q6
- vqadd.s16 q11, q7
- vqadd.s16 q13, q3
-
- vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q9, #7
- vqrshrun.s16 d8, q10, #7
- vqrshrun.s16 d9, q11, #7
- vqrshrun.s16 d10, q12, #7
- vqrshrun.s16 d11, q13, #7
-
- vst1.u8 {d6, d7, d8}, [lr]! ;store result
- vst1.u8 {d9, d10, d11}, [lr]!
-
- bne filt_blk2d_fp16x16_loop_neon
-
-;Second pass: 16x16
-;secondpass_filter - do first 8-columns and then second 8-columns
- add r3, r12, r3, lsl #5
- sub lr, lr, #336
-
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
- mov r3, #2 ;loop counter
-
- vabs.s32 q7, q5
- vabs.s32 q8, q6
-
- mov r2, #16
-
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vdup.8 d1, d14[4]
- vdup.8 d2, d15[0]
- vdup.8 d3, d15[4]
- vdup.8 d4, d16[0]
- vdup.8 d5, d16[4]
-
-filt_blk2d_sp16x16_outloop_neon
- vld1.u8 {d18}, [lr], r2 ;load src data
- vld1.u8 {d19}, [lr], r2
- vld1.u8 {d20}, [lr], r2
- vld1.u8 {d21}, [lr], r2
- mov r12, #4 ;loop counter
- vld1.u8 {d22}, [lr], r2
-
-secondpass_inner_loop_neon
- vld1.u8 {d23}, [lr], r2 ;load src data
- vld1.u8 {d24}, [lr], r2
- vld1.u8 {d25}, [lr], r2
- vld1.u8 {d26}, [lr], r2
-
- vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q4, d19, d0
- vmull.u8 q5, d20, d0
- vmull.u8 q6, d21, d0
-
- vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q4, d20, d1
- vmlsl.u8 q5, d21, d1
- vmlsl.u8 q6, d22, d1
-
- vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q4, d23, d4
- vmlsl.u8 q5, d24, d4
- vmlsl.u8 q6, d25, d4
-
- vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q4, d21, d2
- vmlal.u8 q5, d22, d2
- vmlal.u8 q6, d23, d2
-
- vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q4, d24, d5
- vmlal.u8 q5, d25, d5
- vmlal.u8 q6, d26, d5
-
- vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q8, d22, d3
- vmull.u8 q9, d23, d3
- vmull.u8 q10, d24, d3
-
- subs r12, r12, #1
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q8, #7
- vqrshrun.s16 d8, q9, #7
- vqrshrun.s16 d9, q10, #7
-
- vst1.u8 {d6}, [r4], r5 ;store result
- vmov q9, q11
- vst1.u8 {d7}, [r4], r5
- vmov q10, q12
- vst1.u8 {d8}, [r4], r5
- vmov d22, d26
- vst1.u8 {d9}, [r4], r5
-
- bne secondpass_inner_loop_neon
-
- subs r3, r3, #1
- sub lr, lr, #336
- add lr, lr, #8
-
- sub r4, r4, r5, lsl #4
- add r4, r4, #8
-
- bne filt_blk2d_sp16x16_outloop_neon
-
- add sp, sp, #336
- pop {r4-r5,pc}
-
-;--------------------
-firstpass_filter16x16_only
- vabs.s32 q12, q14
- vabs.s32 q13, q15
-
- mov r2, #8 ;loop counter
- sub r0, r0, #2 ;move srcptr back to (column-2)
-
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vdup.8 d1, d24[4]
- vdup.8 d2, d25[0]
- vdup.8 d3, d25[4]
- vdup.8 d4, d26[0]
- vdup.8 d5, d26[4]
-
-;First Pass: output_height lines x output_width columns (16x16)
-filt_blk2d_fpo16x16_loop_neon
- vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
- vld1.u8 {d9, d10, d11}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
-
- vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q7, d7, d0
- vmull.u8 q8, d9, d0
- vmull.u8 q9, d10, d0
-
- vext.8 d20, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d21, d9, d10, #1
- vext.8 d22, d7, d8, #1
- vext.8 d23, d10, d11, #1
- vext.8 d24, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d25, d9, d10, #4
- vext.8 d26, d7, d8, #4
- vext.8 d27, d10, d11, #4
- vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d29, d9, d10, #5
-
- vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q8, d21, d1
- vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q9, d23, d1
- vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q8, d25, d4
- vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q9, d27, d4
- vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q8, d29, d5
-
- vext.8 d20, d7, d8, #5
- vext.8 d21, d10, d11, #5
- vext.8 d22, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d23, d9, d10, #2
- vext.8 d24, d7, d8, #2
- vext.8 d25, d10, d11, #2
-
- vext.8 d26, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d27, d9, d10, #3
- vext.8 d28, d7, d8, #3
- vext.8 d29, d10, d11, #3
-
- vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q9, d21, d5
- vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q8, d23, d2
- vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q9, d25, d2
-
- vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q11, d27, d3
- vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q15, d29, d3
-
- vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q11
- vqadd.s16 q7, q12
- vqadd.s16 q9, q15
-
- subs r2, r2, #1
-
- vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q7, #7
- vqrshrun.s16 d8, q8, #7
- vqrshrun.s16 d9, q9, #7
-
- vst1.u8 {q3}, [r4], r5 ;store result
- vst1.u8 {q4}, [r4], r5
-
- bne filt_blk2d_fpo16x16_loop_neon
-
- pop {r4-r5,pc}
-
-;--------------------
-secondpass_filter16x16_only
-;Second pass: 16x16
- add r3, r12, r3, lsl #5
- sub r0, r0, r1, lsl #1
-
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
- mov r3, #2 ;loop counter
-
- vabs.s32 q7, q5
- vabs.s32 q8, q6
-
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vdup.8 d1, d14[4]
- vdup.8 d2, d15[0]
- vdup.8 d3, d15[4]
- vdup.8 d4, d16[0]
- vdup.8 d5, d16[4]
-
-filt_blk2d_spo16x16_outloop_neon
- vld1.u8 {d18}, [r0], r1 ;load src data
- vld1.u8 {d19}, [r0], r1
- vld1.u8 {d20}, [r0], r1
- vld1.u8 {d21}, [r0], r1
- mov r12, #4 ;loop counter
- vld1.u8 {d22}, [r0], r1
-
-secondpass_only_inner_loop_neon
- vld1.u8 {d23}, [r0], r1 ;load src data
- vld1.u8 {d24}, [r0], r1
- vld1.u8 {d25}, [r0], r1
- vld1.u8 {d26}, [r0], r1
-
- vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q4, d19, d0
- vmull.u8 q5, d20, d0
- vmull.u8 q6, d21, d0
-
- vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q4, d20, d1
- vmlsl.u8 q5, d21, d1
- vmlsl.u8 q6, d22, d1
-
- vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q4, d23, d4
- vmlsl.u8 q5, d24, d4
- vmlsl.u8 q6, d25, d4
-
- vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q4, d21, d2
- vmlal.u8 q5, d22, d2
- vmlal.u8 q6, d23, d2
-
- vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q4, d24, d5
- vmlal.u8 q5, d25, d5
- vmlal.u8 q6, d26, d5
-
- vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q8, d22, d3
- vmull.u8 q9, d23, d3
- vmull.u8 q10, d24, d3
-
- subs r12, r12, #1
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q8, #7
- vqrshrun.s16 d8, q9, #7
- vqrshrun.s16 d9, q10, #7
-
- vst1.u8 {d6}, [r4], r5 ;store result
- vmov q9, q11
- vst1.u8 {d7}, [r4], r5
- vmov q10, q12
- vst1.u8 {d8}, [r4], r5
- vmov d22, d26
- vst1.u8 {d9}, [r4], r5
-
- bne secondpass_only_inner_loop_neon
-
- subs r3, r3, #1
- sub r0, r0, r1, lsl #4
- sub r0, r0, r1, lsl #2
- sub r0, r0, r1
- add r0, r0, #8
-
- sub r4, r4, r5, lsl #4
- add r4, r4, #8
-
- bne filt_blk2d_spo16x16_outloop_neon
-
- pop {r4-r5,pc}
-
- ENDP
-
-;-----------------
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm
deleted file mode 100644
index a4222bc62c5..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ /dev/null
@@ -1,422 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sixtap_predict4x4_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter4_coeff
- DCD 0, 0, 128, 0, 0, 0, 0, 0
- DCD 0, -6, 123, 12, -1, 0, 0, 0
- DCD 2, -11, 108, 36, -8, 1, 0, 0
- DCD 0, -9, 93, 50, -6, 0, 0, 0
- DCD 3, -16, 77, 77, -16, 3, 0, 0
- DCD 0, -6, 50, 93, -9, 0, 0, 0
- DCD 1, -8, 36, 108, -11, 2, 0, 0
- DCD 0, -1, 12, 123, -6, 0, 0, 0
-
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(lr) int dst_pitch
-
-|vp8_sixtap_predict4x4_neon| PROC
- push {r4, lr}
-
- adr r12, filter4_coeff
- ldr r4, [sp, #8] ;load parameters from stack
- ldr lr, [sp, #12] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_filter4x4_only
-
- add r2, r12, r2, lsl #5 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- vld1.s32 {q14, q15}, [r2] ;load first_pass filter
-
- beq firstpass_filter4x4_only
-
- vabs.s32 q12, q14 ;get abs(filter_parameters)
- vabs.s32 q13, q15
-
- sub r0, r0, #2 ;go back 2 columns of src data
- sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
-
-;First pass: output_height lines x output_width columns (9x4)
- vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vld1.u8 {q4}, [r0], r1
- vdup.8 d1, d24[4]
- vld1.u8 {q5}, [r0], r1
- vdup.8 d2, d25[0]
- vld1.u8 {q6}, [r0], r1
- vdup.8 d3, d25[4]
- vdup.8 d4, d26[0]
- vdup.8 d5, d26[4]
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d19, d8, d9, #5
- vext.8 d20, d10, d11, #5
- vext.8 d21, d12, d13, #5
-
- vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
- vswp d11, d12
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
- vzip.32 d20, d21
- vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
- vmull.u8 q8, d20, d5
-
- vmov q4, q3 ;keep original src data in q4 q6
- vmov q6, q5
-
- vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
- vzip.32 d10, d11
- vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
- vshr.u64 q10, q6, #8
- vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
- vmlal.u8 q8, d10, d0
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
- vzip.32 d20, d21
- vshr.u64 q3, q4, #32 ;construct src_ptr[2]
- vshr.u64 q5, q6, #32
- vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q8, d20, d1
-
- vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
- vzip.32 d10, d11
- vshr.u64 q9, q4, #16 ;construct src_ptr[0]
- vshr.u64 q10, q6, #16
- vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q8, d10, d4
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
- vzip.32 d20, d21
- vshr.u64 q3, q4, #24 ;construct src_ptr[1]
- vshr.u64 q5, q6, #24
- vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q8, d20, d2
-
- vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
- vzip.32 d10, d11
- vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q10, d10, d3
-
- vld1.u8 {q3}, [r0], r1 ;load remaining 5 lines of src data
- vld1.u8 {q4}, [r0], r1
-
- vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q10
-
- vld1.u8 {q5}, [r0], r1
- vld1.u8 {q6}, [r0], r1
-
- vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d28, q8, #7
-
- ;First Pass on the remaining 5 lines of data
- vld1.u8 {q11}, [r0], r1
-
- vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d19, d8, d9, #5
- vext.8 d20, d10, d11, #5
- vext.8 d21, d12, d13, #5
-
- vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
- vswp d11, d12
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
- vzip.32 d20, d21
- vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
- vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
- vmull.u8 q8, d20, d5
- vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5])
-
- vmov q4, q3 ;keep original src data in q4 q6
- vmov q6, q5
-
- vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
- vzip.32 d10, d11
- vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
- vshr.u64 q10, q6, #8
-
- vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
- vmlal.u8 q8, d10, d0
- vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
- vzip.32 d20, d21
- vshr.u64 q3, q4, #32 ;construct src_ptr[2]
- vshr.u64 q5, q6, #32
- vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]
-
- vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q8, d20, d1
- vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1])
-
- vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
- vzip.32 d10, d11
- vshr.u64 q9, q4, #16 ;construct src_ptr[0]
- vshr.u64 q10, q6, #16
- vext.8 d31, d22, d23, #4 ;construct src_ptr[2]
-
- vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q8, d10, d4
- vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4])
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
- vzip.32 d20, d21
- vshr.u64 q3, q4, #24 ;construct src_ptr[1]
- vshr.u64 q5, q6, #24
- vext.8 d31, d22, d23, #2 ;construct src_ptr[0]
-
- vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q8, d20, d2
- vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2])
-
- vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
- vzip.32 d10, d11
- vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
- vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q10, d10, d3
- vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3])
-
- add r3, r12, r3, lsl #5
-
- vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q10
- vqadd.s16 q12, q11
-
- vext.8 d23, d27, d28, #4
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
-
- vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d30, q8, #7
- vqrshrun.s16 d31, q12, #7
-
-;Second pass: 4x4
- vabs.s32 q7, q5
- vabs.s32 q8, q6
-
- vext.8 d24, d28, d29, #4
- vext.8 d25, d29, d30, #4
- vext.8 d26, d30, d31, #4
-
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vdup.8 d1, d14[4]
- vdup.8 d2, d15[0]
- vdup.8 d3, d15[4]
- vdup.8 d4, d16[0]
- vdup.8 d5, d16[4]
-
- vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q4, d28, d0
-
- vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
- vmull.u8 q6, d26, d5
-
- vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q4, d30, d4
-
- vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q6, d24, d1
-
- vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q4, d29, d2
-
- vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
- vmlal.u8 q6, d25, d3
-
- add r0, r4, lr
- add r1, r0, lr
- add r2, r1, lr
-
- vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q6, q4
-
- vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d4, q6, #7
-
- vst1.32 {d3[0]}, [r4] ;store result
- vst1.32 {d3[1]}, [r0]
- vst1.32 {d4[0]}, [r1]
- vst1.32 {d4[1]}, [r2]
-
- pop {r4, pc}
-
-
-;---------------------
-firstpass_filter4x4_only
- vabs.s32 q12, q14 ;get abs(filter_parameters)
- vabs.s32 q13, q15
-
- sub r0, r0, #2 ;go back 2 columns of src data
-
-;First pass: output_height lines x output_width columns (4x4)
- vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vld1.u8 {q4}, [r0], r1
- vdup.8 d1, d24[4]
- vld1.u8 {q5}, [r0], r1
- vdup.8 d2, d25[0]
- vld1.u8 {q6}, [r0], r1
-
- vdup.8 d3, d25[4]
- vdup.8 d4, d26[0]
- vdup.8 d5, d26[4]
-
- vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d19, d8, d9, #5
- vext.8 d20, d10, d11, #5
- vext.8 d21, d12, d13, #5
-
- vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
- vswp d11, d12
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
- vzip.32 d20, d21
- vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
- vmull.u8 q8, d20, d5
-
- vmov q4, q3 ;keep original src data in q4 q6
- vmov q6, q5
-
- vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
- vzip.32 d10, d11
- vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
- vshr.u64 q10, q6, #8
- vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
- vmlal.u8 q8, d10, d0
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
- vzip.32 d20, d21
- vshr.u64 q3, q4, #32 ;construct src_ptr[2]
- vshr.u64 q5, q6, #32
- vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q8, d20, d1
-
- vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
- vzip.32 d10, d11
- vshr.u64 q9, q4, #16 ;construct src_ptr[0]
- vshr.u64 q10, q6, #16
- vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q8, d10, d4
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
- vzip.32 d20, d21
- vshr.u64 q3, q4, #24 ;construct src_ptr[1]
- vshr.u64 q5, q6, #24
- vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q8, d20, d2
-
- vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
- vzip.32 d10, d11
- vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q10, d10, d3
-
- add r0, r4, lr
- add r1, r0, lr
- add r2, r1, lr
-
- vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q10
-
- vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d28, q8, #7
-
- vst1.32 {d27[0]}, [r4] ;store result
- vst1.32 {d27[1]}, [r0]
- vst1.32 {d28[0]}, [r1]
- vst1.32 {d28[1]}, [r2]
-
- pop {r4, pc}
-
-
-;---------------------
-secondpass_filter4x4_only
- sub r0, r0, r1, lsl #1
- add r3, r12, r3, lsl #5
-
- vld1.32 {d27[0]}, [r0], r1 ;load src data
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
- vld1.32 {d27[1]}, [r0], r1
- vabs.s32 q7, q5
- vld1.32 {d28[0]}, [r0], r1
- vabs.s32 q8, q6
- vld1.32 {d28[1]}, [r0], r1
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vld1.32 {d29[0]}, [r0], r1
- vdup.8 d1, d14[4]
- vld1.32 {d29[1]}, [r0], r1
- vdup.8 d2, d15[0]
- vld1.32 {d30[0]}, [r0], r1
- vdup.8 d3, d15[4]
- vld1.32 {d30[1]}, [r0], r1
- vdup.8 d4, d16[0]
- vld1.32 {d31[0]}, [r0], r1
- vdup.8 d5, d16[4]
-
- vext.8 d23, d27, d28, #4
- vext.8 d24, d28, d29, #4
- vext.8 d25, d29, d30, #4
- vext.8 d26, d30, d31, #4
-
- vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q4, d28, d0
-
- vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
- vmull.u8 q6, d26, d5
-
- vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q4, d30, d4
-
- vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q6, d24, d1
-
- vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q4, d29, d2
-
- vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
- vmlal.u8 q6, d25, d3
-
- add r0, r4, lr
- add r1, r0, lr
- add r2, r1, lr
-
- vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q6, q4
-
- vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d4, q6, #7
-
- vst1.32 {d3[0]}, [r4] ;store result
- vst1.32 {d3[1]}, [r0]
- vst1.32 {d4[0]}, [r1]
- vst1.32 {d4[1]}, [r2]
-
- pop {r4, pc}
-
- ENDP
-
-;-----------------
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm
deleted file mode 100644
index a57ec015f2c..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm
+++ /dev/null
@@ -1,473 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sixtap_predict8x4_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter8_coeff
- DCD 0, 0, 128, 0, 0, 0, 0, 0
- DCD 0, -6, 123, 12, -1, 0, 0, 0
- DCD 2, -11, 108, 36, -8, 1, 0, 0
- DCD 0, -9, 93, 50, -6, 0, 0, 0
- DCD 3, -16, 77, 77, -16, 3, 0, 0
- DCD 0, -6, 50, 93, -9, 0, 0, 0
- DCD 1, -8, 36, 108, -11, 2, 0, 0
- DCD 0, -1, 12, 123, -6, 0, 0, 0
-
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; r4 unsigned char *dst_ptr,
-; stack(r5) int dst_pitch
-
-|vp8_sixtap_predict8x4_neon| PROC
- push {r4-r5, lr}
-
- adr r12, filter8_coeff
- ldr r4, [sp, #12] ;load parameters from stack
- ldr r5, [sp, #16] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_filter8x4_only
-
- add r2, r12, r2, lsl #5 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
-
- vld1.s32 {q14, q15}, [r2] ;load first_pass filter
-
- beq firstpass_filter8x4_only
-
- sub sp, sp, #32 ;reserve space on stack for temporary storage
- vabs.s32 q12, q14
- vabs.s32 q13, q15
-
- sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
- mov lr, sp
- sub r0, r0, r1, lsl #1
-
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vdup.8 d1, d24[4]
- vdup.8 d2, d25[0]
-
-;First pass: output_height lines x output_width columns (9x8)
- vld1.u8 {q3}, [r0], r1 ;load src data
- vdup.8 d3, d25[4]
- vld1.u8 {q4}, [r0], r1
- vdup.8 d4, d26[0]
- vld1.u8 {q5}, [r0], r1
- vdup.8 d5, d26[4]
- vld1.u8 {q6}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q8, d8, d0
- vmull.u8 q9, d10, d0
- vmull.u8 q10, d12, d0
-
- vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d29, d8, d9, #1
- vext.8 d30, d10, d11, #1
- vext.8 d31, d12, d13, #1
-
- vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q8, d29, d1
- vmlsl.u8 q9, d30, d1
- vmlsl.u8 q10, d31, d1
-
- vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d29, d8, d9, #4
- vext.8 d30, d10, d11, #4
- vext.8 d31, d12, d13, #4
-
- vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q8, d29, d4
- vmlsl.u8 q9, d30, d4
- vmlsl.u8 q10, d31, d4
-
- vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d29, d8, d9, #2
- vext.8 d30, d10, d11, #2
- vext.8 d31, d12, d13, #2
-
- vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q8, d29, d2
- vmlal.u8 q9, d30, d2
- vmlal.u8 q10, d31, d2
-
- vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d29, d8, d9, #5
- vext.8 d30, d10, d11, #5
- vext.8 d31, d12, d13, #5
-
- vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q8, d29, d5
- vmlal.u8 q9, d30, d5
- vmlal.u8 q10, d31, d5
-
- vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d29, d8, d9, #3
- vext.8 d30, d10, d11, #3
- vext.8 d31, d12, d13, #3
-
- vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q4, d29, d3
- vmull.u8 q5, d30, d3
- vmull.u8 q6, d31, d3
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vld1.u8 {q3}, [r0], r1 ;load src data
-
- vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d23, q8, #7
- vqrshrun.s16 d24, q9, #7
- vqrshrun.s16 d25, q10, #7
-
- vld1.u8 {q4}, [r0], r1
- vst1.u8 {d22}, [lr]! ;store result
- vld1.u8 {q5}, [r0], r1
- vst1.u8 {d23}, [lr]!
- vld1.u8 {q6}, [r0], r1
- vst1.u8 {d24}, [lr]!
- vld1.u8 {q7}, [r0], r1
- vst1.u8 {d25}, [lr]!
-
- ;first_pass filtering on the remaining 5 lines of data
- vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q9, d8, d0
- vmull.u8 q10, d10, d0
- vmull.u8 q11, d12, d0
- vmull.u8 q12, d14, d0
-
- vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d28, d8, d9, #1
- vext.8 d29, d10, d11, #1
- vext.8 d30, d12, d13, #1
- vext.8 d31, d14, d15, #1
-
- vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q9, d28, d1
- vmlsl.u8 q10, d29, d1
- vmlsl.u8 q11, d30, d1
- vmlsl.u8 q12, d31, d1
-
- vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d28, d8, d9, #4
- vext.8 d29, d10, d11, #4
- vext.8 d30, d12, d13, #4
- vext.8 d31, d14, d15, #4
-
- vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q9, d28, d4
- vmlsl.u8 q10, d29, d4
- vmlsl.u8 q11, d30, d4
- vmlsl.u8 q12, d31, d4
-
- vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d28, d8, d9, #2
- vext.8 d29, d10, d11, #2
- vext.8 d30, d12, d13, #2
- vext.8 d31, d14, d15, #2
-
- vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q9, d28, d2
- vmlal.u8 q10, d29, d2
- vmlal.u8 q11, d30, d2
- vmlal.u8 q12, d31, d2
-
- vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d28, d8, d9, #5
- vext.8 d29, d10, d11, #5
- vext.8 d30, d12, d13, #5
- vext.8 d31, d14, d15, #5
-
- vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q9, d28, d5
- vmlal.u8 q10, d29, d5
- vmlal.u8 q11, d30, d5
- vmlal.u8 q12, d31, d5
-
- vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d28, d8, d9, #3
- vext.8 d29, d10, d11, #3
- vext.8 d30, d12, d13, #3
- vext.8 d31, d14, d15, #3
-
- vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q4, d28, d3
- vmull.u8 q5, d29, d3
- vmull.u8 q6, d30, d3
- vmull.u8 q7, d31, d3
-
- vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q9, q4
- vqadd.s16 q10, q5
- vqadd.s16 q11, q6
- vqadd.s16 q12, q7
-
- vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d27, q9, #7
- vqrshrun.s16 d28, q10, #7
- vqrshrun.s16 d29, q11, #7
- vqrshrun.s16 d30, q12, #7
-
-;Second pass: 8x4
-;secondpass_filter
- add r3, r12, r3, lsl #5
- sub lr, lr, #32
-
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
- vld1.u8 {q11}, [lr]!
-
- vabs.s32 q7, q5
- vabs.s32 q8, q6
-
- vld1.u8 {q12}, [lr]!
-
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vdup.8 d1, d14[4]
- vdup.8 d2, d15[0]
- vdup.8 d3, d15[4]
- vdup.8 d4, d16[0]
- vdup.8 d5, d16[4]
-
- vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q4, d23, d0
- vmull.u8 q5, d24, d0
- vmull.u8 q6, d25, d0
-
- vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q4, d24, d1
- vmlsl.u8 q5, d25, d1
- vmlsl.u8 q6, d26, d1
-
- vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q4, d27, d4
- vmlsl.u8 q5, d28, d4
- vmlsl.u8 q6, d29, d4
-
- vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q4, d25, d2
- vmlal.u8 q5, d26, d2
- vmlal.u8 q6, d27, d2
-
- vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q4, d28, d5
- vmlal.u8 q5, d29, d5
- vmlal.u8 q6, d30, d5
-
- vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q8, d26, d3
- vmull.u8 q9, d27, d3
- vmull.u8 q10, d28, d3
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q8, #7
- vqrshrun.s16 d8, q9, #7
- vqrshrun.s16 d9, q10, #7
-
- vst1.u8 {d6}, [r4], r5 ;store result
- vst1.u8 {d7}, [r4], r5
- vst1.u8 {d8}, [r4], r5
- vst1.u8 {d9}, [r4], r5
-
- add sp, sp, #32
- pop {r4-r5,pc}
-
-;--------------------
-firstpass_filter8x4_only
- vabs.s32 q12, q14
- vabs.s32 q13, q15
-
- sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
- vld1.u8 {q3}, [r0], r1 ;load src data
-
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vld1.u8 {q4}, [r0], r1
- vdup.8 d1, d24[4]
- vld1.u8 {q5}, [r0], r1
- vdup.8 d2, d25[0]
- vld1.u8 {q6}, [r0], r1
- vdup.8 d3, d25[4]
- vdup.8 d4, d26[0]
- vdup.8 d5, d26[4]
-
-;First pass: output_height lines x output_width columns (4x8)
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q8, d8, d0
- vmull.u8 q9, d10, d0
- vmull.u8 q10, d12, d0
-
- vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d29, d8, d9, #1
- vext.8 d30, d10, d11, #1
- vext.8 d31, d12, d13, #1
-
- vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q8, d29, d1
- vmlsl.u8 q9, d30, d1
- vmlsl.u8 q10, d31, d1
-
- vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d29, d8, d9, #4
- vext.8 d30, d10, d11, #4
- vext.8 d31, d12, d13, #4
-
- vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q8, d29, d4
- vmlsl.u8 q9, d30, d4
- vmlsl.u8 q10, d31, d4
-
- vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d29, d8, d9, #2
- vext.8 d30, d10, d11, #2
- vext.8 d31, d12, d13, #2
-
- vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q8, d29, d2
- vmlal.u8 q9, d30, d2
- vmlal.u8 q10, d31, d2
-
- vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d29, d8, d9, #5
- vext.8 d30, d10, d11, #5
- vext.8 d31, d12, d13, #5
-
- vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q8, d29, d5
- vmlal.u8 q9, d30, d5
- vmlal.u8 q10, d31, d5
-
- vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d29, d8, d9, #3
- vext.8 d30, d10, d11, #3
- vext.8 d31, d12, d13, #3
-
- vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q4, d29, d3
- vmull.u8 q5, d30, d3
- vmull.u8 q6, d31, d3
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d23, q8, #7
- vqrshrun.s16 d24, q9, #7
- vqrshrun.s16 d25, q10, #7
-
- vst1.u8 {d22}, [r4], r5 ;store result
- vst1.u8 {d23}, [r4], r5
- vst1.u8 {d24}, [r4], r5
- vst1.u8 {d25}, [r4], r5
-
- pop {r4-r5,pc}
-
-;---------------------
-secondpass_filter8x4_only
-;Second pass: 8x4
- add r3, r12, r3, lsl #5
- sub r0, r0, r1, lsl #1
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
- vabs.s32 q7, q5
- vabs.s32 q8, q6
-
- vld1.u8 {d22}, [r0], r1
- vld1.u8 {d23}, [r0], r1
- vld1.u8 {d24}, [r0], r1
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vld1.u8 {d25}, [r0], r1
- vdup.8 d1, d14[4]
- vld1.u8 {d26}, [r0], r1
- vdup.8 d2, d15[0]
- vld1.u8 {d27}, [r0], r1
- vdup.8 d3, d15[4]
- vld1.u8 {d28}, [r0], r1
- vdup.8 d4, d16[0]
- vld1.u8 {d29}, [r0], r1
- vdup.8 d5, d16[4]
- vld1.u8 {d30}, [r0], r1
-
- vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q4, d23, d0
- vmull.u8 q5, d24, d0
- vmull.u8 q6, d25, d0
-
- vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q4, d24, d1
- vmlsl.u8 q5, d25, d1
- vmlsl.u8 q6, d26, d1
-
- vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q4, d27, d4
- vmlsl.u8 q5, d28, d4
- vmlsl.u8 q6, d29, d4
-
- vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q4, d25, d2
- vmlal.u8 q5, d26, d2
- vmlal.u8 q6, d27, d2
-
- vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q4, d28, d5
- vmlal.u8 q5, d29, d5
- vmlal.u8 q6, d30, d5
-
- vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q8, d26, d3
- vmull.u8 q9, d27, d3
- vmull.u8 q10, d28, d3
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q8, #7
- vqrshrun.s16 d8, q9, #7
- vqrshrun.s16 d9, q10, #7
-
- vst1.u8 {d6}, [r4], r5 ;store result
- vst1.u8 {d7}, [r4], r5
- vst1.u8 {d8}, [r4], r5
- vst1.u8 {d9}, [r4], r5
-
- pop {r4-r5,pc}
-
- ENDP
-
-;-----------------
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm
deleted file mode 100644
index 00ed5aeefe3..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm
+++ /dev/null
@@ -1,524 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sixtap_predict8x8_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter8_coeff
- DCD 0, 0, 128, 0, 0, 0, 0, 0
- DCD 0, -6, 123, 12, -1, 0, 0, 0
- DCD 2, -11, 108, 36, -8, 1, 0, 0
- DCD 0, -9, 93, 50, -6, 0, 0, 0
- DCD 3, -16, 77, 77, -16, 3, 0, 0
- DCD 0, -6, 50, 93, -9, 0, 0, 0
- DCD 1, -8, 36, 108, -11, 2, 0, 0
- DCD 0, -1, 12, 123, -6, 0, 0, 0
-
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pitch
-
-|vp8_sixtap_predict8x8_neon| PROC
- push {r4-r5, lr}
-
- adr r12, filter8_coeff
-
- ldr r4, [sp, #12] ;load parameters from stack
- ldr r5, [sp, #16] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_filter8x8_only
-
- add r2, r12, r2, lsl #5 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
-
- vld1.s32 {q14, q15}, [r2] ;load first_pass filter
-
- beq firstpass_filter8x8_only
-
- sub sp, sp, #64 ;reserve space on stack for temporary storage
- mov lr, sp
-
- vabs.s32 q12, q14
- vabs.s32 q13, q15
-
- mov r2, #2 ;loop counter
- sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
- sub r0, r0, r1, lsl #1
-
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vdup.8 d1, d24[4]
- vdup.8 d2, d25[0]
-
-;First pass: output_height lines x output_width columns (13x8)
- vld1.u8 {q3}, [r0], r1 ;load src data
- vdup.8 d3, d25[4]
- vld1.u8 {q4}, [r0], r1
- vdup.8 d4, d26[0]
- vld1.u8 {q5}, [r0], r1
- vdup.8 d5, d26[4]
- vld1.u8 {q6}, [r0], r1
-
-filt_blk2d_fp8x8_loop_neon
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q8, d8, d0
- vmull.u8 q9, d10, d0
- vmull.u8 q10, d12, d0
-
- vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d29, d8, d9, #1
- vext.8 d30, d10, d11, #1
- vext.8 d31, d12, d13, #1
-
- vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q8, d29, d1
- vmlsl.u8 q9, d30, d1
- vmlsl.u8 q10, d31, d1
-
- vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d29, d8, d9, #4
- vext.8 d30, d10, d11, #4
- vext.8 d31, d12, d13, #4
-
- vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q8, d29, d4
- vmlsl.u8 q9, d30, d4
- vmlsl.u8 q10, d31, d4
-
- vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d29, d8, d9, #2
- vext.8 d30, d10, d11, #2
- vext.8 d31, d12, d13, #2
-
- vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q8, d29, d2
- vmlal.u8 q9, d30, d2
- vmlal.u8 q10, d31, d2
-
- vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d29, d8, d9, #5
- vext.8 d30, d10, d11, #5
- vext.8 d31, d12, d13, #5
-
- vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q8, d29, d5
- vmlal.u8 q9, d30, d5
- vmlal.u8 q10, d31, d5
-
- vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d29, d8, d9, #3
- vext.8 d30, d10, d11, #3
- vext.8 d31, d12, d13, #3
-
- vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q4, d29, d3
- vmull.u8 q5, d30, d3
- vmull.u8 q6, d31, d3
-
- subs r2, r2, #1
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vld1.u8 {q3}, [r0], r1 ;load src data
-
- vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d23, q8, #7
- vqrshrun.s16 d24, q9, #7
- vqrshrun.s16 d25, q10, #7
-
- vst1.u8 {d22}, [lr]! ;store result
- vld1.u8 {q4}, [r0], r1
- vst1.u8 {d23}, [lr]!
- vld1.u8 {q5}, [r0], r1
- vst1.u8 {d24}, [lr]!
- vld1.u8 {q6}, [r0], r1
- vst1.u8 {d25}, [lr]!
-
- bne filt_blk2d_fp8x8_loop_neon
-
- ;first_pass filtering on the remaining 5 lines of data
- ;vld1.u8 {q3}, [r0], r1 ;load src data
- ;vld1.u8 {q4}, [r0], r1
- ;vld1.u8 {q5}, [r0], r1
- ;vld1.u8 {q6}, [r0], r1
- vld1.u8 {q7}, [r0], r1
-
- vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q9, d8, d0
- vmull.u8 q10, d10, d0
- vmull.u8 q11, d12, d0
- vmull.u8 q12, d14, d0
-
- vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d28, d8, d9, #1
- vext.8 d29, d10, d11, #1
- vext.8 d30, d12, d13, #1
- vext.8 d31, d14, d15, #1
-
- vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q9, d28, d1
- vmlsl.u8 q10, d29, d1
- vmlsl.u8 q11, d30, d1
- vmlsl.u8 q12, d31, d1
-
- vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d28, d8, d9, #4
- vext.8 d29, d10, d11, #4
- vext.8 d30, d12, d13, #4
- vext.8 d31, d14, d15, #4
-
- vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q9, d28, d4
- vmlsl.u8 q10, d29, d4
- vmlsl.u8 q11, d30, d4
- vmlsl.u8 q12, d31, d4
-
- vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d28, d8, d9, #2
- vext.8 d29, d10, d11, #2
- vext.8 d30, d12, d13, #2
- vext.8 d31, d14, d15, #2
-
- vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q9, d28, d2
- vmlal.u8 q10, d29, d2
- vmlal.u8 q11, d30, d2
- vmlal.u8 q12, d31, d2
-
- vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d28, d8, d9, #5
- vext.8 d29, d10, d11, #5
- vext.8 d30, d12, d13, #5
- vext.8 d31, d14, d15, #5
-
- vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q9, d28, d5
- vmlal.u8 q10, d29, d5
- vmlal.u8 q11, d30, d5
- vmlal.u8 q12, d31, d5
-
- vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d28, d8, d9, #3
- vext.8 d29, d10, d11, #3
- vext.8 d30, d12, d13, #3
- vext.8 d31, d14, d15, #3
-
- vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q4, d28, d3
- vmull.u8 q5, d29, d3
- vmull.u8 q6, d30, d3
- vmull.u8 q7, d31, d3
-
- vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q9, q4
- vqadd.s16 q10, q5
- vqadd.s16 q11, q6
- vqadd.s16 q12, q7
-
- add r3, r12, r3, lsl #5
-
- vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
- sub lr, lr, #64
- vqrshrun.s16 d27, q9, #7
- vld1.u8 {q9}, [lr]! ;load intermediate data from stack
- vqrshrun.s16 d28, q10, #7
- vld1.u8 {q10}, [lr]!
-
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
-
- vqrshrun.s16 d29, q11, #7
- vld1.u8 {q11}, [lr]!
-
- vabs.s32 q7, q5
- vabs.s32 q8, q6
-
- vqrshrun.s16 d30, q12, #7
- vld1.u8 {q12}, [lr]!
-
-;Second pass: 8x8
- mov r3, #2 ;loop counter
-
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vdup.8 d1, d14[4]
- vdup.8 d2, d15[0]
- vdup.8 d3, d15[4]
- vdup.8 d4, d16[0]
- vdup.8 d5, d16[4]
-
-filt_blk2d_sp8x8_loop_neon
- vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q4, d19, d0
- vmull.u8 q5, d20, d0
- vmull.u8 q6, d21, d0
-
- vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q4, d20, d1
- vmlsl.u8 q5, d21, d1
- vmlsl.u8 q6, d22, d1
-
- vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q4, d23, d4
- vmlsl.u8 q5, d24, d4
- vmlsl.u8 q6, d25, d4
-
- vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q4, d21, d2
- vmlal.u8 q5, d22, d2
- vmlal.u8 q6, d23, d2
-
- vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q4, d24, d5
- vmlal.u8 q5, d25, d5
- vmlal.u8 q6, d26, d5
-
- vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q8, d22, d3
- vmull.u8 q9, d23, d3
- vmull.u8 q10, d24, d3
-
- subs r3, r3, #1
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q8, #7
- vqrshrun.s16 d8, q9, #7
- vqrshrun.s16 d9, q10, #7
-
- vmov q9, q11
- vst1.u8 {d6}, [r4], r5 ;store result
- vmov q10, q12
- vst1.u8 {d7}, [r4], r5
- vmov q11, q13
- vst1.u8 {d8}, [r4], r5
- vmov q12, q14
- vst1.u8 {d9}, [r4], r5
- vmov d26, d30
-
- bne filt_blk2d_sp8x8_loop_neon
-
- add sp, sp, #64
- pop {r4-r5,pc}
-
-;---------------------
-firstpass_filter8x8_only
- ;add r2, r12, r2, lsl #5 ;calculate filter location
- ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter
- vabs.s32 q12, q14
- vabs.s32 q13, q15
-
- mov r2, #2 ;loop counter
- sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
-
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vdup.8 d1, d24[4]
- vdup.8 d2, d25[0]
- vdup.8 d3, d25[4]
- vdup.8 d4, d26[0]
- vdup.8 d5, d26[4]
-
-;First pass: output_height lines x output_width columns (8x8)
-filt_blk2d_fpo8x8_loop_neon
- vld1.u8 {q3}, [r0], r1 ;load src data
- vld1.u8 {q4}, [r0], r1
- vld1.u8 {q5}, [r0], r1
- vld1.u8 {q6}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q8, d8, d0
- vmull.u8 q9, d10, d0
- vmull.u8 q10, d12, d0
-
- vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d29, d8, d9, #1
- vext.8 d30, d10, d11, #1
- vext.8 d31, d12, d13, #1
-
- vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q8, d29, d1
- vmlsl.u8 q9, d30, d1
- vmlsl.u8 q10, d31, d1
-
- vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d29, d8, d9, #4
- vext.8 d30, d10, d11, #4
- vext.8 d31, d12, d13, #4
-
- vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q8, d29, d4
- vmlsl.u8 q9, d30, d4
- vmlsl.u8 q10, d31, d4
-
- vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d29, d8, d9, #2
- vext.8 d30, d10, d11, #2
- vext.8 d31, d12, d13, #2
-
- vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q8, d29, d2
- vmlal.u8 q9, d30, d2
- vmlal.u8 q10, d31, d2
-
- vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d29, d8, d9, #5
- vext.8 d30, d10, d11, #5
- vext.8 d31, d12, d13, #5
-
- vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q8, d29, d5
- vmlal.u8 q9, d30, d5
- vmlal.u8 q10, d31, d5
-
- vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d29, d8, d9, #3
- vext.8 d30, d10, d11, #3
- vext.8 d31, d12, d13, #3
-
- vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q4, d29, d3
- vmull.u8 q5, d30, d3
- vmull.u8 q6, d31, d3
- ;
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- subs r2, r2, #1
-
- vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d23, q8, #7
- vqrshrun.s16 d24, q9, #7
- vqrshrun.s16 d25, q10, #7
-
- vst1.u8 {d22}, [r4], r5 ;store result
- vst1.u8 {d23}, [r4], r5
- vst1.u8 {d24}, [r4], r5
- vst1.u8 {d25}, [r4], r5
-
- bne filt_blk2d_fpo8x8_loop_neon
-
- pop {r4-r5,pc}
-
-;---------------------
-secondpass_filter8x8_only
- sub r0, r0, r1, lsl #1
- add r3, r12, r3, lsl #5
-
- vld1.u8 {d18}, [r0], r1 ;load src data
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
- vld1.u8 {d19}, [r0], r1
- vabs.s32 q7, q5
- vld1.u8 {d20}, [r0], r1
- vabs.s32 q8, q6
- vld1.u8 {d21}, [r0], r1
- mov r3, #2 ;loop counter
- vld1.u8 {d22}, [r0], r1
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vld1.u8 {d23}, [r0], r1
- vdup.8 d1, d14[4]
- vld1.u8 {d24}, [r0], r1
- vdup.8 d2, d15[0]
- vld1.u8 {d25}, [r0], r1
- vdup.8 d3, d15[4]
- vld1.u8 {d26}, [r0], r1
- vdup.8 d4, d16[0]
- vld1.u8 {d27}, [r0], r1
- vdup.8 d5, d16[4]
- vld1.u8 {d28}, [r0], r1
- vld1.u8 {d29}, [r0], r1
- vld1.u8 {d30}, [r0], r1
-
-;Second pass: 8x8
-filt_blk2d_spo8x8_loop_neon
- vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
- vmull.u8 q4, d19, d0
- vmull.u8 q5, d20, d0
- vmull.u8 q6, d21, d0
-
- vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
- vmlsl.u8 q4, d20, d1
- vmlsl.u8 q5, d21, d1
- vmlsl.u8 q6, d22, d1
-
- vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
- vmlsl.u8 q4, d23, d4
- vmlsl.u8 q5, d24, d4
- vmlsl.u8 q6, d25, d4
-
- vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
- vmlal.u8 q4, d21, d2
- vmlal.u8 q5, d22, d2
- vmlal.u8 q6, d23, d2
-
- vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
- vmlal.u8 q4, d24, d5
- vmlal.u8 q5, d25, d5
- vmlal.u8 q6, d26, d5
-
- vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
- vmull.u8 q8, d22, d3
- vmull.u8 q9, d23, d3
- vmull.u8 q10, d24, d3
-
- subs r3, r3, #1
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q8, #7
- vqrshrun.s16 d8, q9, #7
- vqrshrun.s16 d9, q10, #7
-
- vmov q9, q11
- vst1.u8 {d6}, [r4], r5 ;store result
- vmov q10, q12
- vst1.u8 {d7}, [r4], r5
- vmov q11, q13
- vst1.u8 {d8}, [r4], r5
- vmov q12, q14
- vst1.u8 {d9}, [r4], r5
- vmov d26, d30
-
- bne filt_blk2d_spo8x8_loop_neon
-
- pop {r4-r5,pc}
-
- ENDP
-
-;-----------------
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
new file mode 100644
index 00000000000..7a4d9e05128
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
@@ -0,0 +1,1752 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#ifdef _MSC_VER
+#define __builtin_prefetch(x)
+#endif
+
+static const int8_t vp8_sub_pel_filters[8][8] = {
+ {0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positions are */
+ {0, -6, 123, 12, -1, 0, 0, 0}, /* just as per alpha -0.5 bicubic */
+ {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */
+ {0, -9, 93, 50, -6, 0, 0, 0},
+ {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */
+ {0, -6, 50, 93, -9, 0, 0, 0},
+ {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */
+ {0, -1, 12, 123, -6, 0, 0, 0},
+};
+
+void vp8_sixtap_predict4x4_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char *src;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8;
+ uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint32x2_t d27u32, d28u32, d29u32, d30u32, d31u32;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8;
+ uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64;
+ uint32x2x2_t d0u32x2, d1u32x2;
+
+ if (xoffset == 0) { // secondpass_filter4x4_only
+ // load second_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data
+ src = src_ptr - src_pixels_per_line * 2;
+ d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0);
+ src += src_pixels_per_line;
+ d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1);
+ src += src_pixels_per_line;
+ d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0);
+ src += src_pixels_per_line;
+ d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1);
+ src += src_pixels_per_line;
+ d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0);
+ src += src_pixels_per_line;
+ d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1);
+ src += src_pixels_per_line;
+ d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0);
+ src += src_pixels_per_line;
+ d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1);
+ src += src_pixels_per_line;
+ d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0);
+
+ d27u8 = vreinterpret_u8_u32(d27u32);
+ d28u8 = vreinterpret_u8_u32(d28u32);
+ d29u8 = vreinterpret_u8_u32(d29u32);
+ d30u8 = vreinterpret_u8_u32(d30u32);
+ d31u8 = vreinterpret_u8_u32(d31u32);
+
+ d23u8 = vext_u8(d27u8, d28u8, 4);
+ d24u8 = vext_u8(d28u8, d29u8, 4);
+ d25u8 = vext_u8(d29u8, d30u8, 4);
+ d26u8 = vext_u8(d30u8, d31u8, 4);
+
+ q3u16 = vmull_u8(d27u8, d0u8);
+ q4u16 = vmull_u8(d28u8, d0u8);
+ q5u16 = vmull_u8(d25u8, d5u8);
+ q6u16 = vmull_u8(d26u8, d5u8);
+
+ q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
+
+ q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
+ q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+
+ q5s16 = vqaddq_s16(q5s16, q3s16);
+ q6s16 = vqaddq_s16(q6s16, q4s16);
+
+ d3u8 = vqrshrun_n_s16(q5s16, 7);
+ d4u8 = vqrshrun_n_s16(q6s16, 7);
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
+ return;
+ }
+
+ // load first_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: output_height lines x output_width columns (9x4)
+
+ if (yoffset == 0) // firstpass_filter4x4_only
+ src = src_ptr - 2;
+ else
+ src = src_ptr - 2 - (src_pixels_per_line * 2);
+
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+
+ d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+ // vswp here
+ q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
+ q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19
+ vreinterpret_u32_u8(d19u8));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21
+ vreinterpret_u32_u8(d21u8));
+ q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
+ q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
+
+ // keep original src data in q4 q6
+ q4u64 = vreinterpretq_u64_u8(q3u8);
+ q6u64 = vreinterpretq_u64_u8(q5u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7
+ vreinterpret_u32_u8(vget_high_u8(q3u8)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11
+ vreinterpret_u32_u8(vget_high_u8(q5u8)));
+ q9u64 = vshrq_n_u64(q4u64, 8);
+ q10u64 = vshrq_n_u64(q6u64, 8);
+ q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
+ q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19
+ vreinterpret_u32_u64(vget_high_u64(q9u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21
+ vreinterpret_u32_u64(vget_high_u64(q10u64)));
+ q3u64 = vshrq_n_u64(q4u64, 32);
+ q5u64 = vshrq_n_u64(q6u64, 32);
+ q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
+ q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7
+ vreinterpret_u32_u64(vget_high_u64(q3u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11
+ vreinterpret_u32_u64(vget_high_u64(q5u64)));
+ q9u64 = vshrq_n_u64(q4u64, 16);
+ q10u64 = vshrq_n_u64(q6u64, 16);
+ q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
+ q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19
+ vreinterpret_u32_u64(vget_high_u64(q9u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21
+ vreinterpret_u32_u64(vget_high_u64(q10u64)));
+ q3u64 = vshrq_n_u64(q4u64, 24);
+ q5u64 = vshrq_n_u64(q6u64, 24);
+ q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
+ q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7
+ vreinterpret_u32_u64(vget_high_u64(q3u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11
+ vreinterpret_u32_u64(vget_high_u64(q5u64)));
+ q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
+ q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
+
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q7s16 = vqaddq_s16(q7s16, q9s16);
+ q8s16 = vqaddq_s16(q8s16, q10s16);
+
+ d27u8 = vqrshrun_n_s16(q7s16, 7);
+ d28u8 = vqrshrun_n_s16(q8s16, 7);
+
+ if (yoffset == 0) { // firstpass_filter4x4_only
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
+ return;
+ }
+
+ // First Pass on the remaining 5 lines of data
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q11u8 = vld1q_u8(src);
+
+ d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+ // vswp here
+ q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
+ q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19
+ vreinterpret_u32_u8(d19u8));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21
+ vreinterpret_u32_u8(d21u8));
+ d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5);
+ q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
+ q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
+ q12u16 = vmull_u8(d31u8, d5u8);
+
+ q4u64 = vreinterpretq_u64_u8(q3u8);
+ q6u64 = vreinterpretq_u64_u8(q5u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7
+ vreinterpret_u32_u8(vget_high_u8(q3u8)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11
+ vreinterpret_u32_u8(vget_high_u8(q5u8)));
+ q9u64 = vshrq_n_u64(q4u64, 8);
+ q10u64 = vshrq_n_u64(q6u64, 8);
+ q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
+ q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
+ q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19
+ vreinterpret_u32_u64(vget_high_u64(q9u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21
+ vreinterpret_u32_u64(vget_high_u64(q10u64)));
+ q3u64 = vshrq_n_u64(q4u64, 32);
+ q5u64 = vshrq_n_u64(q6u64, 32);
+ d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1);
+ q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
+ q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7
+ vreinterpret_u32_u64(vget_high_u64(q3u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11
+ vreinterpret_u32_u64(vget_high_u64(q5u64)));
+ q9u64 = vshrq_n_u64(q4u64, 16);
+ q10u64 = vshrq_n_u64(q6u64, 16);
+ d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4);
+ q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
+ q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19
+ vreinterpret_u32_u64(vget_high_u64(q9u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21
+ vreinterpret_u32_u64(vget_high_u64(q10u64)));
+ q3u64 = vshrq_n_u64(q4u64, 24);
+ q5u64 = vshrq_n_u64(q6u64, 24);
+ d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2);
+ q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
+ q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7
+ vreinterpret_u32_u64(vget_high_u64(q3u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11
+ vreinterpret_u32_u64(vget_high_u64(q5u64)));
+ d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3);
+ q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
+ q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
+ q11u16 = vmull_u8(d31u8, d3u8);
+
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+ q7s16 = vqaddq_s16(q7s16, q9s16);
+ q8s16 = vqaddq_s16(q8s16, q10s16);
+ q12s16 = vqaddq_s16(q12s16, q11s16);
+
+ d29u8 = vqrshrun_n_s16(q7s16, 7);
+ d30u8 = vqrshrun_n_s16(q8s16, 7);
+ d31u8 = vqrshrun_n_s16(q12s16, 7);
+
+ // Second pass: 4x4
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ d23u8 = vext_u8(d27u8, d28u8, 4);
+ d24u8 = vext_u8(d28u8, d29u8, 4);
+ d25u8 = vext_u8(d29u8, d30u8, 4);
+ d26u8 = vext_u8(d30u8, d31u8, 4);
+
+ q3u16 = vmull_u8(d27u8, d0u8);
+ q4u16 = vmull_u8(d28u8, d0u8);
+ q5u16 = vmull_u8(d25u8, d5u8);
+ q6u16 = vmull_u8(d26u8, d5u8);
+
+ q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
+
+ q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
+ q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+
+ q5s16 = vqaddq_s16(q5s16, q3s16);
+ q6s16 = vqaddq_s16(q6s16, q4s16);
+
+ d3u8 = vqrshrun_n_s16(q5s16, 7);
+ d4u8 = vqrshrun_n_s16(q6s16, 7);
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
+ return;
+}
+
+void vp8_sixtap_predict8x4_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char *src;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
+ uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
+
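+ // A zero xoffset (or yoffset) means no sub-pel filtering is needed in that
+ // dimension, so the function takes a single-pass vertical-only (or
+ // horizontal-only) path instead of the full two-pass filter.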
+ if (xoffset == 0) { // secondpass_filter8x4_only
+ // load second_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data
+ src = src_ptr - src_pixels_per_line * 2;
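+ // (the vertical six-tap kernel spans two rows above and three rows below
+ // each output row, hence 4 + 5 = 9 source rows for four output rows)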
+ d22u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d23u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d24u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d25u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d26u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d27u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d28u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d29u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d30u8 = vld1_u8(src);
+
+ q3u16 = vmull_u8(d22u8, d0u8);
+ q4u16 = vmull_u8(d23u8, d0u8);
+ q5u16 = vmull_u8(d24u8, d0u8);
+ q6u16 = vmull_u8(d25u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
+ q7u16 = vmull_u8(d25u8, d3u8);
+ q8u16 = vmull_u8(d26u8, d3u8);
+ q9u16 = vmull_u8(d27u8, d3u8);
+ q10u16 = vmull_u8(d28u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ return;
+ }
+
+ // load first_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: output_height lines x output_width columns (9x8)
+ if (yoffset == 0) // firstpass_filter8x4_only
+ src = src_ptr - 2;
+ else
+ src = src_ptr - 2 - (src_pixels_per_line * 2);
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+
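+ // Horizontal 6-tap filter: vext_u8 slides the 16-byte source row to line
+ // the pixels up with each tap. Taps 1 and 4 are non-positive in VP8's
+ // filters, hence vmlsl; tap 3 is accumulated separately and folded in below
+ // with a saturating add (vqaddq_s16), which keeps the sums from wrapping.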
+ q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+ q3u16 = vmull_u8(d28u8, d3u8);
+ q4u16 = vmull_u8(d29u8, d3u8);
+ q5u16 = vmull_u8(d30u8, d3u8);
+ q6u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d22u8 = vqrshrun_n_s16(q7s16, 7);
+ d23u8 = vqrshrun_n_s16(q8s16, 7);
+ d24u8 = vqrshrun_n_s16(q9s16, 7);
+ d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+ if (yoffset == 0) { // firstpass_filter8x4_only
+ vst1_u8(dst_ptr, d22u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d23u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d24u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d25u8);
+ return;
+ }
+
+ // First pass on the remaining 5 lines of data
+ src += src_pixels_per_line;
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q7u8 = vld1q_u8(src);
+
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+ q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+ q3u16 = vmull_u8(d27u8, d3u8);
+ q4u16 = vmull_u8(d28u8, d3u8);
+ q5u16 = vmull_u8(d29u8, d3u8);
+ q6u16 = vmull_u8(d30u8, d3u8);
+ q7u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+
+ q8s16 = vqaddq_s16(q8s16, q3s16);
+ q9s16 = vqaddq_s16(q9s16, q4s16);
+ q10s16 = vqaddq_s16(q10s16, q5s16);
+ q11s16 = vqaddq_s16(q11s16, q6s16);
+ q12s16 = vqaddq_s16(q12s16, q7s16);
+
+ d26u8 = vqrshrun_n_s16(q8s16, 7);
+ d27u8 = vqrshrun_n_s16(q9s16, 7);
+ d28u8 = vqrshrun_n_s16(q10s16, 7);
+ d29u8 = vqrshrun_n_s16(q11s16, 7);
+ d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+ // Second pass: 8x4
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ q3u16 = vmull_u8(d22u8, d0u8);
+ q4u16 = vmull_u8(d23u8, d0u8);
+ q5u16 = vmull_u8(d24u8, d0u8);
+ q6u16 = vmull_u8(d25u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
+ q7u16 = vmull_u8(d25u8, d3u8);
+ q8u16 = vmull_u8(d26u8, d3u8);
+ q9u16 = vmull_u8(d27u8, d3u8);
+ q10u16 = vmull_u8(d28u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ return;
+}
+
+void vp8_sixtap_predict8x8_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char *src, *tmpp;
+ unsigned char tmp[64];
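+ // tmp holds the first 8 rows of first-pass output (8 x 8 bytes); the 5
+ // extra rows needed by the vertical filter are produced afterwards and
+ // kept in registers d26-d30.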
+ int i;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
+ uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
+
+ if (xoffset == 0) { // secondpass_filter8x8_only
+ // load second_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data
+ src = src_ptr - src_pixels_per_line * 2;
+ d18u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d19u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d20u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d21u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d22u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d23u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d24u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d25u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d26u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d27u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d28u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d29u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d30u8 = vld1_u8(src);
+
+ for (i = 2; i > 0; i--) {
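+ // Each iteration filters four output rows vertically, then the d18-d26
+ // row window slides down four rows for the next iteration.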
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+ d23u8 = d27u8;
+ d24u8 = d28u8;
+ d25u8 = d29u8;
+ d26u8 = d30u8;
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ dst_ptr += dst_pitch;
+ }
+ return;
+ }
+
+ // load first_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: output_height lines x output_width columns (13x8)
+ if (yoffset == 0) // firstpass_filter8x8_only
+ src = src_ptr - 2;
+ else
+ src = src_ptr - 2 - (src_pixels_per_line * 2);
+
+ tmpp = tmp;
+ for (i = 2; i > 0; i--) {
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+
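+ // hint the next source rows into cache; this likely hides the load
+ // latency behind the multiply-accumulate chain below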
+ __builtin_prefetch(src);
+ __builtin_prefetch(src + src_pixels_per_line);
+ __builtin_prefetch(src + src_pixels_per_line * 2);
+
+ q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+ q3u16 = vmull_u8(d28u8, d3u8);
+ q4u16 = vmull_u8(d29u8, d3u8);
+ q5u16 = vmull_u8(d30u8, d3u8);
+ q6u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d22u8 = vqrshrun_n_s16(q7s16, 7);
+ d23u8 = vqrshrun_n_s16(q8s16, 7);
+ d24u8 = vqrshrun_n_s16(q9s16, 7);
+ d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+ if (yoffset == 0) { // firstpass_filter8x8_only
+ vst1_u8(dst_ptr, d22u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d23u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d24u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d25u8);
+ dst_ptr += dst_pitch;
+ } else {
+ vst1_u8(tmpp, d22u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d23u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d24u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d25u8);
+ tmpp += 8;
+ }
+ }
+ if (yoffset == 0)
+ return;
+
+ // First pass on the remaining 5 lines of data
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q7u8 = vld1q_u8(src);
+
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+ q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+ q3u16 = vmull_u8(d27u8, d3u8);
+ q4u16 = vmull_u8(d28u8, d3u8);
+ q5u16 = vmull_u8(d29u8, d3u8);
+ q6u16 = vmull_u8(d30u8, d3u8);
+ q7u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+
+ q8s16 = vqaddq_s16(q8s16, q3s16);
+ q9s16 = vqaddq_s16(q9s16, q4s16);
+ q10s16 = vqaddq_s16(q10s16, q5s16);
+ q11s16 = vqaddq_s16(q11s16, q6s16);
+ q12s16 = vqaddq_s16(q12s16, q7s16);
+
+ d26u8 = vqrshrun_n_s16(q8s16, 7);
+ d27u8 = vqrshrun_n_s16(q9s16, 7);
+ d28u8 = vqrshrun_n_s16(q10s16, 7);
+ d29u8 = vqrshrun_n_s16(q11s16, 7);
+ d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+ // Second pass: 8x8
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ tmpp = tmp;
+ q9u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q10u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q11u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q12u8 = vld1q_u8(tmpp);
+
+ d18u8 = vget_low_u8(q9u8);
+ d19u8 = vget_high_u8(q9u8);
+ d20u8 = vget_low_u8(q10u8);
+ d21u8 = vget_high_u8(q10u8);
+ d22u8 = vget_low_u8(q11u8);
+ d23u8 = vget_high_u8(q11u8);
+ d24u8 = vget_low_u8(q12u8);
+ d25u8 = vget_high_u8(q12u8);
+
+ for (i = 2; i > 0; i--) {
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+ d23u8 = d27u8;
+ d24u8 = d28u8;
+ d25u8 = d29u8;
+ d26u8 = d30u8;
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ dst_ptr += dst_pitch;
+ }
+ return;
+}
+
+void vp8_sixtap_predict16x16_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char *src, *src_tmp, *dst, *tmpp;
+ unsigned char tmp[336];
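+ // 336 = 21 rows x 16 columns: the 16x16 vertical pass consumes 16 + 5
+ // first-pass rows.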
+ int i, j;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
+ uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
+ uint8x8_t d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint8x16_t q3u8, q4u8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
+ uint16x8_t q11u16, q12u16, q13u16, q15u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
+ int16x8_t q11s16, q12s16, q13s16, q15s16;
+
+ if (xoffset == 0) { // secondpass_filter16x16_only
+ // load second_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data
+ src_tmp = src_ptr - src_pixels_per_line * 2;
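+ // the 16-wide block is processed as two independent 8-wide halves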
+ for (i = 0; i < 2; i++) {
+ src = src_tmp + i * 8;
+ dst = dst_ptr + i * 8;
+ d18u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d19u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d20u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d21u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d22u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ for (j = 0; j < 4; j++) {
+ d23u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d24u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d25u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d26u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+
+ vst1_u8(dst, d6u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d7u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d8u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d9u8);
+ dst += dst_pitch;
+ }
+ }
+ return;
+ }
+
+ // load first_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: output_height lines x output_width columns (21x16)
+ if (yoffset == 0) { // firstpass_filter16x16_only
+ src = src_ptr - 2;
+ dst = dst_ptr;
+ for (i = 0; i < 8; i++) {
+ d6u8 = vld1_u8(src);
+ d7u8 = vld1_u8(src + 8);
+ d8u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+ d9u8 = vld1_u8(src);
+ d10u8 = vld1_u8(src + 8);
+ d11u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+
+ __builtin_prefetch(src);
+ __builtin_prefetch(src + src_pixels_per_line);
+
+ q6u16 = vmull_u8(d6u8, d0u8);
+ q7u16 = vmull_u8(d7u8, d0u8);
+ q8u16 = vmull_u8(d9u8, d0u8);
+ q9u16 = vmull_u8(d10u8, d0u8);
+
+ d20u8 = vext_u8(d6u8, d7u8, 1);
+ d21u8 = vext_u8(d9u8, d10u8, 1);
+ d22u8 = vext_u8(d7u8, d8u8, 1);
+ d23u8 = vext_u8(d10u8, d11u8, 1);
+ d24u8 = vext_u8(d6u8, d7u8, 4);
+ d25u8 = vext_u8(d9u8, d10u8, 4);
+ d26u8 = vext_u8(d7u8, d8u8, 4);
+ d27u8 = vext_u8(d10u8, d11u8, 4);
+ d28u8 = vext_u8(d6u8, d7u8, 5);
+ d29u8 = vext_u8(d9u8, d10u8, 5);
+
+ q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
+ q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
+ q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
+ q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
+ q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
+ q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+
+ d20u8 = vext_u8(d7u8, d8u8, 5);
+ d21u8 = vext_u8(d10u8, d11u8, 5);
+ d22u8 = vext_u8(d6u8, d7u8, 2);
+ d23u8 = vext_u8(d9u8, d10u8, 2);
+ d24u8 = vext_u8(d7u8, d8u8, 2);
+ d25u8 = vext_u8(d10u8, d11u8, 2);
+ d26u8 = vext_u8(d6u8, d7u8, 3);
+ d27u8 = vext_u8(d9u8, d10u8, 3);
+ d28u8 = vext_u8(d7u8, d8u8, 3);
+ d29u8 = vext_u8(d10u8, d11u8, 3);
+
+ q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
+ q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
+ q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
+
+ q10u16 = vmull_u8(d26u8, d3u8);
+ q11u16 = vmull_u8(d27u8, d3u8);
+ q12u16 = vmull_u8(d28u8, d3u8);
+ q15u16 = vmull_u8(d29u8, d3u8);
+
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+ q15s16 = vreinterpretq_s16_u16(q15u16);
+
+ q6s16 = vqaddq_s16(q6s16, q10s16);
+ q8s16 = vqaddq_s16(q8s16, q11s16);
+ q7s16 = vqaddq_s16(q7s16, q12s16);
+ q9s16 = vqaddq_s16(q9s16, q15s16);
+
+ d6u8 = vqrshrun_n_s16(q6s16, 7);
+ d7u8 = vqrshrun_n_s16(q7s16, 7);
+ d8u8 = vqrshrun_n_s16(q8s16, 7);
+ d9u8 = vqrshrun_n_s16(q9s16, 7);
+
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+ vst1q_u8(dst, q3u8);
+ dst += dst_pitch;
+ vst1q_u8(dst, q4u8);
+ dst += dst_pitch;
+ }
+ return;
+ }
+
+ src = src_ptr - 2 - src_pixels_per_line * 2;
+ tmpp = tmp;
+ for (i = 0; i < 7; i++) {
+ d6u8 = vld1_u8(src);
+ d7u8 = vld1_u8(src + 8);
+ d8u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+ d9u8 = vld1_u8(src);
+ d10u8 = vld1_u8(src + 8);
+ d11u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+ d12u8 = vld1_u8(src);
+ d13u8 = vld1_u8(src + 8);
+ d14u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+
+ __builtin_prefetch(src);
+ __builtin_prefetch(src + src_pixels_per_line);
+ __builtin_prefetch(src + src_pixels_per_line * 2);
+
+ q8u16 = vmull_u8(d6u8, d0u8);
+ q9u16 = vmull_u8(d7u8, d0u8);
+ q10u16 = vmull_u8(d9u8, d0u8);
+ q11u16 = vmull_u8(d10u8, d0u8);
+ q12u16 = vmull_u8(d12u8, d0u8);
+ q13u16 = vmull_u8(d13u8, d0u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 1);
+ d29u8 = vext_u8(d9u8, d10u8, 1);
+ d30u8 = vext_u8(d12u8, d13u8, 1);
+ q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+ q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
+ d28u8 = vext_u8(d7u8, d8u8, 1);
+ d29u8 = vext_u8(d10u8, d11u8, 1);
+ d30u8 = vext_u8(d13u8, d14u8, 1);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+ q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
+ q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 4);
+ d29u8 = vext_u8(d9u8, d10u8, 4);
+ d30u8 = vext_u8(d12u8, d13u8, 4);
+ q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+ q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
+ d28u8 = vext_u8(d7u8, d8u8, 4);
+ d29u8 = vext_u8(d10u8, d11u8, 4);
+ d30u8 = vext_u8(d13u8, d14u8, 4);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+ q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
+ q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 5);
+ d29u8 = vext_u8(d9u8, d10u8, 5);
+ d30u8 = vext_u8(d12u8, d13u8, 5);
+ q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+ q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
+ d28u8 = vext_u8(d7u8, d8u8, 5);
+ d29u8 = vext_u8(d10u8, d11u8, 5);
+ d30u8 = vext_u8(d13u8, d14u8, 5);
+ q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+ q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
+ q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 2);
+ d29u8 = vext_u8(d9u8, d10u8, 2);
+ d30u8 = vext_u8(d12u8, d13u8, 2);
+ q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+ q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
+ d28u8 = vext_u8(d7u8, d8u8, 2);
+ d29u8 = vext_u8(d10u8, d11u8, 2);
+ d30u8 = vext_u8(d13u8, d14u8, 2);
+ q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+ q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
+ q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 3);
+ d29u8 = vext_u8(d9u8, d10u8, 3);
+ d30u8 = vext_u8(d12u8, d13u8, 3);
+ d15u8 = vext_u8(d7u8, d8u8, 3);
+ d31u8 = vext_u8(d10u8, d11u8, 3);
+ d6u8 = vext_u8(d13u8, d14u8, 3);
+ q4u16 = vmull_u8(d28u8, d3u8);
+ q5u16 = vmull_u8(d29u8, d3u8);
+ q6u16 = vmull_u8(d30u8, d3u8);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q10s16 = vqaddq_s16(q10s16, q5s16);
+ q12s16 = vqaddq_s16(q12s16, q6s16);
+
+ q6u16 = vmull_u8(d15u8, d3u8);
+ q7u16 = vmull_u8(d31u8, d3u8);
+ q3u16 = vmull_u8(d6u8, d3u8);
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q13s16 = vreinterpretq_s16_u16(q13u16);
+ q9s16 = vqaddq_s16(q9s16, q6s16);
+ q11s16 = vqaddq_s16(q11s16, q7s16);
+ q13s16 = vqaddq_s16(q13s16, q3s16);
+
+ d6u8 = vqrshrun_n_s16(q8s16, 7);
+ d7u8 = vqrshrun_n_s16(q9s16, 7);
+ d8u8 = vqrshrun_n_s16(q10s16, 7);
+ d9u8 = vqrshrun_n_s16(q11s16, 7);
+ d10u8 = vqrshrun_n_s16(q12s16, 7);
+ d11u8 = vqrshrun_n_s16(q13s16, 7);
+
+ vst1_u8(tmpp, d6u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d7u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d8u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d9u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d10u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d11u8);
+ tmpp += 8;
+ }
+
+ // Second pass: 16x16
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ for (i = 0; i < 2; i++) {
+ dst = dst_ptr + 8 * i;
+ tmpp = tmp + 8 * i;
+ d18u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d19u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d20u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d21u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d22u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ for (j = 0; j < 4; j++) {
+ d23u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d24u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d25u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d26u8 = vld1_u8(tmpp);
+ tmpp += 16;
+
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+
+ vst1_u8(dst, d6u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d7u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d8u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d9u8);
+ dst += dst_pitch;
+ }
+ }
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.asm
deleted file mode 100644
index e3b48327d3f..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.asm
+++ /dev/null
@@ -1,276 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_variance16x16_neon|
- EXPORT |vp8_variance16x8_neon|
- EXPORT |vp8_variance8x16_neon|
- EXPORT |vp8_variance8x8_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp8_variance16x16_neon| PROC
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- mov r12, #8
-
-variance16x16_neon_loop
- vld1.8 {q0}, [r0], r1 ;Load up source and reference
- vld1.8 {q2}, [r2], r3
- vld1.8 {q1}, [r0], r1
- vld1.8 {q3}, [r2], r3
-
- vsubl.u8 q11, d0, d4 ;calculate diff
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
- ;the results into the elements of the destination vector. The explanation
- ;in ARM guide is wrong.
- vpadal.s16 q8, q11 ;calculate sum
- vmlal.s16 q9, d22, d22 ;calculate sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne variance16x16_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- ldr r12, [sp] ;load *sse from stack
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- ;vmov.32 r0, d0[0] ;this instruction costs a lot
- ;vmov.32 r1, d1[0]
- ;mul r0, r0, r0
- ;str r1, [r12]
- ;sub r0, r1, r0, lsr #8
-
- ; while sum is signed, sum * sum is always positive and must be treated as
- ; unsigned to avoid propagating the sign bit.
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [r12] ;store sse
- vshr.u32 d10, d10, #8
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- bx lr
-
- ENDP
-
-;================================
-;unsigned int vp8_variance16x8_c(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *sse)
-|vp8_variance16x8_neon| PROC
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- mov r12, #4
-
-variance16x8_neon_loop
- vld1.8 {q0}, [r0], r1 ;Load up source and reference
- vld1.8 {q2}, [r2], r3
- vld1.8 {q1}, [r0], r1
- vld1.8 {q3}, [r2], r3
-
- vsubl.u8 q11, d0, d4 ;calculate diff
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vpadal.s16 q8, q11 ;calculate sum
- vmlal.s16 q9, d22, d22 ;calculate sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne variance16x8_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- ldr r12, [sp] ;load *sse from stack
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [r12] ;store sse
- vshr.u32 d10, d10, #7
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- bx lr
-
- ENDP
-
-;=================================
-;unsigned int vp8_variance8x16_c(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *sse)
-
-|vp8_variance8x16_neon| PROC
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- mov r12, #8
-
-variance8x16_neon_loop
- vld1.8 {d0}, [r0], r1 ;Load up source and reference
- vld1.8 {d4}, [r2], r3
- vld1.8 {d2}, [r0], r1
- vld1.8 {d6}, [r2], r3
-
- vsubl.u8 q11, d0, d4 ;calculate diff
- vsubl.u8 q12, d2, d6
-
- vpadal.s16 q8, q11 ;calculate sum
- vmlal.s16 q9, d22, d22 ;calculate sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
-
- bne variance8x16_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- ldr r12, [sp] ;load *sse from stack
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [r12] ;store sse
- vshr.u32 d10, d10, #7
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- bx lr
-
- ENDP
-
-;==================================
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp8_variance8x8_neon| PROC
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- mov r12, #2
-
-variance8x8_neon_loop
- vld1.8 {d0}, [r0], r1 ;Load up source and reference
- vld1.8 {d4}, [r2], r3
- vld1.8 {d1}, [r0], r1
- vld1.8 {d5}, [r2], r3
- vld1.8 {d2}, [r0], r1
- vld1.8 {d6}, [r2], r3
- vld1.8 {d3}, [r0], r1
- vld1.8 {d7}, [r2], r3
-
- vsubl.u8 q11, d0, d4 ;calculate diff
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vpadal.s16 q8, q11 ;calculate sum
- vmlal.s16 q9, d22, d22 ;calculate sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne variance8x8_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- ldr r12, [sp] ;load *sse from stack
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [r12] ;store sse
- vshr.u32 d10, d10, #6
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- bx lr
-
- ENDP
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c
new file mode 100644
index 00000000000..afd2dc3d1e2
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#ifdef _MSC_VER
+#define __builtin_prefetch(x)
+#endif
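+// MSVC has no __builtin_prefetch, so the prefetch hints become no-ops there.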
+
+unsigned int vp8_variance16x16_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 8; i++) {
+ q0u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q1u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ __builtin_prefetch(src_ptr);
+
+ q2u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q3u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ __builtin_prefetch(ref_ptr);
+
+ q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+ d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
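+ // variance = sse - sum^2 / 256: sum^2 is formed as a 64-bit product and
+ // the division by the 16x16 = 256 pixels becomes a right shift by 8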
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance16x8_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 4; i++) { // variance16x8_neon_loop
+ q0u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q1u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ __builtin_prefetch(src_ptr);
+
+ q2u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q3u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ __builtin_prefetch(ref_ptr);
+
+ q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+ d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
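+ // 16x8 covers 128 pixels, so here sum^2 is divided by 2^7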
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance8x16_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d2u8, d4u8, d6u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64;
+ uint16x8_t q11u16, q12u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 8; i++) { // variance8x16_neon_loop
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ __builtin_prefetch(src_ptr);
+
+ d4u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d6u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ __builtin_prefetch(ref_ptr);
+
+ q11u16 = vsubl_u8(d0u8, d4u8);
+ q12u16 = vsubl_u8(d2u8, d6u8);
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+ d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance8x8_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 2; i++) { // variance8x8_neon_loop
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d1u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d3u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+
+ d4u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d5u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d6u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d7u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ q11u16 = vsubl_u8(d0u8, d4u8);
+ q12u16 = vsubl_u8(d1u8, d5u8);
+ q13u16 = vsubl_u8(d2u8, d6u8);
+ q14u16 = vsubl_u8(d3u8, d7u8);
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+ d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
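Each NEON variance kernel above reduces to the scalar identity variance = SSE - sum^2 / N, where N is the block's pixel count; the final right shifts of 8, 7, 7 and 6 encode N = 256 (16x16), 128 (16x8), 128 (8x16) and 64 (8x8). A minimal scalar sketch of that computation, written for illustration and assuming 8-bit planes (not part of the patch):

#include <stdint.h>

/* Scalar model of the NEON kernels: variance = sse - sum*sum/N,
   with N expressed as a shift (8 -> 256, 7 -> 128, 6 -> 64 pixels). */
static unsigned int variance_ref(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 int w, int h, int shift, unsigned int *sse) {
    int64_t sum = 0;
    uint64_t sq = 0;
    int r, c;
    for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
            const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
            sum += d;
            sq += (uint64_t)(d * d);
        }
    }
    *sse = (unsigned int)sq;
    return (unsigned int)(sq - (uint64_t)((sum * sum) >> shift));
}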
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
index 9d22c52521c..adc5b7e3a78 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -31,11 +31,12 @@ bilinear_taps_coeff
|vp8_sub_pixel_variance16x16_neon_func| PROC
push {r4-r6, lr}
+ vpush {d8-d15}
adr r12, bilinear_taps_coeff
- ldr r4, [sp, #16] ;load *dst_ptr from stack
- ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
- ldr r6, [sp, #24] ;load *sse from stack
+ ldr r4, [sp, #80] ;load *dst_ptr from stack
+ ldr r5, [sp, #84] ;load dst_pixels_per_line from stack
+ ldr r6, [sp, #88] ;load *sse from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_bfilter16x16_only
@@ -416,6 +417,7 @@ sub_pixel_variance16x16_neon_loop
add sp, sp, #528
vmov.32 r0, d0[0] ;return
+ vpop {d8-d15}
pop {r4-r6,pc}
ENDP
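The vpush {d8-d15}/vpop {d8-d15} pairs added across these assembly routines preserve the NEON registers that the ARM AAPCS designates callee-saved (d8-d15). Pushing eight 64-bit registers lowers sp by 64 bytes, which is why every stack-relative argument load in these hunks moves by exactly 64 (#16 -> #80, #4 -> #68, and so on). A self-checking C sketch of that arithmetic (illustrative only):

#include <assert.h>

int main(void) {
    const int saved_d_regs = 8;                             /* d8..d15 */
    const int bytes_per_d_reg = 8;                          /* 64-bit each */
    const int vpush_bytes = saved_d_regs * bytes_per_d_reg; /* 64 */
    assert(16 + vpush_bytes == 80);  /* *dst_ptr: [sp, #16] -> [sp, #80] */
    assert(4 + vpush_bytes == 68);   /* *sse:     [sp, #4]  -> [sp, #68] */
    return 0;
}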
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
index 155be4fc54b..b0829af7547 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -31,9 +31,10 @@
;================================================
|vp8_variance_halfpixvar16x16_h_neon| PROC
push {lr}
+ vpush {d8-d15}
mov r12, #4 ;loop counter
- ldr lr, [sp, #4] ;load *sse from stack
+ ldr lr, [sp, #68] ;load *sse from stack
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
@@ -116,6 +117,8 @@ vp8_filt_fpo16x16s_4_0_loop_neon
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
+
+ vpop {d8-d15}
pop {pc}
ENDP
@@ -131,11 +134,12 @@ vp8_filt_fpo16x16s_4_0_loop_neon
;================================================
|vp8_variance_halfpixvar16x16_v_neon| PROC
push {lr}
+ vpush {d8-d15}
mov r12, #4 ;loop counter
vld1.u8 {q0}, [r0], r1 ;load src data
- ldr lr, [sp, #4] ;load *sse from stack
+ ldr lr, [sp, #68] ;load *sse from stack
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
@@ -212,6 +216,8 @@ vp8_filt_spo16x16s_0_4_loop_neon
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
+
+ vpop {d8-d15}
pop {pc}
ENDP
@@ -227,10 +233,11 @@ vp8_filt_spo16x16s_0_4_loop_neon
;================================================
|vp8_variance_halfpixvar16x16_hv_neon| PROC
push {lr}
+ vpush {d8-d15}
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- ldr lr, [sp, #4] ;load *sse from stack
+ ldr lr, [sp, #68] ;load *sse from stack
 vmov.i8 q13, #0 ;q13 - sum
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
@@ -331,6 +338,8 @@ vp8_filt16x16s_4_4_loop_neon
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
+
+ vpop {d8-d15}
pop {pc}
ENDP
@@ -349,10 +358,11 @@ vp8_filt16x16s_4_4_loop_neon
|vp8_sub_pixel_variance16x16s_neon| PROC
push {r4, lr}
+ vpush {d8-d15}
- ldr r4, [sp, #8] ;load *dst_ptr from stack
- ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
- ldr lr, [sp, #16] ;load *sse from stack
+ ldr r4, [sp, #72] ;load *dst_ptr from stack
+ ldr r12, [sp, #76] ;load dst_pixels_per_line from stack
+ ldr lr, [sp, #80] ;load *sse from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_bfilter16x16s_only
@@ -566,6 +576,7 @@ sub_pixel_variance16x16s_neon_loop
add sp, sp, #256
vmov.32 r0, d0[0] ;return
+ vpop {d8-d15}
pop {r4, pc}
ENDP
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
index f6b6847537f..9d9f9e0772a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -26,11 +26,12 @@
|vp8_sub_pixel_variance8x8_neon| PROC
push {r4-r5, lr}
+ vpush {d8-d15}
adr r12, bilinear_taps_coeff
- ldr r4, [sp, #12] ;load *dst_ptr from stack
- ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
- ldr lr, [sp, #20] ;load *sse from stack
+ ldr r4, [sp, #76] ;load *dst_ptr from stack
+ ldr r5, [sp, #80] ;load dst_pixels_per_line from stack
+ ldr lr, [sp, #84] ;load *sse from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
@@ -210,6 +211,8 @@ sub_pixel_variance8x8_neon_loop
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
+
+ vpop {d8-d15}
pop {r4-r5, pc}
ENDP
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/reconintra_arm.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/reconintra_arm.c
index 2874896e8e2..e55a33cbb18 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/reconintra_arm.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/reconintra_arm.c
@@ -14,7 +14,7 @@
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"
-#if HAVE_NEON
+#if HAVE_NEON_ASM
extern void vp8_build_intra_predictors_mby_neon_func(
unsigned char *y_buffer,
unsigned char *ypred_ptr,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c
index 467a509420e..e3f7083b611 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c
@@ -95,7 +95,7 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
#endif /* HAVE_MEDIA */
-#if HAVE_NEON
+#if HAVE_NEON_ASM
extern unsigned int vp8_sub_pixel_variance16x16_neon_func
(
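The HAVE_NEON -> HAVE_NEON_ASM changes in reconintra_arm.c and variance_arm.c follow from this patch's split of the NEON code into hand-written assembly and C intrinsics: the extern declarations that shim the .asm routines are now compiled only when the assembly implementations are enabled, and intrinsics builds link the new *_neon.c files instead. A hedged preprocessor sketch (the macro values are illustrative; real builds get them from vpx_config.h):

#define HAVE_NEON 1
#define HAVE_NEON_ASM 0 /* intrinsics-only build: asm externs compiled out */

#if HAVE_NEON_ASM
extern void vp8_build_intra_predictors_mby_neon_func(); /* .asm shim */
#elif HAVE_NEON
/* the C intrinsics implementation is linked in its place */
#endif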
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h
index f7ff5776352..ea1a6a4adfd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h
@@ -9,8 +9,8 @@
*/
-#ifndef __INC_BLOCKD_H
-#define __INC_BLOCKD_H
+#ifndef VP8_COMMON_BLOCKD_H_
+#define VP8_COMMON_BLOCKD_H_
void vpx_log(const char *format, ...);
@@ -20,6 +20,10 @@ void vpx_log(const char *format, ...);
#include "treecoder.h"
#include "vpx_ports/mem.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/*#define DCPRED 1*/
#define DCPREDSIMTHRESH 0
#define DCPREDCNTTHRESH 3
@@ -297,4 +301,8 @@ typedef struct macroblockd
extern void vp8_build_block_doffsets(MACROBLOCKD *x);
extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
-#endif /* __INC_BLOCKD_H */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_BLOCKD_H_
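The remaining header diffs in this patch all repeat the pattern shown here for blockd.h: legacy guards such as __INC_BLOCKD_H (technically reserved to the implementation because of the leading underscores) become path-derived VP8_COMMON_*_H_ guards, and the declarations gain an extern "C" wrapper so C++ translation units see C linkage. The full shape, on a hypothetical header (illustrative only):

#ifndef VP8_COMMON_EXAMPLE_H_
#define VP8_COMMON_EXAMPLE_H_

#ifdef __cplusplus
extern "C" {
#endif

void vp8_example(void); /* keeps C linkage when included from C++ */

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // VP8_COMMON_EXAMPLE_H_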
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/coefupdateprobs.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/coefupdateprobs.h
index 9e194dc9a4d..d96a19e7478 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/coefupdateprobs.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/coefupdateprobs.h
@@ -8,6 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP8_COMMON_COEFUPDATEPROBS_H_
+#define VP8_COMMON_COEFUPDATEPROBS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
/* Update probabilities for the nodes in the token entropy tree.
Generated file included by entropy.c */
@@ -183,3 +189,9 @@ const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTE
},
},
};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_COEFUPDATEPROBS_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/common.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/common.h
index 2cc1c544cdf..17262d6983c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/common.h
@@ -9,8 +9,8 @@
*/
-#ifndef common_h
-#define common_h 1
+#ifndef VP8_COMMON_COMMON_H_
+#define VP8_COMMON_COMMON_H_
#include <assert.h>
@@ -18,6 +18,13 @@
#include "vpx_mem/vpx_mem.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
/* Only need this for fixed-size arrays, for structs just assign. */
#define vp8_copy( Dest, Src) { \
@@ -37,4 +44,8 @@
#define vp8_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
-#endif /* common_h */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_COMMON_H_
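The MIN/MAX macros added to common.h parenthesize each operand and the whole expansion, so surrounding operators cannot re-associate the comparison; the usual caveat that macro arguments with side effects are evaluated twice still applies. A two-line illustration (not from the patch):

#define MIN(x, y) (((x) < (y)) ? (x) : (y))

int main(void) {
    int a = MIN(1 + 2, 4); /* operand parens: (1 + 2) compared as a unit */
    int b = 2 * MIN(3, 4); /* outer parens: the result is multiplied, b == 6 */
    return a + b - 9;      /* 3 + 6 - 9 == 0 */
}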
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/default_coef_probs.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/default_coef_probs.h
index 0d195636bcb..4d69e4be664 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/default_coef_probs.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/default_coef_probs.h
@@ -8,6 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_
+#define VP8_COMMON_DEFAULT_COEF_PROBS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
/*Generated file, included by entropy.c*/
@@ -186,3 +192,9 @@ static const vp8_prob default_coef_probs [BLOCK_TYPES]
}
}
};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_DEFAULT_COEF_PROBS_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.h
index 5389bc1de4e..a90bab4bac2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.h
@@ -9,12 +9,16 @@
*/
-#ifndef __INC_ENTROPY_H
-#define __INC_ENTROPY_H
+#ifndef VP8_COMMON_ENTROPY_H_
+#define VP8_COMMON_ENTROPY_H_
#include "treecoder.h"
#include "blockd.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/* Coefficient token alphabet */
#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
@@ -98,4 +102,8 @@ extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]);
extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
void vp8_coef_tree_initialize(void);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_ENTROPY_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.h
index 1df0f641e49..81bdfc4b8bd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.h
@@ -9,12 +9,16 @@
*/
-#ifndef __INC_ENTROPYMODE_H
-#define __INC_ENTROPYMODE_H
+#ifndef VP8_COMMON_ENTROPYMODE_H_
+#define VP8_COMMON_ENTROPYMODE_H_
#include "onyxc_int.h"
#include "treecoder.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
typedef enum
{
SUBMVREF_NORMAL,
@@ -77,4 +81,8 @@ void vp8_init_mbmode_probs(VP8_COMMON *x);
void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]);
void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_ENTROPYMODE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymv.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymv.h
index 2db1e385bae..42840d58ad2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymv.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymv.h
@@ -9,11 +9,15 @@
*/
-#ifndef __INC_ENTROPYMV_H
-#define __INC_ENTROPYMV_H
+#ifndef VP8_COMMON_ENTROPYMV_H_
+#define VP8_COMMON_ENTROPYMV_H_
#include "treecoder.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
enum
{
mv_max = 1023, /* max absolute value of a MV component */
@@ -41,4 +45,8 @@ typedef struct mv_context
extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_ENTROPYMV_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/extend.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/extend.h
index 74a0b177d94..068f4ac5236 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/extend.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/extend.h
@@ -9,11 +9,15 @@
*/
-#ifndef __INC_EXTEND_H
-#define __INC_EXTEND_H
+#ifndef VP8_COMMON_EXTEND_H_
+#define VP8_COMMON_EXTEND_H_
#include "vpx_scale/yv12config.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst);
@@ -22,4 +26,8 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
int srcy, int srcx,
int srch, int srcw);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_EXTEND_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/filter.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/filter.h
index ccda7c8d020..cfba775fce4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/filter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/filter.h
@@ -9,11 +9,15 @@
*/
-#ifndef FILTER_H
-#define FILTER_H
+#ifndef VP8_COMMON_FILTER_H_
+#define VP8_COMMON_FILTER_H_
#include "vpx_ports/mem.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
#define BLOCK_HEIGHT_WIDTH 4
#define VP8_FILTER_WEIGHT 128
#define VP8_FILTER_SHIFT 7
@@ -21,4 +25,8 @@
extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]);
extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_FILTER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/findnearmv.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/findnearmv.h
index c60e463614b..3c8c0506f67 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/findnearmv.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/findnearmv.h
@@ -9,14 +9,18 @@
*/
-#ifndef __INC_FINDNEARMV_H
-#define __INC_FINDNEARMV_H
+#ifndef VP8_COMMON_FINDNEARMV_H_
+#define VP8_COMMON_FINDNEARMV_H_
#include "mv.h"
#include "blockd.h"
#include "modecont.h"
#include "treecoder.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp,
const int *ref_frame_sign_bias)
@@ -179,4 +183,8 @@ static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi
return (cur_mb->bmi + b - 4)->as_mode;
}
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_FINDNEARMV_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/header.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/header.h
index 3e98eeb3c34..e27bca16bd7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/header.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/header.h
@@ -9,8 +9,12 @@
*/
-#ifndef __INC_HEADER_H
-#define __INC_HEADER_H
+#ifndef VP8_COMMON_HEADER_H_
+#define VP8_COMMON_HEADER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
/* 24 bits total */
typedef struct
@@ -40,4 +44,8 @@ typedef struct
#endif
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_HEADER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/invtrans.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/invtrans.h
index 9262640d570..affe57e3d6e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/invtrans.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/invtrans.h
@@ -9,8 +9,8 @@
*/
-#ifndef __INC_INVTRANS_H
-#define __INC_INVTRANS_H
+#ifndef VP8_COMMON_INVTRANS_H_
+#define VP8_COMMON_INVTRANS_H_
#include "vpx_config.h"
#include "vp8_rtcd.h"
@@ -21,6 +21,10 @@
#include "vpx_mem/vpx_mem.h"
#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+
static void eob_adjust(char *eobs, short *diff)
{
/* eob adjust.... the idct can only skip if both the dc and eob are zero */
@@ -59,4 +63,8 @@ static void vp8_inverse_transform_mby(MACROBLOCKD *xd)
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_INVTRANS_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c
index 19857a7e9fb..7a07e76fc41 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c
@@ -15,7 +15,6 @@
#include "onyxc_int.h"
#include "vpx_mem/vpx_mem.h"
-typedef unsigned char uc;
static void lf_init_lut(loop_filter_info_n *lfi)
{
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.h
index 1e47f349073..20a6bd375b6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.h
@@ -9,13 +9,17 @@
*/
-#ifndef loopfilter_h
-#define loopfilter_h
+#ifndef VP8_COMMON_LOOPFILTER_H_
+#define VP8_COMMON_LOOPFILTER_H_
#include "vpx_ports/mem.h"
#include "vpx_config.h"
#include "vp8_rtcd.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
#define MAX_LOOP_FILTER 63
/* fraction of total macroblock rows to be used in fast filter level picking */
/* has to be > 2 */
@@ -102,4 +106,8 @@ void vp8_loop_filter_row_simple(struct VP8Common *cm,
int mb_row, int post_ystride, int post_uvstride,
unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_LOOPFILTER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/modecont.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/modecont.h
index 24db88295f9..ff34c33c557 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/modecont.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/modecont.h
@@ -9,9 +9,17 @@
*/
-#ifndef __INC_MODECONT_H
-#define __INC_MODECONT_H
+#ifndef VP8_COMMON_MODECONT_H_
+#define VP8_COMMON_MODECONT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
extern const int vp8_mode_contexts[6][4];
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_MODECONT_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mv.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/mv.h
index b3f919db2bd..111ccd63c72 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mv.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mv.h
@@ -9,10 +9,14 @@
*/
-#ifndef __INC_MV_H
-#define __INC_MV_H
+#ifndef VP8_COMMON_MV_H_
+#define VP8_COMMON_MV_H_
#include "vpx/vpx_integer.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
typedef struct
{
short row;
@@ -25,4 +29,8 @@ typedef union int_mv
MV as_mv;
} int_mv; /* facilitates faster equality tests and copies */
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_MV_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h
index 30c4cbbca63..119e40cdc9c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h
@@ -9,8 +9,8 @@
*/
-#ifndef __INC_VP8_H
-#define __INC_VP8_H
+#ifndef VP8_COMMON_ONYX_H_
+#define VP8_COMMON_ONYX_H_
#ifdef __cplusplus
extern "C"
@@ -39,8 +39,8 @@ extern "C"
typedef enum
{
- USAGE_STREAM_FROM_SERVER = 0x0,
- USAGE_LOCAL_FILE_PLAYBACK = 0x1,
+ USAGE_LOCAL_FILE_PLAYBACK = 0x0,
+ USAGE_STREAM_FROM_SERVER = 0x1,
USAGE_CONSTRAINED_QUALITY = 0x2,
USAGE_CONSTANT_QUALITY = 0x3
} END_USAGE;
@@ -267,4 +267,4 @@ extern "C"
}
#endif
-#endif
+#endif // VP8_COMMON_ONYX_H_
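The value swap in END_USAGE above is ABI-visible: callers that hard-coded 0 to mean "stream from server" now get local-file-playback semantics, while code using the enumerator names is unaffected. A standalone illustration mirroring the post-patch ordering (hypothetical names):

enum example_end_usage {
    EX_USAGE_LOCAL_FILE_PLAYBACK = 0x0, /* was 0x1 before this patch */
    EX_USAGE_STREAM_FROM_SERVER  = 0x1  /* was 0x0 before this patch */
};

int main(void) {
    return EX_USAGE_STREAM_FROM_SERVER == 0x1 ? 0 : 1; /* 0 after the swap */
}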
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyxc_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyxc_int.h
index e9bb7af26b1..6d89865c600 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyxc_int.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyxc_int.h
@@ -9,8 +9,8 @@
*/
-#ifndef __INC_VP8C_INT_H
-#define __INC_VP8C_INT_H
+#ifndef VP8_COMMON_ONYXC_INT_H_
+#define VP8_COMMON_ONYXC_INT_H_
#include "vpx_config.h"
#include "vp8_rtcd.h"
@@ -26,6 +26,10 @@
#include "header.h"
/*#endif*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
#define MINQ 0
#define MAXQ 127
#define QINDEX_RANGE (MAXQ + 1)
@@ -174,4 +178,8 @@ typedef struct VP8Common
int cpu_caps;
} VP8_COMMON;
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_ONYXC_INT_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyxd.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyxd.h
index 97c81c130a0..e37b29f32cf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyxd.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyxd.h
@@ -9,8 +9,8 @@
*/
-#ifndef __INC_VP8D_H
-#define __INC_VP8D_H
+#ifndef VP8_COMMON_ONYXD_H_
+#define VP8_COMMON_ONYXD_H_
/* Create/destroy static data structures. */
@@ -60,4 +60,4 @@ extern "C"
#endif
-#endif
+#endif // VP8_COMMON_ONYXD_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c
index dd998f16e8a..8e546d5bfbe 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c
@@ -71,11 +71,6 @@ static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
};
#endif
-static const short kernel5[] =
-{
- 1, 1, 4, 1, 1
-};
-
const short vp8_rv[] =
{
8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
@@ -308,13 +303,14 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
{
d[r&15] = (rv2[r&127] + sum + s[0]) >> 4;
}
-
- s[-8*pitch] = d[(r-8)&15];
+ if (r >= 8)
+ s[-8*pitch] = d[(r-8)&15];
s += pitch;
}
}
}
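The guard added above fixes an out-of-bounds store in vp8_mbpost_proc_down_c: the routine stages filtered values in a 16-entry ring buffer and writes each one back eight rows behind the cursor, so for r < 8 the old unconditional s[-8*pitch] store landed above the top of the plane. A minimal sketch of the deferred-writeback pattern with the same guard (simplified; the real filter computes running sums rather than copying):

#include <string.h>

/* Stage each row's result in d[], flush it 8 rows later; the r >= 8 test
   keeps the delayed store inside the plane, as in the patched code. */
static void deferred_writeback(unsigned char *col, int pitch, int rows) {
    unsigned char d[16];
    int r;
    memset(d, 0, sizeof(d));
    for (r = 0; r < rows; ++r) {
        d[r & 15] = (unsigned char)(col[r * pitch] / 2); /* stand-in filter */
        if (r >= 8)
            col[(r - 8) * pitch] = d[(r - 8) & 15];
    }
}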
+#if CONFIG_POSTPROC
static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
int q)
{
@@ -387,6 +383,7 @@ void vp8_deblock(VP8_COMMON *cm,
vp8_yv12_copy_frame(source, post);
}
}
+#endif
#if !(CONFIG_TEMPORAL_DENOISING)
void vp8_de_noise(VP8_COMMON *cm,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.h
index 495a2c906f6..33d0a7f025b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.h
@@ -9,8 +9,8 @@
*/
-#ifndef POSTPROC_H
-#define POSTPROC_H
+#ifndef VP8_COMMON_POSTPROC_H_
+#define VP8_COMMON_POSTPROC_H_
#include "vpx_ports/mem.h"
struct postproc_state
@@ -26,6 +26,10 @@ struct postproc_state
};
#include "onyxc_int.h"
#include "ppflags.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
vp8_ppflags_t *flags);
@@ -47,4 +51,8 @@ void vp8_deblock(struct VP8Common *oci,
#define MFQE_PRECISION 4
void vp8_multiframe_quality_enhance(struct VP8Common *cm);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_POSTPROC_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h
index 665e21fd965..768224aad5e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h
@@ -9,8 +9,12 @@
*/
-#ifndef __INC_PPFLAGS_H
-#define __INC_PPFLAGS_H
+#ifndef VP8_COMMON_PPFLAGS_H_
+#define VP8_COMMON_PPFLAGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
enum
{
VP8D_NOFILTERING = 0,
@@ -38,4 +42,8 @@ typedef struct
int display_mv_flag;
} vp8_ppflags_t;
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_PPFLAGS_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/pragmas.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/pragmas.h
index 99fee5ae232..329cc8275c2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/pragmas.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/pragmas.h
@@ -8,8 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP8_COMMON_PRAGMAS_H_
+#define VP8_COMMON_PRAGMAS_H_
-
+#ifdef __cplusplus
+extern "C" {
+#endif
#ifdef __INTEL_COMPILER
#pragma warning(disable:997 1011 170)
@@ -17,3 +21,9 @@
#ifdef _MSC_VER
#pragma warning(disable:4799)
#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_PRAGMAS_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/quant_common.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/quant_common.h
index cb64d8eb8d8..700b5e6d726 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/quant_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/quant_common.h
@@ -8,14 +8,27 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP8_COMMON_QUANT_COMMON_H_
+#define VP8_COMMON_QUANT_COMMON_H_
+
#include "string.h"
#include "blockd.h"
#include "onyxc_int.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
extern int vp8_ac_yquant(int QIndex);
extern int vp8_dc_quant(int QIndex, int Delta);
extern int vp8_dc2quant(int QIndex, int Delta);
extern int vp8_ac2quant(int QIndex, int Delta);
extern int vp8_dc_uv_quant(int QIndex, int Delta);
extern int vp8_ac_uv_quant(int QIndex, int Delta);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_QUANT_COMMON_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.h
index 233c02e5ba5..ba979b9664a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.h
@@ -9,8 +9,12 @@
*/
-#ifndef __INC_RECONINTER_H
-#define __INC_RECONINTER_H
+#ifndef VP8_COMMON_RECONINTER_H_
+#define VP8_COMMON_RECONINTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
@@ -32,4 +36,8 @@ extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_RECONINTER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra4x4.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra4x4.h
index d2b0d43461e..ed59c9edd4c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra4x4.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra4x4.h
@@ -9,10 +9,14 @@
*/
-#ifndef __INC_RECONINTRA4x4_H
-#define __INC_RECONINTRA4x4_H
+#ifndef VP8_COMMON_RECONINTRA4X4_H_
+#define VP8_COMMON_RECONINTRA4X4_H_
#include "vp8/common/blockd.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
static void intra_prediction_down_copy(MACROBLOCKD *xd,
unsigned char *above_right_src)
{
@@ -29,4 +33,8 @@ static void intra_prediction_down_copy(MACROBLOCKD *xd,
*dst_ptr2 = *src_ptr;
}
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_RECONINTRA4X4_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl
new file mode 100644
index 00000000000..cbfd76a8d16
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl
@@ -0,0 +1,561 @@
+sub vp8_common_forward_decls() {
+print <<EOF
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+forward_decls qw/vp8_common_forward_decls/;
+
+#
+# system state
+#
+add_proto qw/void vp8_clear_system_state/, "";
+specialize qw/vp8_clear_system_state mmx/;
+$vp8_clear_system_state_mmx=vpx_reset_mmx_state;
+
+#
+# Dequant
+#
+add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc";
+specialize qw/vp8_dequantize_b mmx media neon/;
+$vp8_dequantize_b_media=vp8_dequantize_b_v6;
+
+add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride";
+specialize qw/vp8_dequant_idct_add mmx media neon dspr2/;
+$vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6;
+$vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2;
+
+add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
+specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon_asm dspr2/;
+$vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6;
+$vp8_dequant_idct_add_y_block_neon_asm=vp8_dequant_idct_add_y_block_neon;
+$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
+
+add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
+specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon_asm dspr2/;
+$vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6;
+$vp8_dequant_idct_add_uv_block_neon_asm=vp8_dequant_idct_add_uv_block_neon;
+$vp8_dequant_idct_add_uv_block_dspr2=vp8_dequant_idct_add_uv_block_dspr2;
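In this new rtcd_defs.pl, add_proto declares a function's C signature, specialize lists the ISA extensions that provide optimized versions, and the $<name>_<arch>=<symbol> assignments rebind an extension slot to a differently named symbol (self-assignments such as the dspr2 lines are redundant but harmless). At build time the generator turns each entry into a dispatch slot plus a setup routine; a hypothetical C reduction of what that yields for vp8_dequantize_b (simplified, not the exact generated code):

struct blockd; /* opaque here */

extern void vp8_dequantize_b_c(struct blockd *, short *dqc);
extern void vp8_dequantize_b_neon(struct blockd *, short *dqc);

/* RTCD slot: defaults to the C version, rebound during runtime setup. */
void (*vp8_dequantize_b)(struct blockd *, short *dqc) = vp8_dequantize_b_c;

void setup_rtcd_example(int cpu_has_neon) {
    if (cpu_has_neon)
        vp8_dequantize_b = vp8_dequantize_b_neon; /* from "specialize ... neon" */
}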
+
+#
+# Loopfilter
+#
+add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2/;
+$vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6;
+$vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
+
+add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_bv mmx sse2 media neon_asm dspr2/;
+$vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
+$vp8_loop_filter_bv_neon_asm=vp8_loop_filter_bv_neon;
+$vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
+
+add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2/;
+$vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6;
+$vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
+
+add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_bh mmx sse2 media neon_asm dspr2/;
+$vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
+$vp8_loop_filter_bh_neon_asm=vp8_loop_filter_bh_neon;
+$vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
+
+
+add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon_asm/;
+$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
+$vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
+$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
+$vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6;
+$vp8_loop_filter_simple_mbv_neon_asm=vp8_loop_filter_mbvs_neon;
+
+add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon/;
+$vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
+$vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx;
+$vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
+$vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6;
+$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
+
+add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon_asm/;
+$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
+$vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
+$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
+$vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6;
+$vp8_loop_filter_simple_bv_neon_asm=vp8_loop_filter_bvs_neon;
+
+add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon/;
+$vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
+$vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx;
+$vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
+$vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6;
+$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
+
+#
+# IDCT
+#
+#idct16
+add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride";
+specialize qw/vp8_short_idct4x4llm mmx media neon dspr2/;
+$vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual;
+$vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2;
+
+#iwalsh1
+add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *output";
+specialize qw/vp8_short_inv_walsh4x4_1 dspr2/;
+$vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2;
+# no asm yet
+
+#iwalsh16
+add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output";
+specialize qw/vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2/;
+$vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6;
+$vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2;
+
+#idct1_scalar_add
+add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride";
+specialize qw/vp8_dc_only_idct_add mmx media neon dspr2/;
+$vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6;
+$vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2;
+
+#
+# RECON
+#
+add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2/;
+$vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6;
+$vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2;
+
+add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_copy_mem8x8 mmx media neon dspr2/;
+$vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6;
+$vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2;
+
+add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_copy_mem8x4 mmx media neon dspr2/;
+$vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6;
+$vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
+
+add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride";
+specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3/;
+#TODO: fix assembly for neon
+
+add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride";
+specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3/;
+
+add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left";
+specialize qw/vp8_intra4x4_predict media/;
+$vp8_intra4x4_predict_media=vp8_intra4x4_predict_armv6;
+
+#
+# Postproc
+#
+if (vpx_config("CONFIG_POSTPROC") eq "yes") {
+ add_proto qw/void vp8_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+ specialize qw/vp8_mbpost_proc_down mmx sse2/;
+ $vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm;
+
+ add_proto qw/void vp8_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+ specialize qw/vp8_mbpost_proc_across_ip sse2/;
+ $vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm;
+
+ add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
+ specialize qw/vp8_post_proc_down_and_across_mb_row sse2/;
+
+ add_proto qw/void vp8_plane_add_noise/, "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch";
+ specialize qw/vp8_plane_add_noise mmx sse2/;
+ $vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt;
+
+ add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
+ # no asm yet
+
+ add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
+ # no asm yet
+
+ add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
+ # no asm yet
+
+ add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
+ specialize qw/vp8_filter_by_weight16x16 sse2/;
+
+ add_proto qw/void vp8_filter_by_weight8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
+ specialize qw/vp8_filter_by_weight8x8 sse2/;
+
+ add_proto qw/void vp8_filter_by_weight4x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
+ # no asm yet
+}
+
+#
+# Subpixel
+#
+add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2/;
+$vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6;
+$vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2;
+
+add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2/;
+$vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6;
+$vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2;
+
+add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2/;
+$vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6;
+$vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
+
+add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media neon dspr2/;
+$vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6;
+$vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
+
+add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon/;
+$vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6;
+
+add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon/;
+$vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6;
+
+add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_bilinear_predict8x4 mmx media neon/;
+$vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6;
+
+add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_bilinear_predict4x4 mmx media neon/;
+$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
+
+#
+# Whole-pixel Variance
+#
+add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp8_variance4x4 mmx sse2/;
+$vp8_variance4x4_sse2=vp8_variance4x4_wmt;
+
+add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp8_variance8x8 mmx sse2 media neon/;
+$vp8_variance8x8_sse2=vp8_variance8x8_wmt;
+$vp8_variance8x8_media=vp8_variance8x8_armv6;
+
+add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp8_variance8x16 mmx sse2 neon/;
+$vp8_variance8x16_sse2=vp8_variance8x16_wmt;
+
+add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp8_variance16x8 mmx sse2 neon/;
+$vp8_variance16x8_sse2=vp8_variance16x8_wmt;
+
+add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp8_variance16x16 mmx sse2 media neon/;
+$vp8_variance16x16_sse2=vp8_variance16x16_wmt;
+$vp8_variance16x16_media=vp8_variance16x16_armv6;
+
+#
+# Sub-pixel Variance
+#
+add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
+specialize qw/vp8_sub_pixel_variance4x4 mmx sse2/;
+$vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt;
+
+add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
+specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media neon_asm/;
+$vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt;
+$vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6;
+$vp8_sub_pixel_variance8x8_neon_asm=vp8_sub_pixel_variance8x8_neon;
+
+add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
+specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/;
+$vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt;
+
+add_proto qw/unsigned int vp8_sub_pixel_variance16x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
+specialize qw/vp8_sub_pixel_variance16x8 mmx sse2 ssse3/;
+$vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt;
+
+add_proto qw/unsigned int vp8_sub_pixel_variance16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
+specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon_asm/;
+$vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt;
+$vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6;
+$vp8_sub_pixel_variance16x16_neon_asm=vp8_sub_pixel_variance16x16_neon;
+
+add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon_asm/;
+$vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt;
+$vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6;
+$vp8_variance_halfpixvar16x16_h_neon_asm=vp8_variance_halfpixvar16x16_h_neon;
+
+add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon_asm/;
+$vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt;
+$vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6;
+$vp8_variance_halfpixvar16x16_v_neon_asm=vp8_variance_halfpixvar16x16_v_neon;
+
+add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon_asm/;
+$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
+$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
+$vp8_variance_halfpixvar16x16_hv_neon_asm=vp8_variance_halfpixvar16x16_hv_neon;
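The halfpixvar16x16_{h,v,hv} entries are the (xoffset, yoffset) = (4, 0), (0, 4) and (4, 4) special cases of vp8_sub_pixel_variance16x16: at half-pel the bilinear taps become {64, 64}, so with VP8_FILTER_SHIFT = 7 the prediction collapses to a rounded average of two neighbours. A scalar sketch of the horizontal case (illustrative only):

/* Half-pel horizontal prediction: (a*64 + b*64 + 64) >> 7 == (a+b+1) >> 1. */
static unsigned char half_pel_h(const unsigned char *s, int i) {
    return (unsigned char)((s[i] + s[i + 1] + 1) >> 1);
}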
+
+#
+# Single block SAD
+#
+add_proto qw/unsigned int vp8_sad4x4/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp8_sad4x4 mmx sse2 neon/;
+$vp8_sad4x4_sse2=vp8_sad4x4_wmt;
+
+add_proto qw/unsigned int vp8_sad8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp8_sad8x8 mmx sse2 neon/;
+$vp8_sad8x8_sse2=vp8_sad8x8_wmt;
+
+add_proto qw/unsigned int vp8_sad8x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp8_sad8x16 mmx sse2 neon/;
+$vp8_sad8x16_sse2=vp8_sad8x16_wmt;
+
+add_proto qw/unsigned int vp8_sad16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp8_sad16x8 mmx sse2 neon/;
+$vp8_sad16x8_sse2=vp8_sad16x8_wmt;
+
+add_proto qw/unsigned int vp8_sad16x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp8_sad16x16 mmx sse2 sse3 media neon/;
+$vp8_sad16x16_sse2=vp8_sad16x16_wmt;
+$vp8_sad16x16_media=vp8_sad16x16_armv6;
+
+#
+# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
+#
+add_proto qw/void vp8_sad4x4x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad4x4x3 sse3/;
+
+add_proto qw/void vp8_sad8x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad8x8x3 sse3/;
+
+add_proto qw/void vp8_sad8x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad8x16x3 sse3/;
+
+add_proto qw/void vp8_sad16x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad16x8x3 sse3 ssse3/;
+
+add_proto qw/void vp8_sad16x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad16x16x3 sse3 ssse3/;
+
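The x3 variants above score one source block against 3 reference candidates spaced one pixel apart horizontally (the x8 forms below do the same for 8 candidates, writing unsigned short results), and the x4d forms further down take four independent reference pointers. A scalar model of the x3 shape (written for illustration):

#include <stdlib.h>

static unsigned int sad_ref(const unsigned char *s, int ss,
                            const unsigned char *r, int rs, int w, int h) {
    unsigned int sad = 0;
    int y, x;
    for (y = 0; y < h; ++y)
        for (x = 0; x < w; ++x)
            sad += (unsigned int)abs(s[y * ss + x] - r[y * rs + x]);
    return sad;
}

/* x3 form: the same source against ref, ref + 1, ref + 2. */
static void sad16x16x3_ref(const unsigned char *src, int src_stride,
                           const unsigned char *ref, int ref_stride,
                           unsigned int *sad_array) {
    int i;
    for (i = 0; i < 3; ++i)
        sad_array[i] = sad_ref(src, src_stride, ref + i, ref_stride, 16, 16);
}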
+# Note the only difference in the following prototypes is that they return into
+# an array of short
+add_proto qw/void vp8_sad4x4x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
+specialize qw/vp8_sad4x4x8 sse4_1/;
+$vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4;
+
+add_proto qw/void vp8_sad8x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
+specialize qw/vp8_sad8x8x8 sse4_1/;
+$vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4;
+
+add_proto qw/void vp8_sad8x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
+specialize qw/vp8_sad8x16x8 sse4_1/;
+$vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4;
+
+add_proto qw/void vp8_sad16x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
+specialize qw/vp8_sad16x8x8 sse4_1/;
+$vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4;
+
+add_proto qw/void vp8_sad16x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
+specialize qw/vp8_sad16x16x8 sse4_1/;
+$vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4;
+
+#
+# Multi-block SAD, comparing a reference to N independent blocks
+#
+add_proto qw/void vp8_sad4x4x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad4x4x4d sse3/;
+
+add_proto qw/void vp8_sad8x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad8x8x4d sse3/;
+
+add_proto qw/void vp8_sad8x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad8x16x4d sse3/;
+
+add_proto qw/void vp8_sad16x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad16x8x4d sse3/;
+
+add_proto qw/void vp8_sad16x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad16x16x4d sse3/;
+
+#
+# Encoder functions below this point.
+#
+if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
+
+#
+# Sum of squares (vector)
+#
+add_proto qw/unsigned int vp8_get_mb_ss/, "const short *";
+specialize qw/vp8_get_mb_ss mmx sse2/;
+
+#
+# SSE (Sum Squared Error)
+#
+add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
+specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/;
+$vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;
+
+add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp8_mse16x16 mmx sse2 media neon_asm/;
+$vp8_mse16x16_sse2=vp8_mse16x16_wmt;
+$vp8_mse16x16_media=vp8_mse16x16_armv6;
+$vp8_mse16x16_neon_asm=vp8_mse16x16_neon;
+
+add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
+specialize qw/vp8_get4x4sse_cs mmx neon_asm/;
+$vp8_get4x4sse_cs_neon_asm=vp8_get4x4sse_cs_neon;
+
+#
+# Block copy
+#
+if ($opts{arch} =~ /x86/) {
+ add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n";
+ specialize qw/vp8_copy32xn sse2 sse3/;
+}
+
+#
+# Structured Similarity (SSIM)
+#
+if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+ $opts{arch} eq "x86_64" and $sse2_on_x86_64 = "sse2";
+
+ add_proto qw/void vp8_ssim_parms_8x8/, "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
+ specialize qw/vp8_ssim_parms_8x8/, "$sse2_on_x86_64";
+
+ add_proto qw/void vp8_ssim_parms_16x16/, "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
+ specialize qw/vp8_ssim_parms_16x16/, "$sse2_on_x86_64";
+}
+
+#
+# Forward DCT
+#
+add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
+specialize qw/vp8_short_fdct4x4 mmx sse2 media neon_asm/;
+$vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;
+$vp8_short_fdct4x4_neon_asm=vp8_short_fdct4x4_neon;
+
+add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
+specialize qw/vp8_short_fdct8x4 mmx sse2 media neon_asm/;
+$vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
+$vp8_short_fdct8x4_neon_asm=vp8_short_fdct8x4_neon;
+
+add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
+specialize qw/vp8_short_walsh4x4 sse2 media neon_asm/;
+$vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;
+$vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
+
+#
+# Quantizer
+#
+add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
+specialize qw/vp8_regular_quantize_b sse2/;
+# TODO(johann) Update sse4 implementation and re-enable
+#$vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4;
+
+add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
+specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
+$vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6;
+$vp8_fast_quantize_b_neon_asm=vp8_fast_quantize_b_neon;
+
+add_proto qw/void vp8_regular_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
+# no asm yet
+
+add_proto qw/void vp8_fast_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
+specialize qw/vp8_fast_quantize_b_pair neon_asm/;
+$vp8_fast_quantize_b_pair_neon_asm=vp8_fast_quantize_b_pair_neon;
+
+add_proto qw/void vp8_quantize_mb/, "struct macroblock *";
+specialize qw/vp8_quantize_mb neon/;
+
+add_proto qw/void vp8_quantize_mby/, "struct macroblock *";
+specialize qw/vp8_quantize_mby neon/;
+
+add_proto qw/void vp8_quantize_mbuv/, "struct macroblock *";
+specialize qw/vp8_quantize_mbuv neon/;
+
+#
+# Block subtraction
+#
+add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff";
+specialize qw/vp8_block_error mmx sse2/;
+$vp8_block_error_sse2=vp8_block_error_xmm;
+
+add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc";
+specialize qw/vp8_mbblock_error mmx sse2/;
+$vp8_mbblock_error_sse2=vp8_mbblock_error_xmm;
+
+add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
+specialize qw/vp8_mbuverror mmx sse2/;
+$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
+
+add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
+specialize qw/vp8_subtract_b mmx sse2 media neon_asm/;
+$vp8_subtract_b_media=vp8_subtract_b_armv6;
+$vp8_subtract_b_neon_asm=vp8_subtract_b_neon;
+
+add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
+specialize qw/vp8_subtract_mby mmx sse2 media neon_asm/;
+$vp8_subtract_mby_media=vp8_subtract_mby_armv6;
+$vp8_subtract_mby_neon_asm=vp8_subtract_mby_neon;
+
+add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
+specialize qw/vp8_subtract_mbuv mmx sse2 media neon_asm/;
+$vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6;
+$vp8_subtract_mbuv_neon_asm=vp8_subtract_mbuv_neon;
+
+#
+# Motion search
+#
+add_proto qw/int vp8_full_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+specialize qw/vp8_full_search_sad sse3 sse4_1/;
+$vp8_full_search_sad_sse3=vp8_full_search_sadx3;
+$vp8_full_search_sad_sse4_1=vp8_full_search_sadx8;
+
+add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+specialize qw/vp8_refining_search_sad sse3/;
+$vp8_refining_search_sad_sse3=vp8_refining_search_sadx4;
+
+add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+$vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4;
+
+#
+# Alt-ref Noise Reduction (ARNR)
+#
+if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/void vp8_temporal_filter_apply/, "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count";
+ specialize qw/vp8_temporal_filter_apply sse2/;
+}
+
+#
+# Pick Loopfilter
+#
+add_proto qw/void vp8_yv12_copy_partial_frame/, "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
+specialize qw/vp8_yv12_copy_partial_frame neon_asm/;
+$vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon;
+
+#
+# Denoiser filter
+#
+if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
+ add_proto qw/int vp8_denoiser_filter/, "unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
+ specialize qw/vp8_denoiser_filter sse2 neon/;
+}
+
+# End of encoder only functions
+}
+1;
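The add_proto/specialize pairs above are the input to libvpx's run-time CPU detection (rtcd): at build time the script emits a vp8_rtcd.h that declares one dispatch symbol per prototype plus an init routine that rebinds it to the best implementation the host CPU reports. A minimal C sketch of the generated shape, using the vp8_mse16x16 entry above (names, flag values, and the init signature are illustrative; the real generated header differs in detail):

    /* Sketch of the C that one add_proto/specialize pair turns into; the
     * real vp8_rtcd.h is machine-generated and more elaborate. */
    #define HAS_SSE2 0x04  /* illustrative; the real flag lives in vpx_ports/x86.h */

    unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride,
                                const unsigned char *ref_ptr, int ref_stride,
                                unsigned int *sse);
    unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride,
                                  const unsigned char *ref_ptr, int ref_stride,
                                  unsigned int *sse); /* bound via $vp8_mse16x16_sse2 */

    static unsigned int (*vp8_mse16x16)(const unsigned char *, int,
                                        const unsigned char *, int,
                                        unsigned int *);

    static void setup_rtcd_internal(int flags)  /* assumed shape of the init hook */
    {
        vp8_mse16x16 = vp8_mse16x16_c;                      /* portable fallback */
        if (flags & HAS_SSE2) vp8_mse16x16 = vp8_mse16x16_wmt;
    }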
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.sh b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.sh
deleted file mode 100755
index 9ebf389d8c6..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.sh
+++ /dev/null
@@ -1,542 +0,0 @@
-vp8_common_forward_decls() {
-cat <<EOF
-/*
- * VP8
- */
-
-struct blockd;
-struct macroblockd;
-struct loop_filter_info;
-
-/* Encoder forward decls */
-struct block;
-struct macroblock;
-struct variance_vtable;
-union int_mv;
-struct yv12_buffer_config;
-EOF
-}
-forward_decls vp8_common_forward_decls
-
-#
-# system state
-#
-prototype void vp8_clear_system_state ""
-specialize vp8_clear_system_state mmx
-vp8_clear_system_state_mmx=vpx_reset_mmx_state
-
-#
-# Dequant
-#
-prototype void vp8_dequantize_b "struct blockd*, short *dqc"
-specialize vp8_dequantize_b mmx media neon
-vp8_dequantize_b_media=vp8_dequantize_b_v6
-
-prototype void vp8_dequant_idct_add "short *input, short *dq, unsigned char *output, int stride"
-specialize vp8_dequant_idct_add mmx media neon dspr2
-vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6
-vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2
-
-prototype void vp8_dequant_idct_add_y_block "short *q, short *dq, unsigned char *dst, int stride, char *eobs"
-specialize vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2
-vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6
-vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2
-
-prototype void vp8_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"
-specialize vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2
-vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6
-vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2
-
-#
-# Loopfilter
-#
-prototype void vp8_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_mbv mmx sse2 media neon dspr2
-vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6
-vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2
-
-prototype void vp8_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_bv mmx sse2 media neon dspr2
-vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6
-vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2
-
-prototype void vp8_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_mbh mmx sse2 media neon dspr2
-vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6
-vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2
-
-prototype void vp8_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_bh mmx sse2 media neon dspr2
-vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6
-vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2
-
-
-prototype void vp8_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp8_loop_filter_simple_mbv mmx sse2 media neon
-vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c
-vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx
-vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2
-vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6
-vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon
-
-prototype void vp8_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp8_loop_filter_simple_mbh mmx sse2 media neon
-vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c
-vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx
-vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2
-vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6
-vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon
-
-prototype void vp8_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp8_loop_filter_simple_bv mmx sse2 media neon
-vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c
-vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx
-vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2
-vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6
-vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon
-
-prototype void vp8_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp8_loop_filter_simple_bh mmx sse2 media neon
-vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c
-vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx
-vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2
-vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6
-vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon
-
-#
-# IDCT
-#
-#idct16
-prototype void vp8_short_idct4x4llm "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"
-specialize vp8_short_idct4x4llm mmx media neon dspr2
-vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual
-vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2
-
-#iwalsh1
-prototype void vp8_short_inv_walsh4x4_1 "short *input, short *output"
-specialize vp8_short_inv_walsh4x4_1 dspr2
-vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2
-# no asm yet
-
-#iwalsh16
-prototype void vp8_short_inv_walsh4x4 "short *input, short *output"
-specialize vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2
-vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6
-vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2
-
-#idct1_scalar_add
-prototype void vp8_dc_only_idct_add "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"
-specialize vp8_dc_only_idct_add mmx media neon dspr2
-vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6
-vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2
-
-#
-# RECON
-#
-prototype void vp8_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp8_copy_mem16x16 mmx sse2 media neon dspr2
-vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6
-vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2
-
-prototype void vp8_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp8_copy_mem8x8 mmx media neon dspr2
-vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6
-vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2
-
-prototype void vp8_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp8_copy_mem8x4 mmx media neon dspr2
-vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6
-vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2
-
-prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride"
-specialize vp8_build_intra_predictors_mby_s sse2 ssse3
-#TODO: fix assembly for neon
-
-prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"
-specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
-
-prototype void vp8_intra4x4_predict "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"
-specialize vp8_intra4x4_predict media
-vp8_intra4x4_predict_media=vp8_intra4x4_predict_armv6
-
-#
-# Postproc
-#
-if [ "$CONFIG_POSTPROC" = "yes" ]; then
- prototype void vp8_mbpost_proc_down "unsigned char *dst, int pitch, int rows, int cols,int flimit"
- specialize vp8_mbpost_proc_down mmx sse2
- vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm
-
- prototype void vp8_mbpost_proc_across_ip "unsigned char *dst, int pitch, int rows, int cols,int flimit"
- specialize vp8_mbpost_proc_across_ip sse2
- vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm
-
- prototype void vp8_post_proc_down_and_across_mb_row "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"
- specialize vp8_post_proc_down_and_across_mb_row sse2
-
- prototype void vp8_plane_add_noise "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch"
- specialize vp8_plane_add_noise mmx sse2
- vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt
-
- prototype void vp8_blend_mb_inner "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
- # no asm yet
-
- prototype void vp8_blend_mb_outer "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
- # no asm yet
-
- prototype void vp8_blend_b "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
- # no asm yet
-
- prototype void vp8_filter_by_weight16x16 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
- specialize vp8_filter_by_weight16x16 sse2
-
- prototype void vp8_filter_by_weight8x8 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
- specialize vp8_filter_by_weight8x8 sse2
-
- prototype void vp8_filter_by_weight4x4 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
- # no asm yet
-fi
-
-#
-# Subpixel
-#
-prototype void vp8_sixtap_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2
-vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6
-vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2
-
-prototype void vp8_sixtap_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2
-vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6
-vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2
-
-prototype void vp8_sixtap_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2
-vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6
-vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2
-
-prototype void vp8_sixtap_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_sixtap_predict4x4 mmx ssse3 media neon dspr2
-vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6
-vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2
-
-prototype void vp8_bilinear_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon
-vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6
-
-prototype void vp8_bilinear_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon
-vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6
-
-prototype void vp8_bilinear_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_bilinear_predict8x4 mmx media neon
-vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6
-
-prototype void vp8_bilinear_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_bilinear_predict4x4 mmx media neon
-vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6
-
-#
-# Whole-pixel Variance
-#
-prototype unsigned int vp8_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp8_variance4x4 mmx sse2
-vp8_variance4x4_sse2=vp8_variance4x4_wmt
-
-prototype unsigned int vp8_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp8_variance8x8 mmx sse2 media neon
-vp8_variance8x8_sse2=vp8_variance8x8_wmt
-vp8_variance8x8_media=vp8_variance8x8_armv6
-
-prototype unsigned int vp8_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp8_variance8x16 mmx sse2 neon
-vp8_variance8x16_sse2=vp8_variance8x16_wmt
-
-prototype unsigned int vp8_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp8_variance16x8 mmx sse2 neon
-vp8_variance16x8_sse2=vp8_variance16x8_wmt
-
-prototype unsigned int vp8_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp8_variance16x16 mmx sse2 media neon
-vp8_variance16x16_sse2=vp8_variance16x16_wmt
-vp8_variance16x16_media=vp8_variance16x16_armv6
-
-#
-# Sub-pixel Variance
-#
-prototype unsigned int vp8_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp8_sub_pixel_variance4x4 mmx sse2
-vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt
-
-prototype unsigned int vp8_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp8_sub_pixel_variance8x8 mmx sse2 media neon
-vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt
-vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6
-
-prototype unsigned int vp8_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp8_sub_pixel_variance8x16 mmx sse2
-vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt
-
-prototype unsigned int vp8_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp8_sub_pixel_variance16x8 mmx sse2 ssse3
-vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt
-
-prototype unsigned int vp8_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon
-vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt
-vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6
-
-prototype unsigned int vp8_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp8_variance_halfpixvar16x16_h mmx sse2 media neon
-vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt
-vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6
-
-prototype unsigned int vp8_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp8_variance_halfpixvar16x16_v mmx sse2 media neon
-vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt
-vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6
-
-prototype unsigned int vp8_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp8_variance_halfpixvar16x16_hv mmx sse2 media neon
-vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt
-vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6
-
-#
-# Single block SAD
-#
-prototype unsigned int vp8_sad4x4 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp8_sad4x4 mmx sse2 neon
-vp8_sad4x4_sse2=vp8_sad4x4_wmt
-
-prototype unsigned int vp8_sad8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp8_sad8x8 mmx sse2 neon
-vp8_sad8x8_sse2=vp8_sad8x8_wmt
-
-prototype unsigned int vp8_sad8x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp8_sad8x16 mmx sse2 neon
-vp8_sad8x16_sse2=vp8_sad8x16_wmt
-
-prototype unsigned int vp8_sad16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp8_sad16x8 mmx sse2 neon
-vp8_sad16x8_sse2=vp8_sad16x8_wmt
-
-prototype unsigned int vp8_sad16x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp8_sad16x16 mmx sse2 sse3 media neon
-vp8_sad16x16_sse2=vp8_sad16x16_wmt
-vp8_sad16x16_media=vp8_sad16x16_armv6
-
-#
-# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-#
-prototype void vp8_sad4x4x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp8_sad4x4x3 sse3
-
-prototype void vp8_sad8x8x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp8_sad8x8x3 sse3
-
-prototype void vp8_sad8x16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp8_sad8x16x3 sse3
-
-prototype void vp8_sad16x8x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp8_sad16x8x3 sse3 ssse3
-
-prototype void vp8_sad16x16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp8_sad16x16x3 sse3 ssse3
-
-# Note the only difference in the following prototypes is that they return into
-# an array of short
-prototype void vp8_sad4x4x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
-specialize vp8_sad4x4x8 sse4_1
-vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4
-
-prototype void vp8_sad8x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
-specialize vp8_sad8x8x8 sse4_1
-vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4
-
-prototype void vp8_sad8x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
-specialize vp8_sad8x16x8 sse4_1
-vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4
-
-prototype void vp8_sad16x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
-specialize vp8_sad16x8x8 sse4_1
-vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4
-
-prototype void vp8_sad16x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
-specialize vp8_sad16x16x8 sse4_1
-vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4
-
-#
-# Multi-block SAD, comparing a reference to N independent blocks
-#
-prototype void vp8_sad4x4x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp8_sad4x4x4d sse3
-
-prototype void vp8_sad8x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp8_sad8x8x4d sse3
-
-prototype void vp8_sad8x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp8_sad8x16x4d sse3
-
-prototype void vp8_sad16x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp8_sad16x8x4d sse3
-
-prototype void vp8_sad16x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp8_sad16x16x4d sse3
-
-#
-# Encoder functions below this point.
-#
-if [ "$CONFIG_VP8_ENCODER" = "yes" ]; then
-
-#
-# Sum of squares (vector)
-#
-prototype unsigned int vp8_get_mb_ss "const short *"
-specialize vp8_get_mb_ss mmx sse2
-
-#
-# SSE (Sum Squared Error)
-#
-prototype unsigned int vp8_sub_pixel_mse16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp8_sub_pixel_mse16x16 mmx sse2
-vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt
-
-prototype unsigned int vp8_mse16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp8_mse16x16 mmx sse2 media neon
-vp8_mse16x16_sse2=vp8_mse16x16_wmt
-vp8_mse16x16_media=vp8_mse16x16_armv6
-
-prototype unsigned int vp8_get4x4sse_cs "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"
-specialize vp8_get4x4sse_cs mmx neon
-
-#
-# Block copy
-#
-case $arch in
- x86*)
- prototype void vp8_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
- specialize vp8_copy32xn sse2 sse3
- ;;
-esac
-
-#
-# Structured Similarity (SSIM)
-#
-if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
- [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
-
- prototype void vp8_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
- specialize vp8_ssim_parms_8x8 $sse2_on_x86_64
-
- prototype void vp8_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
- specialize vp8_ssim_parms_16x16 $sse2_on_x86_64
-fi
-
-#
-# Forward DCT
-#
-prototype void vp8_short_fdct4x4 "short *input, short *output, int pitch"
-specialize vp8_short_fdct4x4 mmx sse2 media neon
-vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6
-
-prototype void vp8_short_fdct8x4 "short *input, short *output, int pitch"
-specialize vp8_short_fdct8x4 mmx sse2 media neon
-vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6
-
-prototype void vp8_short_walsh4x4 "short *input, short *output, int pitch"
-specialize vp8_short_walsh4x4 sse2 media neon
-vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6
-
-#
-# Quantizer
-#
-prototype void vp8_regular_quantize_b "struct block *, struct blockd *"
-specialize vp8_regular_quantize_b sse2 #sse4_1
-# TODO(johann) Update sse4 implementation and re-enable
-#vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
-
-prototype void vp8_fast_quantize_b "struct block *, struct blockd *"
-specialize vp8_fast_quantize_b sse2 ssse3 media neon
-vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6
-
-prototype void vp8_regular_quantize_b_pair "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"
-# no asm yet
-
-prototype void vp8_fast_quantize_b_pair "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"
-specialize vp8_fast_quantize_b_pair neon
-
-prototype void vp8_quantize_mb "struct macroblock *"
-specialize vp8_quantize_mb neon
-
-prototype void vp8_quantize_mby "struct macroblock *"
-specialize vp8_quantize_mby neon
-
-prototype void vp8_quantize_mbuv "struct macroblock *"
-specialize vp8_quantize_mbuv neon
-
-#
-# Block subtraction
-#
-prototype int vp8_block_error "short *coeff, short *dqcoeff"
-specialize vp8_block_error mmx sse2
-vp8_block_error_sse2=vp8_block_error_xmm
-
-prototype int vp8_mbblock_error "struct macroblock *mb, int dc"
-specialize vp8_mbblock_error mmx sse2
-vp8_mbblock_error_sse2=vp8_mbblock_error_xmm
-
-prototype int vp8_mbuverror "struct macroblock *mb"
-specialize vp8_mbuverror mmx sse2
-vp8_mbuverror_sse2=vp8_mbuverror_xmm
-
-prototype void vp8_subtract_b "struct block *be, struct blockd *bd, int pitch"
-specialize vp8_subtract_b mmx sse2 media neon
-vp8_subtract_b_media=vp8_subtract_b_armv6
-
-prototype void vp8_subtract_mby "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride"
-specialize vp8_subtract_mby mmx sse2 media neon
-vp8_subtract_mby_media=vp8_subtract_mby_armv6
-
-prototype void vp8_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride"
-specialize vp8_subtract_mbuv mmx sse2 media neon
-vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6
-
-#
-# Motion search
-#
-prototype int vp8_full_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
-specialize vp8_full_search_sad sse3 sse4_1
-vp8_full_search_sad_sse3=vp8_full_search_sadx3
-vp8_full_search_sad_sse4_1=vp8_full_search_sadx8
-
-prototype int vp8_refining_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
-specialize vp8_refining_search_sad sse3
-vp8_refining_search_sad_sse3=vp8_refining_search_sadx4
-
-prototype int vp8_diamond_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
-vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4
-
-#
-# Alt-ref Noise Reduction (ARNR)
-#
-if [ "$CONFIG_REALTIME_ONLY" != "yes" ]; then
- prototype void vp8_temporal_filter_apply "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count"
- specialize vp8_temporal_filter_apply sse2
-fi
-
-#
-# Pick Loopfilter
-#
-prototype void vp8_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
-specialize vp8_yv12_copy_partial_frame neon
-
-#
-# Denoiser filter
-#
-if [ "$CONFIG_TEMPORAL_DENOISING" = "yes" ]; then
- prototype int vp8_denoiser_filter "struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset"
- specialize vp8_denoiser_filter sse2
-fi
-
-# End of encoder only functions
-fi
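The deleted shell definitions above map one-to-one onto the Perl file added earlier in this patch; only the quoting changes, plus the Perl version's split of NEON into intrinsics (neon) and assembly (neon_asm) targets, with each $..._neon_asm remapped onto the existing _neon symbol. Side by side:

    # rtcd_defs.sh (removed)               # rtcd_defs.pl (added)
    prototype void vp8_quantize_mb "..."   add_proto qw/void vp8_quantize_mb/, "...";
    specialize vp8_quantize_mb neon        specialize qw/vp8_quantize_mb neon/;
    vp8_mse16x16_sse2=vp8_mse16x16_wmt     $vp8_mse16x16_sse2=vp8_mse16x16_wmt;
    if [ "$CONFIG_X" = "yes" ]; then       if (vpx_config("CONFIG_X") eq "yes") {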
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.h
index e515c3a005b..608f4a9ac25 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.h
@@ -8,8 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP8_COMMON_SETUPINTRARECON_H_
+#define VP8_COMMON_SETUPINTRARECON_H_
#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf);
@@ -31,3 +37,9 @@ void setup_intra_recon_left(unsigned char *y_buffer,
for (i = 0; i < 8; i++)
v_buffer[uv_stride *i] = (unsigned char) 129;
}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_SETUPINTRARECON_H_
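This hunk is the template for the header cleanups throughout the rest of the patch: each vp8 header gains a guard named after its path (VP8_<DIR>_<BASENAME>_H_) and an extern "C" wrapper so its declarations keep C linkage when pulled into Chromium's C++ translation units. The pattern as a standalone sketch, with a placeholder file name:

    /* illustrative header name; each real header substitutes its own path */
    #ifndef VP8_COMMON_EXAMPLE_H_
    #define VP8_COMMON_EXAMPLE_H_

    #ifdef __cplusplus
    extern "C" {
    #endif

    void vp8_example(void);  /* C linkage even in C++ builds */

    #ifdef __cplusplus
    }  // extern "C"
    #endif

    #endif  // VP8_COMMON_EXAMPLE_H_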
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/swapyv12buffer.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/swapyv12buffer.h
index a6473ed92be..1d66cd3d62e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/swapyv12buffer.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/swapyv12buffer.h
@@ -9,11 +9,19 @@
*/
-#ifndef SWAPYV12_BUFFER_H
-#define SWAPYV12_BUFFER_H
+#ifndef VP8_COMMON_SWAPYV12BUFFER_H_
+#define VP8_COMMON_SWAPYV12BUFFER_H_
#include "vpx_scale/yv12config.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_SWAPYV12BUFFER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/systemdependent.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/systemdependent.h
index e6b0456f75b..3d44e37cf24 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/systemdependent.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/systemdependent.h
@@ -8,8 +8,20 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_
+#define VP8_COMMON_SYSTEMDEPENDENT_H_
#include "vpx_config.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
struct VP8Common;
void vp8_machine_specific_config(struct VP8Common *);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_SYSTEMDEPENDENT_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h
index ed9e3e60dcb..01c82dbb805 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h
@@ -9,8 +9,12 @@
*/
-#ifndef _PTHREAD_EMULATION
-#define _PTHREAD_EMULATION
+#ifndef VP8_COMMON_THREADING_H_
+#define VP8_COMMON_THREADING_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
@@ -183,4 +187,8 @@ static inline int sem_destroy(sem_t * sem)
#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_THREADING_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/treecoder.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/treecoder.h
index ebf51c5ed65..d22b7c570cb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/treecoder.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/treecoder.h
@@ -9,8 +9,12 @@
*/
-#ifndef __INC_TREECODER_H
-#define __INC_TREECODER_H
+#ifndef VP8_COMMON_TREECODER_H_
+#define VP8_COMMON_TREECODER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
typedef unsigned char vp8bc_index_t; /* probability index */
@@ -87,4 +91,8 @@ void vp8bc_tree_probs_from_distribution(
);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_TREECODER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h
index 01193b8d724..89a32a72268 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h
@@ -9,11 +9,15 @@
*/
-#ifndef VARIANCE_H
-#define VARIANCE_H
+#ifndef VP8_COMMON_VARIANCE_H_
+#define VP8_COMMON_VARIANCE_H_
#include "vpx_config.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
typedef unsigned int(*vp8_sad_fn_t)(
const unsigned char *src_ptr,
int source_stride,
@@ -112,4 +116,8 @@ typedef struct variance_vtable
#endif
} vp8_variance_fn_ptr_t;
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_COMMON_VARIANCE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/vp8_entropymodedata.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/vp8_entropymodedata.h
index 13e9a92fc10..c4aed498970 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/vp8_entropymodedata.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/vp8_entropymodedata.h
@@ -8,6 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
/*Generated file, included by entropymode.c*/
@@ -240,3 +246,9 @@ const vp8_prob vp8_kf_bmode_prob
{ 112, 19, 12, 61, 195, 128, 48, 4, 24 }
}
};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_VP8_ENTROPYMODEDATA_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/filter_x86.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/filter_x86.h
index cfadaeecbc1..d282841bee4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/filter_x86.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/filter_x86.h
@@ -8,11 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef FILTER_X86_H
-#define FILTER_X86_H
+#ifndef VP8_COMMON_X86_FILTER_X86_H_
+#define VP8_COMMON_X86_FILTER_X86_H_
#include "vpx_ports/mem.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
* duplicated values */
@@ -22,4 +26,8 @@ extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
/* duplicated 8x */
extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
-#endif /* FILTER_X86_H */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_X86_FILTER_X86_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/loopfilter_mmx.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/loopfilter_mmx.asm
index f388d247681..88a07b9f3fa 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/loopfilter_mmx.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/loopfilter_mmx.asm
@@ -527,7 +527,7 @@ sym(vp8_loop_filter_vertical_edge_mmx):
pxor mm7, [GLOBAL(t80)] ; unoffset
; mm7 = q1
- ; tranpose and write back
+ ; transpose and write back
; mm1 = 72 62 52 42 32 22 12 02
; mm6 = 73 63 53 43 33 23 13 03
; mm3 = 74 64 54 44 34 24 14 04
@@ -1289,7 +1289,7 @@ sym(vp8_mbloop_filter_vertical_edge_mmx):
pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06
- ; tranpose and write back
+ ; transpose and write back
movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/loopfilter_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/loopfilter_sse2.asm
index a66753ba806..1913abc69b0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/loopfilter_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/loopfilter_sse2.asm
@@ -958,7 +958,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
; start work on filters
B_FILTER 2
- ; tranpose and write back - only work on q1, q0, p0, p1
+ ; transpose and write back - only work on q1, q0, p0, p1
BV_TRANSPOSE
; store 16-line result
@@ -1023,7 +1023,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
; start work on filters
B_FILTER 2
- ; tranpose and write back - only work on q1, q0, p0, p1
+ ; transpose and write back - only work on q1, q0, p0, p1
BV_TRANSPOSE
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/postproc_mmx.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/postproc_mmx.asm
index 5cf110b5326..8be3431f9b0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/postproc_mmx.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/postproc_mmx.asm
@@ -204,13 +204,16 @@ sym(vp8_mbpost_proc_down_mmx):
and rcx, 15
movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
+ cmp edx, 8
+ jl .skip_assignment
+
mov rcx, rdx
sub rcx, 8
-
and rcx, 15
movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
-
movd [rsi], mm1
+
+.skip_assignment
lea rsi, [rsi+rax]
lea rdi, [rdi+rax]
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/postproc_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/postproc_sse2.asm
index 00f84a31b21..f53daa7e508 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/postproc_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/postproc_sse2.asm
@@ -425,13 +425,16 @@ sym(vp8_mbpost_proc_down_xmm):
and rcx, 15
movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
+ cmp edx, 8
+ jl .skip_assignment
+
mov rcx, rdx
sub rcx, 8
-
and rcx, 15
movq mm0, [rsp + rcx*8] ;d[rcx*8]
-
movq [rsi], mm0
+
+.skip_assignment
lea rsi, [rsi+rax]
lea rdi, [rdi+rax]
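In both post-proc down filters (the MMX variant above and this SSE2 one) the new cmp/jl pair guards the write-back of the delayed result: the filter records each row's value in a 16-slot scratch array and flushes it to the destination eight rows later, so for rows 0..7 the slot it would flush has never been written. A C-level reading of the guard (a sketch; the variable names are mine, not the asm's):

    d[row & 15] = filtered;           /* always record the new row value   */
    if (row >= 8)                     /* the new cmp edx, 8 / jl guard     */
        dst[0] = d[(row - 8) & 15];   /* flush only once the slot is primed */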
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/recon_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/recon_sse2.asm
index 1434bcd9379..7141f832463 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/recon_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/recon_sse2.asm
@@ -365,6 +365,7 @@ sym(vp8_intra_pred_uv_tm_%1):
GET_GOT rbx
push rsi
push rdi
+ push rbx
; end prolog
; read top row
@@ -395,8 +396,11 @@ sym(vp8_intra_pred_uv_tm_%1):
movsxd rcx, dword ptr arg(1) ;dst_stride
.vp8_intra_pred_uv_tm_%1_loop:
- movd xmm3, [rsi]
- movd xmm5, [rsi+rax]
+ mov bl, [rsi]
+ movd xmm3, ebx
+
+ mov bl, [rsi+rax]
+ movd xmm5, ebx
%ifidn %1, sse2
punpcklbw xmm3, xmm0
punpcklbw xmm5, xmm0
@@ -419,6 +423,7 @@ sym(vp8_intra_pred_uv_tm_%1):
jnz .vp8_intra_pred_uv_tm_%1_loop
; begin epilog
+ pop rbx
pop rdi
pop rsi
RESTORE_GOT
@@ -486,10 +491,8 @@ sym(vp8_intra_pred_uv_ho_%1):
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
-%ifidn %1, ssse3
-%ifndef GET_GOT_SAVE_ARG
push rbx
-%endif
+%ifidn %1, ssse3
GET_GOT rbx
%endif
; end prolog
@@ -507,13 +510,16 @@ sym(vp8_intra_pred_uv_ho_%1):
%ifidn %1, ssse3
lea rdx, [rcx*3]
movdqa xmm2, [GLOBAL(dc_00001111)]
- lea rbx, [rax*3]
%endif
%ifidn %1, mmx2
.vp8_intra_pred_uv_ho_%1_loop:
- movd mm0, [rsi]
- movd mm1, [rsi+rax]
+ mov bl, [rsi]
+ movd mm0, ebx
+
+ mov bl, [rsi+rax]
+ movd mm1, ebx
+
punpcklbw mm0, mm0
punpcklbw mm1, mm1
pshufw mm0, mm0, 0x0
@@ -525,10 +531,19 @@ sym(vp8_intra_pred_uv_ho_%1):
dec edx
jnz .vp8_intra_pred_uv_ho_%1_loop
%else
- movd xmm0, [rsi]
- movd xmm3, [rsi+rax]
- movd xmm1, [rsi+rax*2]
- movd xmm4, [rsi+rbx]
+ mov bl, [rsi]
+ movd xmm0, ebx
+
+ mov bl, [rsi+rax]
+ movd xmm3, ebx
+
+ mov bl, [rsi+rax*2]
+ movd xmm1, ebx
+
+ lea rbx, [rax*3]
+ mov bl, [rsi+rbx]
+ movd xmm4, ebx
+
punpcklbw xmm0, xmm3
punpcklbw xmm1, xmm4
pshufb xmm0, xmm2
@@ -539,10 +554,20 @@ sym(vp8_intra_pred_uv_ho_%1):
movhps [rdi+rdx], xmm1
lea rsi, [rsi+rax*4]
lea rdi, [rdi+rcx*4]
- movd xmm0, [rsi]
- movd xmm3, [rsi+rax]
- movd xmm1, [rsi+rax*2]
- movd xmm4, [rsi+rbx]
+
+ mov bl, [rsi]
+ movd xmm0, ebx
+
+ mov bl, [rsi+rax]
+ movd xmm3, ebx
+
+ mov bl, [rsi+rax*2]
+ movd xmm1, ebx
+
+ lea rbx, [rax*3]
+ mov bl, [rsi+rbx]
+ movd xmm4, ebx
+
punpcklbw xmm0, xmm3
punpcklbw xmm1, xmm4
pshufb xmm0, xmm2
@@ -556,10 +581,8 @@ sym(vp8_intra_pred_uv_ho_%1):
; begin epilog
%ifidn %1, ssse3
RESTORE_GOT
-%ifndef GET_GOT_SAVE_ARG
- pop rbx
-%endif
%endif
+ pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
@@ -893,6 +916,7 @@ sym(vp8_intra_pred_y_tm_%1):
SAVE_XMM 7
push rsi
push rdi
+ push rbx
GET_GOT rbx
; end prolog
@@ -926,8 +950,11 @@ sym(vp8_intra_pred_y_tm_%1):
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
vp8_intra_pred_y_tm_%1_loop:
- movd xmm4, [rsi]
- movd xmm5, [rsi+rax]
+ mov bl, [rsi]
+ movd xmm4, ebx
+
+ mov bl, [rsi+rax]
+ movd xmm5, ebx
%ifidn %1, sse2
punpcklbw xmm4, xmm0
punpcklbw xmm5, xmm0
@@ -956,6 +983,7 @@ vp8_intra_pred_y_tm_%1_loop:
; begin epilog
RESTORE_GOT
+ pop rbx
pop rdi
pop rsi
RESTORE_XMM
@@ -1029,6 +1057,7 @@ sym(vp8_intra_pred_y_ho_sse2):
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
+ push rbx
; end prolog
;arg(2) not used
@@ -1041,8 +1070,11 @@ sym(vp8_intra_pred_y_ho_sse2):
movsxd rcx, dword ptr arg(1) ;dst_stride
vp8_intra_pred_y_ho_sse2_loop:
- movd xmm0, [rsi]
- movd xmm1, [rsi+rax]
+ mov bl, [rsi]
+ movd xmm0, ebx
+ mov bl, [rsi+rax]
+ movd xmm1, ebx
+
; FIXME use pshufb for ssse3 version
punpcklbw xmm0, xmm0
punpcklbw xmm1, xmm1
@@ -1058,6 +1090,7 @@ vp8_intra_pred_y_ho_sse2_loop:
jnz vp8_intra_pred_y_ho_sse2_loop
; begin epilog
+ pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
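The recurring rewrite in this file swaps 4-byte movd loads of a single left-column pixel for a 1-byte load through bl: movd always reads 32 bits, so when the wanted pixel sits at the end of its allocation the old code could read up to three bytes past it, and only the low byte survives the following unpack/shuffle anyway. Because bl lives in rbx, a callee-saved register, each affected function now pushes and pops rbx in its prolog/epilog. A C analogue of the change (sketch):

    /* old: 32-bit load to fetch one pixel; may touch 3 bytes past it */
    /* uint32_t v = *(const uint32_t *)left_col;                      */
    /* new: exact 1-byte load, always in bounds                       */
    unsigned char v = left_col[0];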
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/dboolhuff.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/dboolhuff.c
index 0007d7a7a3e..b874d4c46c0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/dboolhuff.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/dboolhuff.c
@@ -10,11 +10,12 @@
#include "dboolhuff.h"
+#include "vp8/common/common.h"
int vp8dx_start_decode(BOOL_DECODER *br,
const unsigned char *source,
unsigned int source_sz,
- vp8_decrypt_cb *decrypt_cb,
+ vpx_decrypt_cb decrypt_cb,
void *decrypt_state)
{
br->user_buffer_end = source+source_sz;
@@ -39,7 +40,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
const unsigned char *bufptr = br->user_buffer;
VP8_BD_VALUE value = br->value;
int count = br->count;
- int shift = VP8_BD_VALUE_SIZE - 8 - (count + 8);
+ int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
size_t bytes_left = br->user_buffer_end - bufptr;
size_t bits_left = bytes_left * CHAR_BIT;
int x = (int)(shift + CHAR_BIT - bits_left);
@@ -47,7 +48,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1];
if (br->decrypt_cb) {
- size_t n = bytes_left > sizeof(decrypted) ? sizeof(decrypted) : bytes_left;
+ size_t n = MIN(sizeof(decrypted), bytes_left);
br->decrypt_cb(br->decrypt_state, bufptr, decrypted, (int)n);
bufptr = decrypted;
}
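Both clamps introduced here lean on the MIN macro that vp8/common/common.h now supplies (the same definition this patch deletes from error_concealment.c further down), and the magic 8 becomes CHAR_BIT so the bit accounting is explicit:

    #define MIN(x, y) (((x) < (y)) ? (x) : (y))    /* as removed from
                                                      error_concealment.c */

    size_t n = MIN(sizeof(decrypted), bytes_left); /* never walks past the
                                                      caller's buffer */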
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/dboolhuff.h b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/dboolhuff.h
index 4c0ca1ce737..51c5adc28ab 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/dboolhuff.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/dboolhuff.h
@@ -9,16 +9,21 @@
*/
-#ifndef DBOOLHUFF_H_
-#define DBOOLHUFF_H_
+#ifndef VP8_DECODER_DBOOLHUFF_H_
+#define VP8_DECODER_DBOOLHUFF_H_
#include <stddef.h>
#include <limits.h>
#include "vpx_config.h"
#include "vpx_ports/mem.h"
+#include "vpx/vp8dx.h"
#include "vpx/vpx_integer.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
typedef size_t VP8_BD_VALUE;
#define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT)
@@ -28,12 +33,6 @@ typedef size_t VP8_BD_VALUE;
Even relatively modest values like 100 would work fine.*/
#define VP8_LOTS_OF_BITS (0x40000000)
-/*Decrypt n bytes of data from input -> output, using the decrypt_state
- passed in VP8D_SET_DECRYPTOR.
-*/
-typedef void (vp8_decrypt_cb)(void *decrypt_state, const unsigned char *input,
- unsigned char *output, int count);
-
typedef struct
{
const unsigned char *user_buffer_end;
@@ -41,7 +40,7 @@ typedef struct
VP8_BD_VALUE value;
int count;
unsigned int range;
- vp8_decrypt_cb *decrypt_cb;
+ vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
} BOOL_DECODER;
@@ -50,7 +49,7 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
int vp8dx_start_decode(BOOL_DECODER *br,
const unsigned char *source,
unsigned int source_sz,
- vp8_decrypt_cb *decrypt_cb,
+ vpx_decrypt_cb decrypt_cb,
void *decrypt_state);
void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
@@ -135,4 +134,8 @@ static int vp8dx_bool_error(BOOL_DECODER *br)
return 0;
}
-#endif // DBOOLHUFF_H_
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_DBOOLHUFF_H_
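Besides the guard rename, this header drops its private vp8_decrypt_cb in favor of the public vpx_decrypt_cb from vpx/vp8dx.h. The old type was a function type used through an explicit pointer; judging by its use here (bare `vpx_decrypt_cb decrypt_cb` fields and parameters), the public one is a pointer typedef with the same signature, which is why the `*` vanishes from every declaration. Sketch of the two forms:

    /* removed: function type, declared as vp8_decrypt_cb *cb */
    typedef void (vp8_decrypt_cb)(void *decrypt_state, const unsigned char *input,
                                  unsigned char *output, int count);

    /* public replacement in vpx/vp8dx.h (assumed to be the pointer form) */
    typedef void (*vpx_decrypt_cb)(void *decrypt_state, const unsigned char *input,
                                   unsigned char *output, int count);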
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodframe.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c
index 16da78a2c1e..e7cf0d9b9c6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c
@@ -17,6 +17,7 @@
#include "vp8/common/reconintra4x4.h"
#include "vp8/common/reconinter.h"
#include "detokenize.h"
+#include "vp8/common/common.h"
#include "vp8/common/invtrans.h"
#include "vp8/common/alloccommon.h"
#include "vp8/common/entropymode.h"
@@ -631,9 +632,17 @@ static void decode_mb_rows(VP8D_COMP *pbi)
xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
- xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
- xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
- xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
+ if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) {
+ MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame;
+ xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset;
+ xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset;
+ xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset;
+ } else {
+ // ref_frame is INTRA_FRAME, pre buffer should not be used.
+ xd->pre.y_buffer = 0;
+ xd->pre.u_buffer = 0;
+ xd->pre.v_buffer = 0;
+ }
/* propagate errors from reference frames */
xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
@@ -680,7 +689,6 @@ static void decode_mb_rows(VP8D_COMP *pbi)
vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1,
recon_y_stride, recon_uv_stride,
lf_dst[0], lf_dst[1], lf_dst[2]);
-
if(mb_row > 1)
{
yv12_extend_frame_left_right_c(yv12_fb_new,
@@ -691,10 +699,6 @@ static void decode_mb_rows(VP8D_COMP *pbi)
eb_dst[0] += recon_y_stride * 16;
eb_dst[1] += recon_uv_stride * 8;
eb_dst[2] += recon_uv_stride * 8;
-
- if(mb_row == 2)
- yv12_extend_frame_top_c(yv12_fb_new);
-
}
lf_dst[0] += recon_y_stride * 16;
@@ -713,13 +717,9 @@ static void decode_mb_rows(VP8D_COMP *pbi)
eb_dst[0],
eb_dst[1],
eb_dst[2]);
-
eb_dst[0] += recon_y_stride * 16;
eb_dst[1] += recon_uv_stride * 8;
eb_dst[2] += recon_uv_stride * 8;
-
- if(mb_row == 1)
- yv12_extend_frame_top_c(yv12_fb_new);
}
}
}
@@ -747,7 +747,7 @@ static void decode_mb_rows(VP8D_COMP *pbi)
eb_dst[0],
eb_dst[1],
eb_dst[2]);
-
+ yv12_extend_frame_top_c(yv12_fb_new);
yv12_extend_frame_bottom_c(yv12_fb_new);
}
@@ -1019,8 +1019,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
const unsigned char *clear = data;
if (pbi->decrypt_cb)
{
- int n = (int)(data_end - data);
- if (n > 10) n = 10;
+ int n = (int)MIN(sizeof(clear_buffer), data_end - data);
pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
clear = clear_buffer;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.h b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.h
index 05a33d27f6a..f33b07351d3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.h
@@ -8,11 +8,19 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef DECODEMV_H_
-#define DECODEMV_H_
+#ifndef VP8_DECODER_DECODEMV_H_
+#define VP8_DECODER_DECODEMV_H_
#include "onyxd_int.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
void vp8_decode_mode_mvs(VP8D_COMP *);
-#endif // DECODEMV_H_
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_DECODEMV_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decoderthreading.h b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decoderthreading.h
index bc716e489e0..c563cf6e93a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decoderthreading.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decoderthreading.h
@@ -8,8 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef DECODERTHREADING_H_
-#define DECODERTHREADING_H_
+#ifndef VP8_DECODER_DECODERTHREADING_H_
+#define VP8_DECODER_DECODERTHREADING_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
#if CONFIG_MULTITHREAD
void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
@@ -19,4 +23,8 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
#endif
-#endif // DECODERTHREADING_H_
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_DECODERTHREADING_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.h b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.h
index f2130b36178..f0b125444f0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.h
@@ -8,12 +8,20 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef DETOKENIZE_H_
-#define DETOKENIZE_H_
+#ifndef VP8_DECODER_DETOKENIZE_H_
+#define VP8_DECODER_DETOKENIZE_H_
#include "onyxd_int.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
-#endif // DETOKENIZE_H
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_DETOKENIZE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/ec_types.h b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/ec_types.h
index b24bfd9439f..3af5ca86b4b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/ec_types.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/ec_types.h
@@ -8,8 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP8_DEC_EC_TYPES_H
-#define VP8_DEC_EC_TYPES_H
+#ifndef VP8_DECODER_EC_TYPES_H_
+#define VP8_DECODER_EC_TYPES_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
#define MAX_OVERLAPS 16
@@ -47,4 +51,8 @@ typedef struct
MV_REFERENCE_FRAME ref_frame;
} EC_BLOCK;
-#endif // VP8_DEC_EC_TYPES_H
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_EC_TYPES_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c
index 0b58c98fd88..4b304c83c78 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c
@@ -15,9 +15,7 @@
#include "decodemv.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/findnearmv.h"
-
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
+#include "vp8/common/common.h"
#define FLOOR(x,q) ((x) & -(1 << (q)))
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.h b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.h
index fb96b3605ec..9a1e024865d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.h
@@ -9,12 +9,16 @@
*/
-#ifndef ERROR_CONCEALMENT_H_
-#define ERROR_CONCEALMENT_H_
+#ifndef VP8_DECODER_ERROR_CONCEALMENT_H_
+#define VP8_DECODER_ERROR_CONCEALMENT_H_
#include "onyxd_int.h"
#include "ec_types.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/* Allocate memory for the overlap lists */
int vp8_alloc_overlap_lists(VP8D_COMP *pbi);
@@ -38,4 +42,8 @@ void vp8_interpolate_motion(MACROBLOCKD *mb,
*/
void vp8_conceal_corrupt_mb(MACROBLOCKD *xd);
-#endif // ERROR_CONCEALMENT_H_
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_ERROR_CONCEALMENT_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c
index 2d9e343bc9f..29fea616bc8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c
@@ -178,12 +178,6 @@ vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_
return pbi->common.error.error_code;
}
-/*For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.*/
-#if HAVE_NEON
-extern void vp8_push_neon(int64_t *store);
-extern void vp8_pop_neon(int64_t *store);
-#endif
-
static int get_free_fb (VP8_COMMON *cm)
{
int i;
@@ -307,9 +301,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
const uint8_t *source,
int64_t time_stamp)
{
-#if HAVE_NEON
- int64_t dx_store_reg[8];
-#endif
VP8_COMMON *cm = &pbi->common;
int retcode = -1;
@@ -319,15 +310,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
if(retcode <= 0)
return retcode;
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->cpu_caps & HAS_NEON)
-#endif
- {
- vp8_push_neon(dx_store_reg);
- }
-#endif
-
cm->new_fb_idx = get_free_fb (cm);
/* setup reference frames for vp8_decode_frame */
@@ -403,15 +385,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
pbi->last_time_stamp = time_stamp;
decode_exit:
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->cpu_caps & HAS_NEON)
-#endif
- {
- vp8_pop_neon(dx_store_reg);
- }
-#endif
-
pbi->common.error.setjmp = 0;
return retcode;
}
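
The deleted push/pop pairs existed because the hand-written NEON routines clobber d8-d15, which the ARM AAPCS designates callee-saved, so the C callers conservatively saved them around every decode call. This patch moves that duty into the assembly itself: the vpush {d8-d15}/vpop {d8-d15} pairs added in the hunks below, which is also why stack-argument offsets grow by 64 bytes (eight 8-byte d registers), e.g. [sp, #16] becoming [sp, #80]. The parallel .asm-to-intrinsics conversions in this update (e.g. the new denoising_neon.c below) avoid the problem altogether, since the compiler then handles register allocation and callee-saved spills. A hedged sketch of that intrinsics style, using a made-up 16-byte row copy rather than a routine from this patch:

#include <arm_neon.h>

/* Illustrative only: with intrinsics the compiler enforces the AAPCS and
 * spills d8-d15 itself when needed; no manual vpush/vpop is required. */
static void copy_row16(unsigned char *dst, const unsigned char *src) {
  vst1q_u8(dst, vld1q_u8(src));
}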
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h
index 54a98f7cc32..aa2cc57f7b1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h
@@ -9,8 +9,8 @@
*/
-#ifndef ONYXD_INT_H_
-#define ONYXD_INT_H_
+#ifndef VP8_DECODER_ONYXD_INT_H_
+#define VP8_DECODER_ONYXD_INT_H_
#include "vpx_config.h"
#include "vp8/common/onyxd.h"
@@ -22,6 +22,10 @@
#include "ec_types.h"
#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+
typedef struct
{
int ithread;
@@ -122,7 +126,7 @@ typedef struct VP8D_COMP
int independent_partitions;
int frame_corrupt_residual;
- vp8_decrypt_cb *decrypt_cb;
+ vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
} VP8D_COMP;
@@ -148,4 +152,8 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb);
} while(0)
#endif
-#endif // ONYXD_INT_H_
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_ONYXD_INT_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/treereader.h b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/treereader.h
index 9393bb47857..35ee696000d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/treereader.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/treereader.h
@@ -9,12 +9,16 @@
*/
-#ifndef TREEREADER_H_
-#define TREEREADER_H_
+#ifndef VP8_DECODER_TREEREADER_H_
+#define VP8_DECODER_TREEREADER_H_
#include "vp8/common/treecoder.h"
#include "dboolhuff.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
typedef BOOL_DECODER vp8_reader;
#define vp8_read vp8dx_decode_bool
@@ -37,4 +41,8 @@ static int vp8_treed_read(
return -i;
}
-#endif // TREEREADER_H_
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_TREEREADER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/denoising_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/denoising_neon.c
new file mode 100644
index 00000000000..32ce65abf4c
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/denoising_neon.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/encoder/denoising.h"
+#include "vpx_mem/vpx_mem.h"
+#include "./vp8_rtcd.h"
+
+/*
+ * The filter function was modified to reduce the computational complexity.
+ *
+ * Step 1:
+ * Instead of applying tap coefficients for each pixel, we calculated the
+ * pixel adjustments vs. pixel diff value ahead of time.
+ * adjustment = filtered_value - current_raw
+ * = (filter_coefficient * diff + 128) >> 8
+ * where
+ * filter_coefficient = (255 << 8) / (256 + ((abs_diff * 330) >> 3));
+ * filter_coefficient += filter_coefficient /
+ * (3 + motion_magnitude_adjustment);
+ * filter_coefficient is clamped to 0 ~ 255.
+ *
+ * Step 2:
+ * The adjustment vs. diff curve becomes flat very quickly as diff increases.
+ * This allowed us to use only a few levels to approximate the curve without
+ * changing the filtering algorithm too much.
+ * The adjustments were further corrected by checking the motion magnitude.
+ * The levels used are:
+ * diff level adjustment w/o adjustment w/
+ * motion correction motion correction
+ * [-255, -16] 3 -6 -7
+ * [-15, -8] 2 -4 -5
+ * [-7, -4] 1 -3 -4
+ * [-3, 3] 0 diff diff
+ * [4, 7] 1 3 4
+ * [8, 15] 2 4 5
+ * [16, 255] 3 6 7
+ */
+
+int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
+ int mc_running_avg_y_stride,
+ unsigned char *running_avg_y,
+ int running_avg_y_stride,
+ unsigned char *sig, int sig_stride,
+ unsigned int motion_magnitude,
+ int increase_denoising) {
+  /* If motion_magnitude is small, make the denoiser more aggressive by
+   * increasing the adjustment for each level: the level 1 adjustment is
+   * increased, the deltas stay the same.
+ */
+ const uint8x16_t v_level1_adjustment = vdupq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 : 3);
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vdupq_n_u8(4);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+ int64x2_t v_sum_diff_total = vdupq_n_s64(0);
+
+ /* Go over lines. */
+ int i;
+ for (i = 0; i < 16; ++i) {
+ /* Load inputs. */
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+
+    /* Figure out which level we fall into. */
+ const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold,
+ v_abs_diff);
+ const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold,
+ v_abs_diff);
+ const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold,
+ v_abs_diff);
+
+ /* Calculate absolute adjustments for level 1, 2 and 3. */
+ const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask,
+ v_delta_level_1_and_2);
+ const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask,
+ v_delta_level_2_and_3);
+ const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment,
+ v_level2_adjustment);
+ const uint8x16_t v_level1and2and3_adjustment = vaddq_u8(
+ v_level1and2_adjustment, v_level3_adjustment);
+
+    /* Select the absolute adjustment: the absolute difference if we are in
+     * level 0, or the precomputed value for levels 1, 2 and 3.
+ */
+ const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask,
+ v_level1and2and3_adjustment, v_abs_diff);
+
+ /* Calculate positive and negative adjustments. Apply them to the signal
+ * and accumulate them. Adjustments are less than eight and the maximum
+ * sum of them (7 * 16) can fit in a signed char.
+ */
+ const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
+ v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
+ v_abs_adjustment);
+
+ uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store results. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+ /* Sum all the accumulators to have the sum of all pixel differences
+ * for this macroblock.
+ */
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+ vreinterpretq_s8_u8(v_neg_adjustment));
+
+ const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
+
+ const int32x4_t fedc_ba98_7654_3210 =
+ vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+
+ const int64x2_t fedcba98_76543210 =
+ vpaddlq_s32(fedc_ba98_7654_3210);
+
+ v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
+ }
+
+ /* Update pointers for next iteration. */
+ sig += sig_stride;
+ mc_running_avg_y += mc_running_avg_y_stride;
+ running_avg_y += running_avg_y_stride;
+ }
+
+  /* Too many adjustments => copy the block. */
+ {
+ const int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+ vget_low_s64(v_sum_diff_total));
+ const int s0 = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+
+ if (s0 > SUM_DIFF_THRESHOLD)
+ return COPY_BLOCK;
+ }
+
+ /* Tell above level that block was filtered. */
+ running_avg_y -= running_avg_y_stride * 16;
+ sig -= sig_stride * 16;
+
+ vp8_copy_mem16x16(running_avg_y, running_avg_y_stride, sig, sig_stride);
+
+ return FILTER_BLOCK;
+}
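
The vector code above implements the level table from its header comment with compare masks (vcleq_u8), masked adds, and a final saturating apply. A scalar C sketch of the same per-pixel rule may make the mask arithmetic easier to follow; it is illustrative only, with "aggressive" standing for motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD:

#include <stdlib.h>

/* Scalar equivalent of one lane of the NEON filter: diff is
 * mc_running_avg_y[c] - sig[c]; returns the signed adjustment. */
static int denoise_adjustment(int diff, int aggressive) {
  const int absdiff = abs(diff);
  const int level1 = aggressive ? 4 : 3;
  int adj;

  if (absdiff < 4) return diff;   /* level 0: take the averaged pixel */
  if (absdiff < 8)                /* level 1 */
    adj = level1;
  else if (absdiff < 16)          /* level 2: level 1 + delta of 1 */
    adj = level1 + 1;
  else                            /* level 3: level 2 + delta of 2 */
    adj = level1 + 3;
  return diff > 0 ? adj : -adj;
}

The filtered pixel is then sig[c] plus this adjustment, saturated to 0..255, which is exactly what the vqaddq_u8/vqsubq_u8 pair provides in the vector version.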
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.asm
index 5bda78678db..840cb33d957 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.asm
@@ -65,8 +65,10 @@
; unsigned char *pred, int pred_stride)
|vp8_subtract_mby_neon| PROC
push {r4-r7}
+ vpush {d8-d15}
+
mov r12, #4
- ldr r4, [sp, #16] ; pred_stride
+ ldr r4, [sp, #80] ; pred_stride
mov r6, #32 ; "diff" stride x2
add r5, r0, #16 ; second diff pointer
@@ -101,6 +103,7 @@ subtract_mby_loop
subs r12, r12, #1
bne subtract_mby_loop
+ vpop {d8-d15}
pop {r4-r7}
bx lr
ENDP
@@ -112,9 +115,11 @@ subtract_mby_loop
|vp8_subtract_mbuv_neon| PROC
push {r4-r7}
- ldr r4, [sp, #16] ; upred
- ldr r5, [sp, #20] ; vpred
- ldr r6, [sp, #24] ; pred_stride
+ vpush {d8-d15}
+
+ ldr r4, [sp, #80] ; upred
+ ldr r5, [sp, #84] ; vpred
+ ldr r6, [sp, #88] ; pred_stride
add r0, r0, #512 ; short *udiff = diff + 256;
mov r12, #32 ; "diff" stride x2
add r7, r0, #16 ; second diff pointer
@@ -191,6 +196,7 @@ subtract_mby_loop
vst1.16 {q14}, [r0], r12
vst1.16 {q15}, [r7], r12
+ vpop {d8-d15}
pop {r4-r7}
bx lr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
index 5b9f11e5935..d219e2d1424 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -21,6 +21,7 @@
;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
; int sz);
|vp8_memcpy_partial_neon| PROC
+ vpush {d8-d15}
;pld [r1] ;preload pred data
;pld [r1, #128]
;pld [r1, #256]
@@ -64,6 +65,7 @@ extra_copy_neon_loop
bne extra_copy_neon_loop
done_copy_neon_loop
+ vpop {d8-d15}
bx lr
ENDP
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
index 55edbf5129e..f82af3ee333 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -27,6 +27,8 @@
;from vp8_variance().
|vp8_mse16x16_neon| PROC
+ vpush {q7}
+
vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
vmov.i8 q8, #0
vmov.i8 q9, #0
@@ -62,7 +64,7 @@ mse16x16_neon_loop
vadd.u32 q7, q7, q8
vadd.u32 q9, q9, q10
- ldr r12, [sp] ;load *sse from stack
+ ldr r12, [sp, #16] ;load *sse from stack
vadd.u32 q10, q7, q9
vpaddl.u32 q1, q10
@@ -71,6 +73,7 @@ mse16x16_neon_loop
vst1.32 {d0[0]}, [r12]
vmov.32 r0, d0[0]
+ vpop {q7}
bx lr
ENDP
@@ -82,6 +85,8 @@ mse16x16_neon_loop
; r2 unsigned char *ref_ptr,
; r3 int recon_stride
|vp8_get4x4sse_cs_neon| PROC
+ vpush {q7}
+
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d1}, [r0], r1
@@ -109,6 +114,8 @@ mse16x16_neon_loop
vadd.u64 d0, d2, d3
vmov.32 r0, d0[0]
+
+ vpop {q7}
bx lr
ENDP
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h
index 455a94fbed3..eef2d79e087 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h
@@ -9,8 +9,12 @@
*/
-#ifndef __INC_BITSTREAM_H
-#define __INC_BITSTREAM_H
+#ifndef VP8_ENCODER_BITSTREAM_H_
+#define VP8_ENCODER_BITSTREAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
#if HAVE_EDSP
void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
@@ -43,4 +47,8 @@ void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount);
# define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b)
#endif
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_BITSTREAM_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h
index cf74c7aafc4..34879cf2ab4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h
@@ -9,8 +9,8 @@
*/
-#ifndef __INC_BLOCK_H
-#define __INC_BLOCK_H
+#ifndef VP8_ENCODER_BLOCK_H_
+#define VP8_ENCODER_BLOCK_H_
#include "vp8/common/onyx.h"
#include "vp8/common/blockd.h"
@@ -18,6 +18,10 @@
#include "vp8/common/entropy.h"
#include "vpx_ports/mem.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
#define MAX_MODES 20
#define MAX_ERROR_BINS 1024
@@ -121,6 +125,7 @@ typedef struct macroblock
int optimize;
int q_index;
+ int increase_denoising;
#if CONFIG_TEMPORAL_DENOISING
MB_PREDICTION_MODE best_sse_inter_mode;
@@ -160,4 +165,8 @@ typedef struct macroblock
} MACROBLOCK;
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_BLOCK_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/boolhuff.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/boolhuff.h
index 39ab586b52b..61142157593 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/boolhuff.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/boolhuff.h
@@ -16,12 +16,16 @@
* Description : Bool Coder header file.
*
****************************************************************************/
-#ifndef __INC_BOOLHUFF_H
-#define __INC_BOOLHUFF_H
+#ifndef VP8_ENCODER_BOOLHUFF_H_
+#define VP8_ENCODER_BOOLHUFF_H_
#include "vpx_ports/mem.h"
#include "vpx/internal/vpx_codec_internal.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
typedef struct
{
unsigned int lowvalue;
@@ -125,4 +129,8 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability)
br->range = range;
}
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_BOOLHUFF_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct_value_cost.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct_value_cost.h
index e892765c69f..1cd3eec84ac 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct_value_cost.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct_value_cost.h
@@ -8,6 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP8_ENCODER_DCT_VALUE_COST_H_
+#define VP8_ENCODER_DCT_VALUE_COST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/* Generated file, included by tokenize.c */
/* Values generated by fill_value_tokens() */
@@ -356,3 +363,9 @@ static const short dct_value_cost[2048*2] =
8134, 8140, 8148, 8170, 8178, 8184, 8192, 8202, 8210, 8216, 8224, 8243,
8251, 8257, 8265, 8275
};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_ENCODER_DCT_VALUE_COST_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct_value_tokens.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct_value_tokens.h
index ef08eeddc77..c2aadefca79 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct_value_tokens.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct_value_tokens.h
@@ -8,6 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP8_ENCODER_DCT_VALUE_TOKENS_H_
+#define VP8_ENCODER_DCT_VALUE_TOKENS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/* Generated file, included by tokenize.c */
/* Values generated by fill_value_tokens() */
@@ -697,3 +704,9 @@ static const TOKENVALUE dct_value_tokens[2048*2] =
{10, 3942}, {10, 3944}, {10, 3946}, {10, 3948}, {10, 3950}, {10, 3952},
{10, 3954}, {10, 3956}, {10, 3958}, {10, 3960}
};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_ENCODER_DCT_VALUE_TOKENS_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/defaultcoefcounts.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/defaultcoefcounts.h
index 2c0f3ddf3b3..1e8e80484ae 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/defaultcoefcounts.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/defaultcoefcounts.h
@@ -8,6 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
+#define VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/* Generated file, included by entropy.c */
static const unsigned int default_coef_counts[BLOCK_TYPES]
@@ -221,3 +228,9 @@ static const unsigned int default_coef_counts[BLOCK_TYPES]
},
},
};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c
index 7819265471e..1e645fbdfff 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c
@@ -21,6 +21,7 @@ static const unsigned int NOISE_MOTION_THRESHOLD = 25 * 25;
*/
static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
+static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 60;
/*
* The filter function was modified to reduce the computational complexity.
@@ -51,27 +52,32 @@ static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
* [16, 255] 6 7
*/
-int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
- YV12_BUFFER_CONFIG *running_avg, MACROBLOCK *signal,
- unsigned int motion_magnitude, int y_offset,
- int uv_offset)
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
+ unsigned char *running_avg_y, int avg_y_stride,
+ unsigned char *sig, int sig_stride,
+ unsigned int motion_magnitude,
+ int increase_denoising)
{
- unsigned char *sig = signal->thismb;
- int sig_stride = 16;
- unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
- int mc_avg_y_stride = mc_running_avg->y_stride;
- unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
- int avg_y_stride = running_avg->y_stride;
- int r, c, i;
+ unsigned char *running_avg_y_start = running_avg_y;
+ unsigned char *sig_start = sig;
+ int sum_diff_thresh;
+ int r, c;
int sum_diff = 0;
int adj_val[3] = {3, 4, 6};
-
+ int shift_inc1 = 0;
+ int shift_inc2 = 1;
/* If motion_magnitude is small, making the denoiser more aggressive by
- * increasing the adjustment for each level. */
+     * increasing the adjustment for each level. Add another increment for
+     * blocks that are labeled for increased denoising. */
if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
{
- for (i = 0; i < 3; i++)
- adj_val[i] += 1;
+ if (increase_denoising) {
+ shift_inc1 = 1;
+ shift_inc2 = 2;
+ }
+ adj_val[0] += shift_inc2;
+ adj_val[1] += shift_inc2;
+ adj_val[2] += shift_inc2;
}
for (r = 0; r < 16; ++r)
@@ -85,8 +91,9 @@ int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
diff = mc_running_avg_y[c] - sig[c];
absdiff = abs(diff);
- /* When |diff| < 4, use pixel value from last denoised raw. */
- if (absdiff <= 3)
+ // When |diff| <= |3 + shift_inc1|, use pixel value from
+ // last denoised raw.
+ if (absdiff <= 3 + shift_inc1)
{
running_avg_y[c] = mc_running_avg_y[c];
sum_diff += diff;
@@ -127,11 +134,12 @@ int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
running_avg_y += avg_y_stride;
}
- if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
+    sum_diff_thresh = SUM_DIFF_THRESHOLD;
+ if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+ if (abs(sum_diff) > sum_diff_thresh)
return COPY_BLOCK;
- vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
- signal->thismb, sig_stride);
+ vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
return FILTER_BLOCK;
}
@@ -192,7 +200,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
int mv_row;
int mv_col;
unsigned int motion_magnitude2;
-
+ unsigned int sse_thresh;
MV_REFERENCE_FRAME frame = x->best_reference_frame;
MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
@@ -277,7 +285,10 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
mv_row = x->best_sse_mv.as_mv.row;
mv_col = x->best_sse_mv.as_mv.col;
motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
- if (best_sse > SSE_THRESHOLD || motion_magnitude2
+ sse_thresh = SSE_THRESHOLD;
+ if (x->increase_denoising) sse_thresh = SSE_THRESHOLD_HIGH;
+
+ if (best_sse > sse_thresh || motion_magnitude2
> 8 * NOISE_MOTION_THRESHOLD)
{
decision = COPY_BLOCK;
@@ -285,12 +296,18 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
if (decision == FILTER_BLOCK)
{
+ unsigned char *mc_running_avg_y =
+ denoiser->yv12_mc_running_avg.y_buffer + recon_yoffset;
+ int mc_avg_y_stride = denoiser->yv12_mc_running_avg.y_stride;
+ unsigned char *running_avg_y =
+ denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset;
+ int avg_y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride;
+
/* Filter. */
- decision = vp8_denoiser_filter(&denoiser->yv12_mc_running_avg,
- &denoiser->yv12_running_avg[INTRA_FRAME],
- x,
- motion_magnitude2,
- recon_yoffset, recon_uvoffset);
+ decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride,
+ running_avg_y, avg_y_stride,
+ x->thismb, 16, motion_magnitude2,
+ x->increase_denoising);
}
if (decision == COPY_BLOCK)
{
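
The interface change decouples the filter kernel from YV12_BUFFER_CONFIG and MACROBLOCK: it now receives bare pointers and strides, so the C and NEON implementations can share one signature behind the vp8_denoiser_filter dispatch, with the caller resolving buffer offsets once. The new increase_denoising input simply selects the harsher constants. A condensed sketch of that selection, under the thresholds this patch defines in denoising.h:

#define SUM_DIFF_THRESHOLD       (16 * 16 * 2)
#define SUM_DIFF_THRESHOLD_HIGH  (16 * 16 * 3)

/* Illustrative: threshold selection as done inside vp8_denoiser_filter_c
 * after this patch. */
static int sum_diff_threshold(int increase_denoising) {
  return increase_denoising ? SUM_DIFF_THRESHOLD_HIGH : SUM_DIFF_THRESHOLD;
}

If the accumulated |sum_diff| exceeds this bound, the block is judged over-filtered and COPY_BLOCK is returned instead of FILTER_BLOCK.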
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h
index b025f5cdf01..ae744d2efc0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h
@@ -13,7 +13,12 @@
#include "block.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
+#define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3)
#define MOTION_MAGNITUDE_THRESHOLD (8*3)
enum vp8_denoiser_decision
@@ -39,4 +44,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
int recon_yoffset,
int recon_uvoffset);
-#endif /* VP8_ENCODER_DENOISING_H_ */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_ENCODER_DENOISING_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.h
index 4dd6ba0de18..e185c1035c0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.h
@@ -7,8 +7,12 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef ENCODEFRAME_H
-#define ENCODEFRAME_H
+#ifndef VP8_ENCODER_ENCODEFRAME_H_
+#define VP8_ENCODER_ENCODEFRAME_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);
extern void vp8_build_block_offsets(MACROBLOCK *x);
@@ -24,4 +28,8 @@ extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
TOKENEXTRA **t);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_ENCODEFRAME_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeintra.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeintra.h
index be2141f2c63..a8d0284d29b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeintra.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeintra.h
@@ -9,13 +9,21 @@
*/
-#ifndef _ENCODEINTRA_H_
-#define _ENCODEINTRA_H_
+#ifndef VP8_ENCODER_ENCODEINTRA_H_
+#define VP8_ENCODER_ENCODEINTRA_H_
#include "onyx_int.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred);
void vp8_encode_intra16x16mby(MACROBLOCK *x);
void vp8_encode_intra16x16mbuv(MACROBLOCK *x);
void vp8_encode_intra4x4mby(MACROBLOCK *mb);
void vp8_encode_intra4x4block(MACROBLOCK *x, int ib);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_ENCODEINTRA_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.h
index 6badf7d901a..0b3ec875e7a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.h
@@ -9,10 +9,14 @@
*/
-#ifndef __INC_ENCODEMB_H
-#define __INC_ENCODEMB_H
+#ifndef VP8_ENCODER_ENCODEMB_H_
+#define VP8_ENCODER_ENCODEMB_H_
#include "onyx_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
void vp8_encode_inter16x16(MACROBLOCK *x);
void vp8_build_dcblock(MACROBLOCK *b);
@@ -23,4 +27,8 @@ void vp8_transform_intra_mby(MACROBLOCK *x);
void vp8_optimize_mby(MACROBLOCK *x);
void vp8_optimize_mbuv(MACROBLOCK *x);
void vp8_encode_inter16x16y(MACROBLOCK *x);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_ENCODEMB_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemv.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemv.h
index a6116c133d0..722162ba212 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemv.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemv.h
@@ -9,13 +9,21 @@
*/
-#ifndef __INC_ENCODEMV_H
-#define __INC_ENCODEMV_H
+#ifndef VP8_ENCODER_ENCODEMV_H_
+#define VP8_ENCODER_ENCODEMV_H_
#include "onyx_int.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
void vp8_write_mvprobs(VP8_COMP *);
void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *);
void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_ENCODEMV_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c
index 968c7f3652e..98e5a7115db 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c
@@ -940,9 +940,9 @@ static int64_t estimate_modemvcost(VP8_COMP *cpi,
/* Crude estimate of overhead cost from modes
* << 9 is the normalization to (bits * 512) used in vp8_bits_per_mb
*/
- mode_cost =((((av_pct_inter - av_pct_motion) * zz_cost) +
- (av_pct_motion * motion_cost) +
- (av_intra * intra_cost)) * cpi->common.MBs) * 512;
+ mode_cost = (int64_t)((((av_pct_inter - av_pct_motion) * zz_cost) +
+ (av_pct_motion * motion_cost) +
+ (av_intra * intra_cost)) * cpi->common.MBs) * 512;
return mv_cost + mode_cost;
}
@@ -2310,7 +2310,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
pct_extra = (pct_extra > 20) ? 20 : pct_extra;
cpi->twopass.alt_extra_bits =
- (cpi->twopass.gf_group_bits * pct_extra) / 100;
+ (int)(cpi->twopass.gf_group_bits * pct_extra) / 100;
cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits;
cpi->twopass.alt_extra_bits /=
((cpi->baseline_gf_interval-1)>>1);
@@ -2386,7 +2386,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
target_frame_size = max_bits;
if (target_frame_size > cpi->twopass.gf_group_bits)
- target_frame_size = cpi->twopass.gf_group_bits;
+ target_frame_size = (int)cpi->twopass.gf_group_bits;
}
/* Adjust error and bits remaining */
@@ -2444,10 +2444,10 @@ void vp8_second_pass(VP8_COMP *cpi)
find_next_key_frame(cpi, &this_frame_copy);
/* Special case: Error error_resilient_mode mode does not make much
- * sense for two pass but with its current meaning but this code is
+ * sense for two pass but with its current meaning this code is
* designed to stop outlandish behaviour if someone does set it when
* using two pass. It effectively disables GF groups. This is
- * temporary code till we decide what should really happen in this
+ * temporary code until we decide what should really happen in this
* case.
*/
if (cpi->oxcf.error_resilient_mode)
@@ -2773,7 +2773,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
kf_group_intra_err += this_frame->intra_error;
kf_group_coded_err += this_frame->coded_error;
- /* load a the next frame's stats */
+ /* Load the next frame's stats. */
vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
input_stats(cpi, this_frame);
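
The firstpass.c changes are conversion hygiene around the int64_t bit-budget fields: the mode-cost product is converted to int64_t before the final scaling by 512, and the gf_group_bits comparisons get explicit (int) casts so the narrowing is deliberate rather than implicit. The general pattern, as a hedged sketch:

#include <stdint.h>

/* Illustrative: widen before the multiply so the scaling is performed in
 * 64-bit arithmetic instead of wrapping a 32-bit intermediate. */
static int64_t scale_cost(int cost_per_mb, int mbs) {
  return (int64_t)cost_per_mb * mbs * 512;
}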
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.h
index 95e1e5463c6..c409ebca8f1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.h
@@ -9,8 +9,12 @@
*/
-#if !defined __INC_FIRSTPASS_H
-#define __INC_FIRSTPASS_H
+#ifndef VP8_ENCODER_FIRSTPASS_H_
+#define VP8_ENCODER_FIRSTPASS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
extern void vp8_init_first_pass(VP8_COMP *cpi);
extern void vp8_first_pass(VP8_COMP *cpi);
@@ -21,4 +25,8 @@ extern void vp8_second_pass(VP8_COMP *cpi);
extern void vp8_end_second_pass(VP8_COMP *cpi);
extern size_t vp8_firstpass_stats_sz(unsigned int mb_count);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_FIRSTPASS_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/lookahead.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/lookahead.h
index cf56b75b7d5..cad68e639fb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/lookahead.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/lookahead.h
@@ -7,11 +7,15 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef LOOKAHEAD_H
-#define LOOKAHEAD_H
+#ifndef VP8_ENCODER_LOOKAHEAD_H_
+#define VP8_ENCODER_LOOKAHEAD_H_
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
struct lookahead_entry
{
YV12_BUFFER_CONFIG img;
@@ -106,4 +110,8 @@ unsigned int
vp8_lookahead_depth(struct lookahead_ctx *ctx);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_LOOKAHEAD_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c
index 0b11ea64a65..54abe76acd2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c
@@ -17,6 +17,7 @@
#include <limits.h>
#include <math.h>
#include "vp8/common/findnearmv.h"
+#include "vp8/common/common.h"
#ifdef VP8_ENTROPY_STATS
static int mv_ref_ct [31] [4] [2];
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.h
index e36c51543cb..f284f7c3807 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.h
@@ -9,12 +9,16 @@
*/
-#ifndef __INC_MCOMP_H
-#define __INC_MCOMP_H
+#ifndef VP8_ENCODER_MCOMP_H_
+#define VP8_ENCODER_MCOMP_H_
#include "block.h"
#include "vp8/common/variance.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
#ifdef VP8_ENTROPY_STATS
extern void init_mv_ref_counts();
extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
@@ -104,4 +108,8 @@ typedef int (*vp8_diamond_search_fn_t)
int_mv *center_mv
);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_MCOMP_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.h
index 99ef119d5dd..9281551c8d5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.h
@@ -9,9 +9,17 @@
*/
-#ifndef __INC_MODECOSTS_H
-#define __INC_MODECOSTS_H
+#ifndef VP8_ENCODER_MODECOSTS_H_
+#define VP8_ENCODER_MODECOSTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
void vp8_init_mode_costs(VP8_COMP *x);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_MODECOSTS_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mr_dissim.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mr_dissim.c
index 71218cca146..8d96445f53c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mr_dissim.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mr_dissim.c
@@ -15,6 +15,7 @@
#include "mr_dissim.h"
#include "vpx_mem/vpx_mem.h"
#include "rdopt.h"
+#include "vp8/common/common.h"
void vp8_cal_low_res_mb_cols(VP8_COMP *cpi)
{
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mr_dissim.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mr_dissim.h
index f8cb135d52c..5a59ce62a61 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mr_dissim.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mr_dissim.h
@@ -9,12 +9,20 @@
*/
-#ifndef __INC_MR_DISSIM_H
-#define __INC_MR_DISSIM_H
+#ifndef VP8_ENCODER_MR_DISSIM_H_
+#define VP8_ENCODER_MR_DISSIM_H_
#include "vpx_config.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi);
extern void vp8_cal_dissimilarity(VP8_COMP *cpi);
extern void vp8_store_drop_frame_info(VP8_COMP *cpi);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_MR_DISSIM_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
index 7c079759971..e95e44fd521 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
@@ -19,7 +19,7 @@
#include "vp8/common/alloccommon.h"
#include "mcomp.h"
#include "firstpass.h"
-#include "psnr.h"
+#include "vpx/internal/vpx_psnr.h"
#include "vpx_scale/vpx_scale.h"
#include "vp8/common/extend.h"
#include "ratectrl.h"
@@ -1401,6 +1401,7 @@ static void update_layer_contexts (VP8_COMP *cpi)
unsigned int i;
double prev_layer_framerate=0;
+ assert(oxcf->number_of_layers <= VPX_TS_MAX_LAYERS);
for (i=0; i<oxcf->number_of_layers; i++)
{
LAYER_CONTEXT *lc = &cpi->layer_context[i];
@@ -1623,6 +1624,12 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
cpi->oxcf.maximum_buffer_size =
rescale((int)cpi->oxcf.maximum_buffer_size,
cpi->oxcf.target_bandwidth, 1000);
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) {
+ cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+ cpi->buffer_level = cpi->bits_off_target;
+ }
/* Set up frame rate and related parameters rate control values. */
vp8_new_framerate(cpi, cpi->framerate);
@@ -1754,8 +1761,11 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
}
+#ifndef M_LOG2_E
#define M_LOG2_E 0.693147180559945309417
+#endif
#define log2f(x) (log (x) / (float) M_LOG2_E)
+
static void cal_mvsadcosts(int *mvsadcost[2])
{
int i = 1;
@@ -2164,10 +2174,12 @@ void vp8_remove_compressor(VP8_COMP **ptr)
8.0 / 1000.0 / time_encoded;
double samples = 3.0 / 2 * cpi->frames_in_layer[i] *
lst_yv12->y_width * lst_yv12->y_height;
- double total_psnr = vp8_mse2psnr(samples, 255.0,
- cpi->total_error2[i]);
- double total_psnr2 = vp8_mse2psnr(samples, 255.0,
- cpi->total_error2_p[i]);
+ double total_psnr =
+ vpx_sse_to_psnr(samples, 255.0,
+ cpi->total_error2[i]);
+ double total_psnr2 =
+ vpx_sse_to_psnr(samples, 255.0,
+ cpi->total_error2_p[i]);
double total_ssim = 100 * pow(cpi->sum_ssim[i] /
cpi->sum_weights[i], 8.0);
@@ -2184,9 +2196,9 @@ void vp8_remove_compressor(VP8_COMP **ptr)
{
double samples = 3.0 / 2 * cpi->count *
lst_yv12->y_width * lst_yv12->y_height;
- double total_psnr = vp8_mse2psnr(samples, 255.0,
- cpi->total_sq_error);
- double total_psnr2 = vp8_mse2psnr(samples, 255.0,
+ double total_psnr = vpx_sse_to_psnr(samples, 255.0,
+ cpi->total_sq_error);
+ double total_psnr2 = vpx_sse_to_psnr(samples, 255.0,
cpi->total_sq_error2);
double total_ssim = 100 * pow(cpi->summed_quality /
cpi->summed_weights, 8.0);
@@ -2516,8 +2528,8 @@ static void generate_psnr_packet(VP8_COMP *cpi)
pkt.data.psnr.samples[3] = width * height;
for (i = 0; i < 4; i++)
- pkt.data.psnr.psnr[i] = vp8_mse2psnr(pkt.data.psnr.samples[i], 255.0,
- (double)(pkt.data.psnr.sse[i]));
+ pkt.data.psnr.psnr[i] = vpx_sse_to_psnr(pkt.data.psnr.samples[i], 255.0,
+ (double)(pkt.data.psnr.sse[i]));
vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
}
@@ -2675,8 +2687,8 @@ static int resize_key_frame(VP8_COMP *cpi)
VP8_COMMON *cm = &cpi->common;
/* Do we need to apply resampling for one pass cbr.
- * In one pass this is more limited than in two pass cbr
- * The test and any change is only made one per key frame sequence
+ * In one pass this is more limited than in two pass cbr.
+ * The test and any change is only made once per key frame sequence.
*/
if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER))
{
@@ -2699,7 +2711,7 @@ static int resize_key_frame(VP8_COMP *cpi)
cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
}
- /* Get the new hieght and width */
+ /* Get the new height and width */
Scale2Ratio(cm->horiz_scale, &hr, &hs);
Scale2Ratio(cm->vert_scale, &vr, &vs);
new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
@@ -3574,7 +3586,8 @@ static void encode_frame_to_data_rate
for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
{
LAYER_CONTEXT *lc = &cpi->layer_context[i];
- lc->bits_off_target += cpi->av_per_frame_bandwidth;
+ lc->bits_off_target += (int)(lc->target_bandwidth /
+ lc->framerate);
if (lc->bits_off_target > lc->maximum_buffer_size)
lc->bits_off_target = lc->maximum_buffer_size;
lc->buffer_level = lc->bits_off_target;
@@ -3807,7 +3820,7 @@ static void encode_frame_to_data_rate
/* Setup background Q adjustment for error resilient mode.
* For multi-layer encodes only enable this for the base layer.
- */
+ */
if (cpi->cyclic_refresh_mode_enabled)
{
if (cpi->current_layer==0)
@@ -4620,45 +4633,43 @@ static void encode_frame_to_data_rate
vp8_clear_system_state();
if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
- "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
- "%10.3f %8d\n",
+ fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64
+ "%10"PRId64" %10d %6d %6d %6d %6d %5d %5d %5d %8d "
+ "%8.2lf %"PRId64" %10.3lf %10"PRId64" %8d\n",
cpi->common.current_video_frame, cpi->this_frame_target,
cpi->projected_frame_size,
(cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
+ cpi->total_target_vs_actual,
cpi->buffer_level,
(cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
- (int)cpi->total_actual_bits, cm->base_qindex,
+ cpi->total_actual_bits, cm->base_qindex,
cpi->active_best_quality, cpi->active_worst_quality,
cpi->ni_av_qi, cpi->cq_target_quality,
- cpi->zbin_over_quant,
cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
cm->frame_type, cpi->gfu_boost,
cpi->twopass.est_max_qcorrection_factor,
- (int)cpi->twopass.bits_left,
+ cpi->twopass.bits_left,
cpi->twopass.total_left_stats.coded_error,
(double)cpi->twopass.bits_left /
cpi->twopass.total_left_stats.coded_error,
cpi->tot_recode_hits);
else
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
- "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
- "%8d\n",
- cpi->common.current_video_frame,
- cpi->this_frame_target, cpi->projected_frame_size,
+ fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64
+ "%10"PRId64" %10d %6d %6d %6d %6d %5d %5d %5d %8d "
+ "%8.2lf %"PRId64" %10.3lf %8d\n",
+ cpi->common.current_video_frame, cpi->this_frame_target,
+ cpi->projected_frame_size,
(cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
+ cpi->total_target_vs_actual,
cpi->buffer_level,
(cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
- (int)cpi->total_actual_bits, cm->base_qindex,
+ cpi->total_actual_bits, cm->base_qindex,
cpi->active_best_quality, cpi->active_worst_quality,
cpi->ni_av_qi, cpi->cq_target_quality,
- cpi->zbin_over_quant,
cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
cm->frame_type, cpi->gfu_boost,
cpi->twopass.est_max_qcorrection_factor,
- (int)cpi->twopass.bits_left,
+ cpi->twopass.bits_left,
cpi->twopass.total_left_stats.coded_error,
cpi->tot_recode_hits);
@@ -4666,7 +4677,6 @@ static void encode_frame_to_data_rate
{
FILE *fmodes = fopen("Modes.stt", "a");
- int i;
fprintf(fmodes, "%6d:%1d:%1d:%1d ",
cpi->common.current_video_frame,
@@ -4810,33 +4820,11 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
}
#endif
-/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
-#if HAVE_NEON
-extern void vp8_push_neon(int64_t *store);
-extern void vp8_pop_neon(int64_t *store);
-#endif
-
-
int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time)
{
-#if HAVE_NEON
- int64_t store_reg[8];
-#if CONFIG_RUNTIME_CPU_DETECT
- VP8_COMMON *cm = &cpi->common;
-#endif
-#endif
struct vpx_usec_timer timer;
int res = 0;
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->cpu_caps & HAS_NEON)
-#endif
- {
- vp8_push_neon(store_reg);
- }
-#endif
-
vpx_usec_timer_start(&timer);
/* Reinit the lookahead buffer if the frame size changes */
@@ -4853,15 +4841,6 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C
vpx_usec_timer_mark(&timer);
cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->cpu_caps & HAS_NEON)
-#endif
- {
- vp8_pop_neon(store_reg);
- }
-#endif
-
return res;
}
@@ -4882,9 +4861,6 @@ static int frame_is_reference(const VP8_COMP *cpi)
int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush)
{
-#if HAVE_NEON
- int64_t store_reg[8];
-#endif
VP8_COMMON *cm;
struct vpx_usec_timer tsctimer;
struct vpx_usec_timer ticktimer;
@@ -4904,15 +4880,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
cpi->common.error.setjmp = 1;
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->cpu_caps & HAS_NEON)
-#endif
- {
- vp8_push_neon(store_reg);
- }
-#endif
-
vpx_usec_timer_start(&cmptimer);
cpi->source = NULL;
@@ -4995,14 +4962,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
#endif
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->cpu_caps & HAS_NEON)
-#endif
- {
- vp8_pop_neon(store_reg);
- }
-#endif
return -1;
}
@@ -5065,6 +5024,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
unsigned int i;
/* Update frame rates for each layer */
+ assert(cpi->oxcf.number_of_layers <= VPX_TS_MAX_LAYERS);
for (i=0; i<cpi->oxcf.number_of_layers; i++)
{
LAYER_CONTEXT *lc = &cpi->layer_context[i];
@@ -5267,7 +5227,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
int y_samples = orig->y_height * orig->y_width ;
int uv_samples = orig->uv_height * orig->uv_width ;
int t_samples = y_samples + 2 * uv_samples;
- double sq_error, sq_error2;
+ double sq_error;
ye = calc_plane_error(orig->y_buffer, orig->y_stride,
recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height);
@@ -5280,16 +5240,17 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
sq_error = (double)(ye + ue + ve);
- frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error);
+ frame_psnr = vpx_sse_to_psnr(t_samples, 255.0, sq_error);
- cpi->total_y += vp8_mse2psnr(y_samples, 255.0, (double)ye);
- cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, (double)ue);
- cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, (double)ve);
+ cpi->total_y += vpx_sse_to_psnr(y_samples, 255.0, (double)ye);
+ cpi->total_u += vpx_sse_to_psnr(uv_samples, 255.0, (double)ue);
+ cpi->total_v += vpx_sse_to_psnr(uv_samples, 255.0, (double)ve);
cpi->total_sq_error += sq_error;
cpi->total += frame_psnr;
#if CONFIG_POSTPROC
{
YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
+ double sq_error2;
double frame_psnr2, frame_ssim2 = 0;
double weight = 0;
@@ -5307,14 +5268,14 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
sq_error2 = (double)(ye + ue + ve);
- frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error2);
+ frame_psnr2 = vpx_sse_to_psnr(t_samples, 255.0, sq_error2);
- cpi->totalp_y += vp8_mse2psnr(y_samples,
- 255.0, (double)ye);
- cpi->totalp_u += vp8_mse2psnr(uv_samples,
- 255.0, (double)ue);
- cpi->totalp_v += vp8_mse2psnr(uv_samples,
- 255.0, (double)ve);
+ cpi->totalp_y += vpx_sse_to_psnr(y_samples,
+ 255.0, (double)ye);
+ cpi->totalp_u += vpx_sse_to_psnr(uv_samples,
+ 255.0, (double)ue);
+ cpi->totalp_v += vpx_sse_to_psnr(uv_samples,
+ 255.0, (double)ve);
cpi->total_sq_error2 += sq_error2;
cpi->totalp += frame_psnr2;
@@ -5405,15 +5366,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
#endif
#endif
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->cpu_caps & HAS_NEON)
-#endif
- {
- vp8_pop_neon(store_reg);
- }
-#endif
-
cpi->common.error.setjmp = 0;
return 0;
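
Besides dropping the NEON push/pop workaround and the removed zbin_over_quant field, the stats logging above now prints the int64_t counters (buffer_level, total_actual_bits, bits_left, ...) directly via the inttypes.h PRId64 macros instead of truncating them through (int) casts. A minimal sketch of that idiom:

#include <inttypes.h>
#include <stdio.h>

int main(void) {
  int64_t bits_left = 123456789012345LL;  /* made-up value */
  /* PRId64 expands to the right conversion specifier per platform
   * ("lld" on 32-bit targets, "ld" on LP64, ...). */
  printf("%10" PRId64 "\n", bits_left);
  return 0;
}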
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
index 3ab0fe8bfa1..df17dff3478 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
@@ -9,8 +9,8 @@
*/
-#ifndef __INC_VP8_INT_H
-#define __INC_VP8_INT_H
+#ifndef VP8_ENCODER_ONYX_INT_H_
+#define VP8_ENCODER_ONYX_INT_H_
#include <stdio.h>
#include "vpx_config.h"
@@ -33,6 +33,10 @@
#include "vp8/encoder/denoising.h"
#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+
#define MIN_GF_INTERVAL 4
#define DEFAULT_GF_INTERVAL 7
@@ -57,9 +61,6 @@
#define VP8_TEMPORAL_ALT_REF 1
#endif
-#define MAX(x,y) (((x)>(y))?(x):(y))
-#define MIN(x,y) (((x)<(y))?(x):(y))
-
typedef struct
{
int kf_indicated;
@@ -721,4 +722,8 @@ void vp8_set_speed_features(VP8_COMP *cpi);
"Failed to allocate "#lval);\
} while(0)
#endif
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_ONYX_INT_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c
index c5279fed2a8..cf6a82f5a8f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c
@@ -14,6 +14,7 @@
#include "onyx_int.h"
#include "modecosts.h"
#include "encodeintra.h"
+#include "vp8/common/common.h"
#include "vp8/common/entropymode.h"
#include "pickinter.h"
#include "vp8/common/findnearmv.h"
@@ -1176,6 +1177,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
x->best_reference_frame = best_mbmode.ref_frame;
best_sse = best_rd_sse;
}
+ x->increase_denoising = 0;
vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
recon_yoffset, recon_uvoffset);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.h
index 35011cab358..cf3b1f8d49c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.h
@@ -9,11 +9,15 @@
*/
-#ifndef __INC_PICKINTER_H
-#define __INC_PICKINTER_H
+#ifndef VP8_ENCODER_PICKINTER_H_
+#define VP8_ENCODER_PICKINTER_H_
#include "vpx_config.h"
#include "vp8/common/onyxc_int.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int recon_uvoffset, int *returnrate,
int *returndistortion, int *returnintra,
@@ -24,4 +28,8 @@ extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
const vp8_variance_fn_ptr_t *vfp,
unsigned int *sse,
int_mv this_mv);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_PICKINTER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/psnr.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/psnr.c
deleted file mode 100644
index b3a3d955290..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/psnr.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_scale/yv12config.h"
-#include "math.h"
-#include "vp8/common/systemdependent.h" /* for vp8_clear_system_state() */
-
-#define MAX_PSNR 100
-
-double vp8_mse2psnr(double Samples, double Peak, double Mse)
-{
- double psnr;
-
- if ((double)Mse > 0.0)
- psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
- else
- psnr = MAX_PSNR; /* Limit to prevent / 0 */
-
- if (psnr > MAX_PSNR)
- psnr = MAX_PSNR;
-
- return psnr;
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/psnr.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/psnr.h
deleted file mode 100644
index 7f6269abe4c..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/psnr.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_PSNR_H
-#define __INC_PSNR_H
-
-extern double vp8_mse2psnr(double Samples, double Peak, double Mse);
-
-#endif
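
With psnr.c/psnr.h gone, callers use the shared vpx_sse_to_psnr() from vpx/internal/vpx_psnr.h, which has the same semantics the deleted function shows: PSNR = 10 * log10(samples * peak^2 / sse), clamped to 100 dB so a perfect reconstruction (sse == 0) does not divide by zero. A sketch equivalent to the removed code:

#include <math.h>

#define MAX_PSNR 100.0

/* Illustrative re-statement of the deleted vp8_mse2psnr(); the callers in
 * this patch now use vpx_sse_to_psnr(samples, peak, sse) instead. */
static double sse_to_psnr(double samples, double peak, double sse) {
  if (sse > 0.0) {
    const double psnr = 10.0 * log10(samples * peak * peak / sse);
    return psnr > MAX_PSNR ? MAX_PSNR : psnr;
  }
  return MAX_PSNR;
}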
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h
index d55496c5ffe..c739b2627b6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h
@@ -9,8 +9,12 @@
*/
-#ifndef __INC_QUANTIZE_H
-#define __INC_QUANTIZE_H
+#ifndef VP8_ENCODER_QUANTIZE_H_
+#define VP8_ENCODER_QUANTIZE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
struct VP8_COMP;
struct macroblock;
@@ -20,4 +24,8 @@ extern void vp8_update_zbin_extra(struct VP8_COMP *cpi, struct macroblock *x);
extern void vp8cx_mb_init_quantizer(struct VP8_COMP *cpi, struct macroblock *x, int ok_to_skip);
extern void vp8cx_init_quantizer(struct VP8_COMP *cpi);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_QUANTIZE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c
index 1e8259cf488..c51650c3c26 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c
@@ -174,14 +174,6 @@ static const int kf_gf_boost_qlimits[QINDEX_RANGE] =
600, 600, 600, 600, 600, 600, 600, 600,
};
-/* % adjustment to target kf size based on seperation from previous frame */
-static const int kf_boost_seperation_adjustment[16] =
-{
- 30, 40, 50, 55, 60, 65, 70, 75,
- 80, 85, 90, 95, 100, 100, 100, 100,
-};
-
-
static const int gf_adjust_table[101] =
{
100,
@@ -956,6 +948,21 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
cpi->bits_off_target = (int)cpi->oxcf.maximum_buffer_size;
cpi->buffer_level = cpi->bits_off_target;
+
+ if (cpi->oxcf.number_of_layers > 1) {
+ unsigned int i;
+
+ // Propagate bits saved by dropping the frame to higher layers.
+ for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers;
+ i++) {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+ lc->bits_off_target += (int)(lc->target_bandwidth /
+ lc->framerate);
+ if (lc->bits_off_target > lc->maximum_buffer_size)
+ lc->bits_off_target = lc->maximum_buffer_size;
+ lc->buffer_level = lc->bits_off_target;
+ }
+ }
}
}
@@ -1223,7 +1230,6 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
{
Q = cpi->oxcf.gold_q;
}
-
}
else
{
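
The new block in calc_pframe_target_size credits a dropped frame's bits to every higher temporal layer: each layer's per-frame budget is derived from its own bandwidth and framerate, added to its running balance, and clipped to its buffer size (the same arithmetic the onyx_if.c hunk above switches to, replacing the base layer's av_per_frame_bandwidth). A self-contained sketch with a stand-in layer struct:

/* Illustrative propagation of a dropped frame's bits to one layer;
 * LayerCtx is a stand-in for the real LAYER_CONTEXT. */
typedef struct {
  double target_bandwidth;    /* bits per second for this layer */
  double framerate;
  int bits_off_target;        /* running buffer balance, in bits */
  int maximum_buffer_size;
  int buffer_level;
} LayerCtx;

static void credit_dropped_frame(LayerCtx *lc) {
  lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate);
  if (lc->bits_off_target > lc->maximum_buffer_size)
    lc->bits_off_target = lc->maximum_buffer_size;
  lc->buffer_level = lc->bits_off_target;
}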
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.h
index c43f08d6dd2..829697f391f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.h
@@ -9,10 +9,15 @@
*/
-#if !defined __INC_RATECTRL_H
+#ifndef VP8_ENCODER_RATECTRL_H_
+#define VP8_ENCODER_RATECTRL_H_
#include "onyx_int.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
extern void vp8_save_coding_context(VP8_COMP *cpi);
extern void vp8_restore_coding_context(VP8_COMP *cpi);
@@ -25,4 +30,8 @@ extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_
/* return of 0 means drop frame */
extern int vp8_pick_frame_size(VP8_COMP *cpi);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_RATECTRL_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c
index 5016cc4220f..387701c5733 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c
@@ -528,19 +528,16 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a,
VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-# define QC( I) ( qcoeff_ptr [vp8_default_zig_zag1d[I]] )
-
+ assert(eob <= 16);
for (; c < eob; c++)
{
- int v = QC(c);
- int t = vp8_dct_value_tokens_ptr[v].Token;
+ const int v = qcoeff_ptr[vp8_default_zig_zag1d[c]];
+ const int t = vp8_dct_value_tokens_ptr[v].Token;
cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [t];
cost += vp8_dct_value_cost_ptr[v];
pt = vp8_prev_token_class[t];
}
-# undef QC
-
if (c < 16)
cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN];
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.h
index 1e11fa77dfc..fe21b8e283e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.h
@@ -9,8 +9,12 @@
*/
-#ifndef __INC_RDOPT_H
-#define __INC_RDOPT_H
+#ifndef VP8_ENCODER_RDOPT_H_
+#define VP8_ENCODER_RDOPT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
@@ -130,4 +134,8 @@ extern void vp8_mv_pred
);
void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_RDOPT_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.h
index 12815b087f4..6b5500594e6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.h
@@ -8,9 +8,21 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP8_ENCODER_SEGMENTATION_H_
+#define VP8_ENCODER_SEGMENTATION_H_
#include "string.h"
#include "vp8/common/blockd.h"
#include "onyx_int.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_ENCODER_SEGMENTATION_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c
index 7e3af71ecba..4dc0d959221 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c
@@ -16,7 +16,6 @@
#include "vp8/common/alloccommon.h"
#include "mcomp.h"
#include "firstpass.h"
-#include "psnr.h"
#include "vpx_scale/vpx_scale.h"
#include "vp8/common/extend.h"
#include "ratectrl.h"
@@ -99,6 +98,7 @@ void vp8_temporal_filter_apply_c
unsigned int i, j, k;
int modifier;
int byte = 0;
+ const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
for (i = 0,k = 0; i < block_size; i++)
{
@@ -115,7 +115,7 @@ void vp8_temporal_filter_apply_c
*/
modifier *= modifier;
modifier *= 3;
- modifier += 1 << (strength - 1);
+ modifier += rounding;
modifier >>= strength;
if (modifier > 16)
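
Hoisting the rounding term out of the per-pixel loop is not just a micro-optimization: with strength == 0, the old expression 1 << (strength - 1) shifted by -1, which is undefined behaviour in C. A condensed sketch of the per-pixel weight computation with the fix applied (the final 16 - modifier and filter_weight steps follow the surrounding libvpx code, not this hunk):

static int filter_modifier(int diff, int strength, int filter_weight) {
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
  int modifier = diff * diff * 3;  /* 3 * diff^2, as in the loop body */
  modifier += rounding;            /* round to nearest on the shift below */
  modifier >>= strength;
  if (modifier > 16)
    modifier = 16;
  modifier = 16 - modifier;        /* strong match => large blend weight */
  return modifier * filter_weight;
}
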
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c
index 11559a72083..2dc8205278b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c
@@ -213,6 +213,7 @@ static void tokenize1st_order_b
/* Luma */
for (block = 0; block < 16; block++, b++)
{
+ const int eob = *b->eob;
tmp1 = vp8_block2above[block];
tmp2 = vp8_block2left[block];
qcoeff_ptr = b->qcoeff;
@@ -223,7 +224,7 @@ static void tokenize1st_order_b
c = type ? 0 : 1;
- if(c >= *b->eob)
+ if(c >= eob)
{
/* c = band for this case */
t->Token = DCT_EOB_TOKEN;
@@ -250,7 +251,8 @@ static void tokenize1st_order_b
t++;
c++;
- for (; c < *b->eob; c++)
+ assert(eob <= 16);
+ for (; c < eob; c++)
{
rc = vp8_default_zig_zag1d[c];
band = vp8_coef_bands[c];
@@ -286,6 +288,7 @@ static void tokenize1st_order_b
/* Chroma */
for (block = 16; block < 24; block++, b++)
{
+ const int eob = *b->eob;
tmp1 = vp8_block2above[block];
tmp2 = vp8_block2left[block];
qcoeff_ptr = b->qcoeff;
@@ -294,7 +297,7 @@ static void tokenize1st_order_b
VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
- if(!(*b->eob))
+ if(!eob)
{
/* c = band for this case */
t->Token = DCT_EOB_TOKEN;
@@ -321,7 +324,8 @@ static void tokenize1st_order_b
t++;
c = 1;
- for (; c < *b->eob; c++)
+ assert(eob <= 16);
+ for (; c < eob; c++)
{
rc = vp8_default_zig_zag1d[c];
band = vp8_coef_bands[c];
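
Both the luma and chroma loops now read *b->eob once into a const local: the value is loop-invariant, a single load is easier for the compiler to keep in a register, and the added assert bounds the zig-zag index. A self-contained sketch of the pattern (BLOCKD_SKETCH is a hypothetical stand-in for libvpx's BLOCKD):

#include <assert.h>

typedef struct {
  char *eob;           /* end-of-block count, written elsewhere */
  const short *qcoeff; /* 16 quantized coefficients, raster order */
} BLOCKD_SKETCH;

static int count_nonzero(const BLOCKD_SKETCH *b) {
  static const int zig_zag[16] = { 0, 1, 4, 8,  5, 2,  3,  6,
                                   9, 12, 13, 10, 7, 11, 14, 15 };
  const int eob = *b->eob;  /* read once; loop-invariant */
  int c, n = 0;
  assert(eob <= 16);        /* zig_zag has 16 entries */
  for (c = 0; c < eob; c++)
    n += (b->qcoeff[zig_zag[c]] != 0);
  return n;
}
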
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.h
index 1e6cea11465..b73a9ee1c89 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.h
@@ -9,12 +9,16 @@
*/
-#ifndef tokenize_h
-#define tokenize_h
+#ifndef VP8_ENCODER_TOKENIZE_H_
+#define VP8_ENCODER_TOKENIZE_H_
#include "vp8/common/entropy.h"
#include "block.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
void vp8_tokenize_initialize();
typedef struct
@@ -47,4 +51,8 @@ extern const short *const vp8_dct_value_cost_ptr;
*/
extern const TOKENVALUE *const vp8_dct_value_tokens_ptr;
-#endif /* tokenize_h */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_ENCODER_TOKENIZE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/treewriter.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/treewriter.h
index 48574f33cba..cfb2730ab3b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/treewriter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/treewriter.h
@@ -9,8 +9,8 @@
*/
-#ifndef __INC_TREEWRITER_H
-#define __INC_TREEWRITER_H
+#ifndef VP8_ENCODER_TREEWRITER_H_
+#define VP8_ENCODER_TREEWRITER_H_
/* Trees map alphabets into huffman-like codes suitable for an arithmetic
bit coder. Timothy S Murphy 11 October 2004 */
@@ -19,6 +19,10 @@
#include "boolhuff.h" /* for now */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
typedef BOOL_CODER vp8_writer;
#define vp8_write vp8_encode_bool
@@ -123,4 +127,8 @@ void vp8_cost_tokens2(
int *Costs, const vp8_prob *, vp8_tree, int
);
+#ifdef __cplusplus
+} // extern "C"
#endif
+
+#endif // VP8_ENCODER_TREEWRITER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c
index cceb8263f24..5112f891e91 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c
@@ -22,26 +22,28 @@ union sum_union {
signed char e[16];
};
-int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg,
- YV12_BUFFER_CONFIG *running_avg,
- MACROBLOCK *signal, unsigned int motion_magnitude,
- int y_offset, int uv_offset)
+int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
+ int mc_avg_y_stride,
+ unsigned char *running_avg_y, int avg_y_stride,
+ unsigned char *sig, int sig_stride,
+ unsigned int motion_magnitude,
+ int increase_denoising)
{
- unsigned char *sig = signal->thismb;
- int sig_stride = 16;
- unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
- int mc_avg_y_stride = mc_running_avg->y_stride;
- unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
- int avg_y_stride = running_avg->y_stride;
+ unsigned char *running_avg_y_start = running_avg_y;
+ unsigned char *sig_start = sig;
+ int sum_diff_thresh;
int r;
+ int shift_inc = (increase_denoising &&
+ motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
__m128i acc_diff = _mm_setzero_si128();
const __m128i k_0 = _mm_setzero_si128();
- const __m128i k_4 = _mm_set1_epi8(4);
+ const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
const __m128i k_8 = _mm_set1_epi8(8);
const __m128i k_16 = _mm_set1_epi8(16);
/* Modify each level's adjustment according to motion_magnitude. */
const __m128i l3 = _mm_set1_epi8(
- (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 : 6);
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+ 7 + shift_inc : 6);
/* Difference between level 3 and level 2 is 2. */
const __m128i l32 = _mm_set1_epi8(2);
/* Difference between level 2 and level 1 is 1. */
@@ -108,13 +110,14 @@ int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg,
+ s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
+ s.e[12] + s.e[13] + s.e[14] + s.e[15];
- if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
+ sum_diff_thresh = SUM_DIFF_THRESHOLD;
+ if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+ if (abs(sum_diff) > sum_diff_thresh)
{
return COPY_BLOCK;
}
}
- vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
- signal->thismb, sig_stride);
+ vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
return FILTER_BLOCK;
}
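
The filter now takes raw pointers and strides instead of whole YV12/MACROBLOCK structs, so callers can point it at any plane, and the new increase_denoising flag both widens the level-3 adjustment and raises the sum-of-differences bailout threshold. A scalar sketch of the parameter selection; the *_SKETCH threshold values are illustrative stand-ins, not quoted from denoising.h:

/* Hypothetical threshold values for illustration only. */
#define MOTION_MAGNITUDE_THRESHOLD_SKETCH (8 * 3)
#define SUM_DIFF_THRESHOLD_SKETCH 512
#define SUM_DIFF_THRESHOLD_HIGH_SKETCH 600

static void pick_denoiser_params(unsigned int motion_magnitude,
                                 int increase_denoising,
                                 int *adj_level3, int *sum_diff_thresh) {
  const int shift_inc =
      (increase_denoising &&
       motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_SKETCH) ? 1 : 0;
  *adj_level3 = (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_SKETCH)
                    ? 7 + shift_inc : 6;
  *sum_diff_thresh = increase_denoising ? SUM_DIFF_THRESHOLD_HIGH_SKETCH
                                        : SUM_DIFF_THRESHOLD_SKETCH;
}
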
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_ssse3.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_ssse3.asm
deleted file mode 100644
index 7b1dc119f08..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_ssse3.asm
+++ /dev/null
@@ -1,138 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_fast_quantize_b_ssse3 | arg
-; (BLOCK *b, | 0
-; BLOCKD *d) | 1
-;
-
-global sym(vp8_fast_quantize_b_ssse3) PRIVATE
-sym(vp8_fast_quantize_b_ssse3):
- push rbp
- mov rbp, rsp
- GET_GOT rbx
-
-%if ABI_IS_32BIT
- push rdi
- push rsi
-%else
- %if LIBVPX_YASM_WIN64
- push rdi
- push rsi
- %endif
-%endif
- ; end prolog
-
-%if ABI_IS_32BIT
- mov rdi, arg(0) ; BLOCK *b
- mov rsi, arg(1) ; BLOCKD *d
-%else
- %if LIBVPX_YASM_WIN64
- mov rdi, rcx ; BLOCK *b
- mov rsi, rdx ; BLOCKD *d
- %else
- ;mov rdi, rdi ; BLOCK *b
- ;mov rsi, rsi ; BLOCKD *d
- %endif
-%endif
-
- mov rax, [rdi + vp8_block_coeff]
- mov rcx, [rdi + vp8_block_round]
- mov rdx, [rdi + vp8_block_quant_fast]
-
- ; coeff
- movdqa xmm0, [rax]
- movdqa xmm4, [rax + 16]
-
- ; round
- movdqa xmm2, [rcx]
- movdqa xmm3, [rcx + 16]
-
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
-
- ; sz = z >> 15
- psraw xmm0, 15
- psraw xmm4, 15
-
- pabsw xmm1, xmm1
- pabsw xmm5, xmm5
-
- paddw xmm1, xmm2
- paddw xmm5, xmm3
-
- ; quant_fast
- pmulhw xmm1, [rdx]
- pmulhw xmm5, [rdx + 16]
-
- mov rax, [rsi + vp8_blockd_qcoeff]
- mov rdi, [rsi + vp8_blockd_dequant]
- mov rcx, [rsi + vp8_blockd_dqcoeff]
-
- movdqa xmm2, xmm1 ;store y for getting eob
- movdqa xmm3, xmm5
-
- pxor xmm1, xmm0
- pxor xmm5, xmm4
- psubw xmm1, xmm0
- psubw xmm5, xmm4
-
- movdqa [rax], xmm1
- movdqa [rax + 16], xmm5
-
- movdqa xmm0, [rdi]
- movdqa xmm4, [rdi + 16]
-
- pmullw xmm0, xmm1
- pmullw xmm4, xmm5
- pxor xmm1, xmm1
-
- pcmpgtw xmm2, xmm1 ;calculate eob
- pcmpgtw xmm3, xmm1
- packsswb xmm2, xmm3
- pshufb xmm2, [GLOBAL(zz_shuf)]
-
- pmovmskb edx, xmm2
-
- movdqa [rcx], xmm0 ;store dqcoeff
- movdqa [rcx + 16], xmm4 ;store dqcoeff
- mov rcx, [rsi + vp8_blockd_eob]
-
- bsr eax, edx ;count 0
- add eax, 1
-
- cmp edx, 0 ;if all 0, eob=0
- cmove eax, edx
-
- mov BYTE PTR [rcx], al ;store eob
-
- ; begin epilog
-%if ABI_IS_32BIT
- pop rsi
- pop rdi
-%else
- %if LIBVPX_YASM_WIN64
- pop rsi
- pop rdi
- %endif
-%endif
-
- RESTORE_GOT
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-zz_shuf:
- db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_ssse3.c
new file mode 100644
index 00000000000..448217ff412
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_ssse3.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h> /* SSSE3 */
+
+#include "vp8/encoder/block.h"
+
+/* bitscan reverse (bsr) */
+#if defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+static int bsr(int mask) {
+  unsigned long eob;  /* _BitScanReverse writes to an unsigned long */
+  _BitScanReverse(&eob, mask);
+  eob++;
+  if (mask == 0)
+    eob = 0;
+  return (int)eob;
+}
+#else
+static int bsr(int mask) {
+ int eob;
+#if defined(__GNUC__) && __GNUC__
+ __asm__ __volatile__("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags");
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+ asm volatile("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags");
+#endif
+ eob++;
+ if (mask == 0)
+ eob = 0;
+ return eob;
+}
+#endif
+
+void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
+ int eob, mask;
+
+ __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+ __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+ __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+ __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+ __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
+ __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
+ __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+ __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+
+ __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1;
+
+ DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) =
+ { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
+ __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask);
+
+ /* sign of z: z >> 15 */
+ sz0 = _mm_srai_epi16(z0, 15);
+ sz1 = _mm_srai_epi16(z1, 15);
+
+ /* x = abs(z) */
+ x0 = _mm_abs_epi16(z0);
+ x1 = _mm_abs_epi16(z1);
+
+ /* x += round */
+ x0 = _mm_add_epi16(x0, round0);
+ x1 = _mm_add_epi16(x1, round1);
+
+ /* y = (x * quant) >> 16 */
+ y0 = _mm_mulhi_epi16(x0, quant_fast0);
+ y1 = _mm_mulhi_epi16(x1, quant_fast1);
+
+  /* The assembly version saved y (the result before the sign is restored)
+   * for the EOB scan. Reusing that copy is safe: restoring the sign never
+   * turns a nonzero value into zero, and a zero stays zero after the
+   * dequant multiply. */
+ abs0 = y0;
+ abs1 = y1;
+
+ /* Restore the sign bit. */
+ y0 = _mm_xor_si128(y0, sz0);
+ y1 = _mm_xor_si128(y1, sz1);
+ x0 = _mm_sub_epi16(y0, sz0);
+ x1 = _mm_sub_epi16(y1, sz1);
+
+ /* qcoeff = x */
+ _mm_store_si128((__m128i *)(d->qcoeff), x0);
+ _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
+
+ /* x * dequant */
+ x0 = _mm_mullo_epi16(x0, dequant0);
+ x1 = _mm_mullo_epi16(x1, dequant1);
+
+ /* dqcoeff = x * dequant */
+ _mm_store_si128((__m128i *)(d->dqcoeff), x0);
+ _mm_store_si128((__m128i *)(d->dqcoeff + 8), x1);
+
+ zeros = _mm_setzero_si128();
+
+ x0 = _mm_cmpgt_epi16(abs0, zeros);
+ x1 = _mm_cmpgt_epi16(abs1, zeros);
+
+ x = _mm_packs_epi16(x0, x1);
+
+ x = _mm_shuffle_epi8(x, zig_zag);
+
+ mask = _mm_movemask_epi8(x);
+
+ eob = bsr(mask);
+
+ *d->eob = 0xFF & eob;
+}
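
The intrinsics port keeps the same structure as the deleted assembly: quantize in transform order, then derive the end-of-block index by packing a per-coefficient nonzero mask, shuffling it into zig-zag order, and taking the highest set bit. A scalar reference for that eob computation:

static int eob_from_qcoeff(const short *qcoeff) {  /* 16 coeffs, raster order */
  static const int zig_zag[16] = { 0, 1, 4, 8,  5, 2,  3,  6,
                                   9, 12, 13, 10, 7, 11, 14, 15 };
  int i, mask = 0, eob = 0;
  for (i = 0; i < 16; i++)
    if (qcoeff[zig_zag[i]])
      mask |= 1 << i;
  while (mask) {  /* index of highest set bit, plus one; 0 for empty mask */
    eob++;
    mask >>= 1;
  }
  return eob;
}
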
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk
index f98eb318d43..8282547ea2d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk
@@ -47,7 +47,7 @@ VP8_COMMON_SRCS-yes += common/quant_common.h
VP8_COMMON_SRCS-yes += common/reconinter.h
VP8_COMMON_SRCS-yes += common/reconintra4x4.h
VP8_COMMON_SRCS-yes += common/rtcd.c
-VP8_COMMON_SRCS-yes += common/rtcd_defs.sh
+VP8_COMMON_SRCS-yes += common/rtcd_defs.pl
VP8_COMMON_SRCS-yes += common/setupintrarecon.h
VP8_COMMON_SRCS-yes += common/swapyv12buffer.h
VP8_COMMON_SRCS-yes += common/systemdependent.h
@@ -129,7 +129,6 @@ VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/dequantize_dspr2.c
# common (c)
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/variance_arm.c
@@ -159,36 +158,29 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
# common (neon)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict4x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict4x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/save_reg_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
+#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/reconintra_arm.c
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfilter_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
+#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_blk_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
-$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh))
+# common (neon intrinsics)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
+
+$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c
index 19e9d270183..501dd3eb4ac 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c
@@ -414,7 +414,6 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
printf("Sharpness: %d\n", oxcf->Sharpness);
printf("cpu_used: %d\n", oxcf->cpu_used);
printf("Mode: %d\n", oxcf->Mode);
- printf("delete_first_pass_file: %d\n", oxcf->delete_first_pass_file);
printf("auto_key: %d\n", oxcf->auto_key);
printf("key_freq: %d\n", oxcf->key_freq);
printf("end_usage: %d\n", oxcf->end_usage);
@@ -751,9 +750,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
if (!ctx->cfg.rc_target_bitrate)
return res;
- if (!ctx->cfg.rc_target_bitrate)
- return res;
-
if (img)
res = validate_img(ctx, img);
@@ -890,7 +886,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
/* Add the frame packet to the list of returned packets. */
- round = (vpx_codec_pts_t)1000000
+ round = (vpx_codec_pts_t)10000000
* ctx->cfg.g_timebase.num / 2 - 1;
delta = (dst_end_time_stamp - dst_time_stamp);
pkt.kind = VPX_CODEC_CX_FRAME_PKT;
@@ -1239,6 +1235,8 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
0, /* rc_dropframe_thresh */
0, /* rc_resize_allowed */
+ 1, /* rc_scaled_width */
+ 1, /* rc_scaled_height */
60, /* rc_resize_down_threshold */
30, /* rc_resize_up_threshold */
@@ -1266,10 +1264,10 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
128, /* kf_max_dist */
#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
- 1, /* g_delete_first_pass_file */
"vp8.fpf" /* first pass filename */
#endif
VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
+ {0}, /* ss_target_bitrate */
1, /* ts_number_layers */
{0}, /* ts_target_bitrate */
{0}, /* ts_rate_decimator */
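
The rounding constant changes from 1000000 to 10000000 to match the encoder's internal 10 MHz tick rate: round is half of one output timebase unit expressed in those ticks, so the subsequent duration division rounds to nearest instead of truncating. A small worked example with a hypothetical 1/30 timebase:

#include <stdio.h>

int main(void) {
  const long long num = 1, den = 30;      /* hypothetical g_timebase */
  const long long delta = 10000000 / 30;  /* one frame, in 10 MHz ticks */
  const long long round = 10000000LL * num / 2 - 1;
  unsigned long duration =
      (unsigned long)((delta * den + round) / (num * 10000000LL));
  printf("%lu\n", duration);              /* prints 1: one timebase unit */
  return 0;
}
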
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c
index 871b8d38579..10cbc6a583a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c
@@ -16,9 +16,10 @@
#include "vpx/vp8dx.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx_version.h"
+#include "common/alloccommon.h"
+#include "common/common.h"
#include "common/onyxd.h"
#include "decoder/onyxd_int.h"
-#include "common/alloccommon.h"
#include "vpx_mem/vpx_mem.h"
#if CONFIG_ERROR_CONCEALMENT
#include "decoder/error_concealment.h"
@@ -41,19 +42,11 @@ typedef enum
static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
-static const mem_req_t vp8_mem_req_segs[] =
-{
- {VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
- {VP8_SEG_MAX, 0, 0, 0, NULL}
-};
-
struct vpx_codec_alg_priv
{
vpx_codec_priv_t base;
- vpx_codec_mmap_t mmaps[NELEMENTS(vp8_mem_req_segs)-1];
vpx_codec_dec_cfg_t cfg;
vp8_stream_info_t si;
- int defer_alloc;
int decoder_init;
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
@@ -64,7 +57,7 @@ struct vpx_codec_alg_priv
int dbg_color_b_modes_flag;
int dbg_display_mv_flag;
#endif
- vp8_decrypt_cb *decrypt_cb;
+ vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
vpx_image_t img;
int img_setup;
@@ -84,19 +77,14 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_
return sizeof(vpx_codec_alg_priv_t);
}
-static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
+static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
{
- int i;
-
- ctx->priv = mmap->base;
+ ctx->priv =
+ (vpx_codec_priv_t *)vpx_memalign(8, sizeof(vpx_codec_alg_priv_t));
+ vpx_memset(ctx->priv, 0, sizeof(vpx_codec_alg_priv_t));
ctx->priv->sz = sizeof(*ctx->priv);
ctx->priv->iface = ctx->iface;
- ctx->priv->alg_priv = mmap->base;
-
- for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
- ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id;
-
- ctx->priv->alg_priv->mmaps[0] = *mmap;
+ ctx->priv->alg_priv = (vpx_codec_alg_priv_t *)ctx->priv;
ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
ctx->priv->alg_priv->decrypt_cb = NULL;
ctx->priv->alg_priv->decrypt_state = NULL;
@@ -110,11 +98,6 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
}
}
-static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx)
-{
- /* nothing to clean up */
-}
-
static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
vpx_codec_priv_enc_mr_cfg_t *data)
{
@@ -129,17 +112,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
*/
if (!ctx->priv)
{
- vpx_codec_mmap_t mmap;
-
- mmap.id = vp8_mem_req_segs[0].id;
- mmap.sz = sizeof(vpx_codec_alg_priv_t);
- mmap.align = vp8_mem_req_segs[0].align;
- mmap.flags = vp8_mem_req_segs[0].flags;
-
- res = vpx_mmap_alloc(&mmap);
- if (res != VPX_CODEC_OK) return res;
-
- vp8_init_ctx(ctx, &mmap);
+ vp8_init_ctx(ctx);
/* initialize number of fragments to zero */
ctx->priv->alg_priv->fragments.count = 0;
@@ -148,7 +121,6 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
(ctx->priv->alg_priv->base.init_flags &
VPX_CODEC_USE_INPUT_FRAGMENTS);
- ctx->priv->alg_priv->defer_alloc = 1;
/*post processing level initialized to do nothing */
}
@@ -175,15 +147,9 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx)
{
- int i;
-
vp8_remove_decoder_instances(&ctx->yv12_frame_buffers);
- for (i = NELEMENTS(ctx->mmaps) - 1; i >= 0; i--)
- {
- if (ctx->mmaps[i].dtor)
- ctx->mmaps[i].dtor(&ctx->mmaps[i]);
- }
+ vpx_free(ctx);
return VPX_CODEC_OK;
}
@@ -191,7 +157,7 @@ static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx)
static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
unsigned int data_sz,
vpx_codec_stream_info_t *si,
- vp8_decrypt_cb *decrypt_cb,
+ vpx_decrypt_cb decrypt_cb,
void *decrypt_state)
{
vpx_codec_err_t res = VPX_CODEC_OK;
@@ -212,7 +178,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
const uint8_t *clear = data;
if (decrypt_cb)
{
- int n = data_sz > 10 ? 10 : data_sz;
+ int n = MIN(sizeof(clear_buffer), data_sz);
decrypt_cb(decrypt_state, data, clear_buffer, n);
clear = clear_buffer;
}
@@ -389,74 +355,40 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
if ((ctx->si.h != h) || (ctx->si.w != w))
resolution_change = 1;
- /* Perform deferred allocations, if required */
- if (!res && ctx->defer_alloc)
- {
- int i;
-
- for (i = 1; !res && i < NELEMENTS(ctx->mmaps); i++)
- {
- vpx_codec_dec_cfg_t cfg;
-
- cfg.w = ctx->si.w;
- cfg.h = ctx->si.h;
- ctx->mmaps[i].id = vp8_mem_req_segs[i].id;
- ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz;
- ctx->mmaps[i].align = vp8_mem_req_segs[i].align;
- ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags;
-
- if (!ctx->mmaps[i].sz)
- ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
- ctx->base.init_flags);
-
- res = vpx_mmap_alloc(&ctx->mmaps[i]);
- }
-
- if (!res)
- vp8_finalize_mmaps(ctx);
-
- ctx->defer_alloc = 0;
- }
-
/* Initialize the decoder instance on the first frame*/
if (!res && !ctx->decoder_init)
{
- res = vpx_validate_mmaps(&ctx->si, ctx->mmaps,
- vp8_mem_req_segs, NELEMENTS(vp8_mem_req_segs),
- ctx->base.init_flags);
-
- if (!res)
- {
- VP8D_CONFIG oxcf;
-
- oxcf.Width = ctx->si.w;
- oxcf.Height = ctx->si.h;
- oxcf.Version = 9;
- oxcf.postprocess = 0;
- oxcf.max_threads = ctx->cfg.threads;
- oxcf.error_concealment =
- (ctx->base.init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT);
-
- /* If postprocessing was enabled by the application and a
- * configuration has not been provided, default it.
- */
- if (!ctx->postproc_cfg_set
- && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
- {
- ctx->postproc_cfg.post_proc_flag =
- VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE;
- ctx->postproc_cfg.deblocking_level = 4;
- ctx->postproc_cfg.noise_level = 0;
- }
-
- res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf);
- ctx->yv12_frame_buffers.pbi[0]->decrypt_cb = ctx->decrypt_cb;
- ctx->yv12_frame_buffers.pbi[0]->decrypt_state = ctx->decrypt_state;
- }
-
- ctx->decoder_init = 1;
+ VP8D_CONFIG oxcf;
+
+ oxcf.Width = ctx->si.w;
+ oxcf.Height = ctx->si.h;
+ oxcf.Version = 9;
+ oxcf.postprocess = 0;
+ oxcf.max_threads = ctx->cfg.threads;
+ oxcf.error_concealment =
+ (ctx->base.init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT);
+
+ /* If postprocessing was enabled by the application and a
+ * configuration has not been provided, default it.
+ */
+ if (!ctx->postproc_cfg_set
+ && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
+ ctx->postproc_cfg.post_proc_flag =
+ VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE;
+ ctx->postproc_cfg.deblocking_level = 4;
+ ctx->postproc_cfg.noise_level = 0;
+ }
+
+ res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf);
+ ctx->decoder_init = 1;
}
+ /* Set these even if already initialized. The caller may have changed the
+ * decrypt config between frames.
+ */
+ ctx->yv12_frame_buffers.pbi[0]->decrypt_cb = ctx->decrypt_cb;
+ ctx->yv12_frame_buffers.pbi[0]->decrypt_state = ctx->decrypt_state;
+
if (!res)
{
VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
@@ -618,89 +550,6 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
return img;
}
-
-static
-vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t *ctx,
- vpx_codec_mmap_t *mmap,
- vpx_codec_iter_t *iter)
-{
- vpx_codec_err_t res;
- const mem_req_t *seg_iter = *iter;
-
- /* Get address of next segment request */
- do
- {
- if (!seg_iter)
- seg_iter = vp8_mem_req_segs;
- else if (seg_iter->id != VP8_SEG_MAX)
- seg_iter++;
-
- *iter = (vpx_codec_iter_t)seg_iter;
-
- if (seg_iter->id != VP8_SEG_MAX)
- {
- mmap->id = seg_iter->id;
- mmap->sz = seg_iter->sz;
- mmap->align = seg_iter->align;
- mmap->flags = seg_iter->flags;
-
- if (!seg_iter->sz)
- mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
-
- res = VPX_CODEC_OK;
- }
- else
- res = VPX_CODEC_LIST_END;
- }
- while (!mmap->sz && res != VPX_CODEC_LIST_END);
-
- return res;
-}
-
-static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx,
- const vpx_codec_mmap_t *mmap)
-{
- vpx_codec_err_t res = VPX_CODEC_MEM_ERROR;
- int i, done;
-
- if (!ctx->priv)
- {
- if (mmap->id == VP8_SEG_ALG_PRIV)
- {
- if (!ctx->priv)
- {
- vp8_init_ctx(ctx, mmap);
- res = VPX_CODEC_OK;
- }
- }
- }
-
- done = 1;
-
- if (!res && ctx->priv->alg_priv)
- {
- for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
- {
- if (ctx->priv->alg_priv->mmaps[i].id == mmap->id)
- if (!ctx->priv->alg_priv->mmaps[i].base)
- {
- ctx->priv->alg_priv->mmaps[i] = *mmap;
- res = VPX_CODEC_OK;
- }
-
- done &= (ctx->priv->alg_priv->mmaps[i].base != NULL);
- }
- }
-
- if (done && !res)
- {
- vp8_finalize_mmaps(ctx->priv->alg_priv);
- res = ctx->iface->init(ctx, NULL);
- }
-
- return res;
-}
-
static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
YV12_BUFFER_CONFIG *yv12)
{
@@ -877,7 +726,7 @@ static vpx_codec_err_t vp8_set_decryptor(vpx_codec_alg_priv_t *ctx,
int ctrl_id,
va_list args)
{
- vp8_decrypt_init *init = va_arg(args, vp8_decrypt_init *);
+ vpx_decrypt_init *init = va_arg(args, vpx_decrypt_init *);
if (init)
{
@@ -904,7 +753,7 @@ vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
{VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates},
{VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted},
{VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame},
- {VP8D_SET_DECRYPTOR, vp8_set_decryptor},
+ {VPXD_SET_DECRYPTOR, vp8_set_decryptor},
{ -1, NULL},
};
@@ -922,13 +771,14 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) =
vp8_init, /* vpx_codec_init_fn_t init; */
vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */
vp8_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
- vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */
- vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */
+ NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
+ NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
{
vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */
vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */
vp8_decode, /* vpx_codec_decode_fn_t decode; */
vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
+ NOT_IMPLEMENTED,
},
{ /* encoder functions */
NOT_IMPLEMENTED,
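
With the xma/mmap segment machinery gone, the decoder context has a plain lifecycle: vp8_init_ctx() grabs one zeroed, 8-byte-aligned block and vp8_destroy() releases it with a single vpx_free(). A sketch of that allocation step using libvpx's allocator wrappers (the helper itself is illustrative, not part of the patch):

#include <stddef.h>
#include "vpx_mem/vpx_mem.h"

/* Allocate and zero a context-sized block, as vp8_init_ctx() now does. */
static void *alloc_alg_priv(size_t sz) {
  void *p = vpx_memalign(8, sz);
  if (p)
    vpx_memset(p, 0, sz);
  return p;
}
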
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk
index cd091f39ae3..607382b4c40 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk
@@ -50,7 +50,6 @@ VP8_CX_SRCS-yes += encoder/mcomp.h
VP8_CX_SRCS-yes += encoder/modecosts.h
VP8_CX_SRCS-yes += encoder/onyx_int.h
VP8_CX_SRCS-yes += encoder/pickinter.h
-VP8_CX_SRCS-yes += encoder/psnr.h
VP8_CX_SRCS-yes += encoder/quantize.h
VP8_CX_SRCS-yes += encoder/ratectrl.h
VP8_CX_SRCS-yes += encoder/rdopt.h
@@ -61,7 +60,6 @@ VP8_CX_SRCS-yes += encoder/modecosts.c
VP8_CX_SRCS-yes += encoder/onyx_if.c
VP8_CX_SRCS-yes += encoder/pickinter.c
VP8_CX_SRCS-yes += encoder/picklpf.c
-VP8_CX_SRCS-yes += encoder/psnr.c
VP8_CX_SRCS-yes += encoder/quantize.c
VP8_CX_SRCS-yes += encoder/ratectrl.c
VP8_CX_SRCS-yes += encoder/rdopt.c
@@ -90,6 +88,7 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
@@ -98,7 +97,6 @@ endif
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk
index b030ee57e2a..57330486380 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk
@@ -35,10 +35,12 @@ VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon
# encoder
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/fastquantizeb_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/picklpf_arm.c
+VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/shortfdct_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/subtract_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
+
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8dx.mk b/chromium/third_party/libvpx/source/libvpx/vp8/vp8dx.mk
index 4a8f46708ed..892ed70f52b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8dx.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8dx.mk
@@ -22,7 +22,7 @@ VP8_DX_SRCS-yes += vp8_dx_iface.c
VP8_DX_SRCS-yes += decoder/dboolhuff.c
VP8_DX_SRCS-yes += decoder/decodemv.c
-VP8_DX_SRCS-yes += decoder/decodframe.c
+VP8_DX_SRCS-yes += decoder/decodeframe.c
VP8_DX_SRCS-yes += decoder/detokenize.c
VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h
VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h