diff options
author | Jocelyn Turcotte <jocelyn.turcotte@digia.com> | 2014-08-08 14:30:41 +0200 |
---|---|---|
committer | Jocelyn Turcotte <jocelyn.turcotte@digia.com> | 2014-08-12 13:49:54 +0200 |
commit | ab0a50979b9eb4dfa3320eff7e187e41efedf7a9 (patch) | |
tree | 498dfb8a97ff3361a9f7486863a52bb4e26bb898 /chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon | |
parent | 4ce69f7403811819800e7c5ae1318b2647e778d1 (diff) |
Update Chromium to beta version 37.0.2062.68
Change-Id: I188e3b5aff1bec75566014291b654eb19f5bc8ca
Reviewed-by: Andras Becsi <andras.becsi@digia.com>
Diffstat (limited to 'chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon')
42 files changed, 4253 insertions, 4632 deletions
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm deleted file mode 100644 index e392786d43d..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm +++ /dev/null @@ -1,357 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict16x16_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_bilinear_predict16x16_neon| PROC - push {r4-r5, lr} - - adr r12, bifilter16_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_bfilter16x16_only - - add r2, r12, r2, lsl #3 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {d31}, [r2] ;load first_pass filter - - beq firstpass_bfilter16x16_only - - sub sp, sp, #272 ;reserve space on stack for temporary storage - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - mov lr, sp - vld1.u8 {d5, d6, d7}, [r0], r1 - - mov r2, #3 ;loop counter - vld1.u8 {d8, d9, d10}, [r0], r1 - - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {d11, d12, d13}, [r0], r1 - - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (17x16) -filt_blk2d_fp16x16_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, 
lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vqrshrn.u16 d21, q14, #7 - vld1.u8 {d5, d6, d7}, [r0], r1 - - vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result - vld1.u8 {d8, d9, d10}, [r0], r1 - vst1.u8 {d18, d19, d20, d21}, [lr]! 
- vld1.u8 {d11, d12, d13}, [r0], r1 - - bne filt_blk2d_fp16x16_loop_neon - -;First-pass filtering for rest 5 lines - vld1.u8 {d14, d15, d16}, [r0], r1 - - vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q10, d3, d0 - vmull.u8 q11, d5, d0 - vmull.u8 q12, d6, d0 - vmull.u8 q13, d8, d0 - vmull.u8 q14, d9, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - - vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q11, d5, d1 - vmlal.u8 q13, d8, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - - vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q12, d6, d1 - vmlal.u8 q14, d9, d1 - - vmull.u8 q1, d11, d0 - vmull.u8 q2, d12, d0 - vmull.u8 q3, d14, d0 - vmull.u8 q4, d15, d0 - - vext.8 d11, d11, d12, #1 ;construct src_ptr[1] - vext.8 d14, d14, d15, #1 - - vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q3, d14, d1 - - vext.8 d12, d12, d13, #1 - vext.8 d15, d15, d16, #1 - - vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q4, d15, d1 - - vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d11, q10, #7 - vqrshrn.u16 d12, q11, #7 - vqrshrn.u16 d13, q12, #7 - vqrshrn.u16 d14, q13, #7 - vqrshrn.u16 d15, q14, #7 - vqrshrn.u16 d16, q1, #7 - vqrshrn.u16 d17, q2, #7 - vqrshrn.u16 d18, q3, #7 - vqrshrn.u16 d19, q4, #7 - - vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result - vst1.u8 {d14, d15, d16, d17}, [lr]! - vst1.u8 {d18, d19}, [lr]! - -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - sub lr, lr, #272 - - vld1.u32 {d31}, [r3] ;load second_pass filter - - vld1.u8 {d22, d23}, [lr]! ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - mov r12, #4 ;loop counter - -filt_blk2d_sp16x16_loop_neon - vld1.u8 {d24, d25}, [lr]! - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) - vld1.u8 {d26, d27}, [lr]! - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [lr]! 
- vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [lr]! - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - subs r12, r12, #1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r4], r5 ;store result - vst1.u8 {d4, d5}, [r4], r5 - vst1.u8 {d6, d7}, [r4], r5 - vmov q11, q15 - vst1.u8 {d8, d9}, [r4], r5 - - bne filt_blk2d_sp16x16_loop_neon - - add sp, sp, #272 - - pop {r4-r5,pc} - -;-------------------- -firstpass_bfilter16x16_only - mov r2, #4 ;loop counter - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (16x16) -filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vld1.u8 {d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10}, [r0], r1 - vld1.u8 {d11, d12, d13}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, 
d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - vst1.u8 {d14, d15}, [r4], r5 ;store result - vqrshrn.u16 d21, q14, #7 - - vst1.u8 {d16, d17}, [r4], r5 - vst1.u8 {d18, d19}, [r4], r5 - vst1.u8 {d20, d21}, [r4], r5 - - bne filt_blk2d_fpo16x16_loop_neon - pop {r4-r5,pc} - -;--------------------- -secondpass_bfilter16x16_only -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - mov r12, #4 ;loop counter - vld1.u32 {d31}, [r3] ;load second_pass filter - vld1.u8 {d22, d23}, [r0], r1 ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - -filt_blk2d_spo16x16_loop_neon - vld1.u8 {d24, d25}, [r0], r1 - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) - vld1.u8 {d26, d27}, [r0], r1 - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [r0], r1 - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [r0], r1 - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r4], r5 ;store result - subs r12, r12, #1 - vst1.u8 {d4, d5}, [r4], r5 - vmov q11, q15 - vst1.u8 {d6, d7}, [r4], r5 - vst1.u8 {d8, d9}, [r4], r5 - - bne filt_blk2d_spo16x16_loop_neon - pop {r4-r5,pc} - - ENDP - -;----------------- - -bifilter16_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git 
a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm deleted file mode 100644 index 0ac62436f97..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm +++ /dev/null @@ -1,130 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict4x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict4x4_neon| PROC - push {r4, lr} - - adr r12, bifilter4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (5x4) - vld1.u8 {d2}, [r0], r1 ;load src data - add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes) - - vld1.u8 {d3}, [r0], r1 - vld1.u32 {d31}, [r2] ;first_pass filter - - vld1.u8 {d4}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0-d1) - vld1.u8 {d5}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {d6}, [r0], r1 - - vshr.u64 q4, q1, #8 ;construct src_ptr[1] - vshr.u64 q5, q2, #8 - vshr.u64 d12, d6, #8 - - vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d4, d5 - vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * 
vp8_filter[0]) - vmull.u8 q8, d4, d0 - vmull.u8 q9, d6, d0 - - vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1]) - vmlal.u8 q8, d10, d1 - vmlal.u8 q9, d12, d1 - - vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d29, q8, #7 - vqrshrn.u16 d30, q9, #7 - -;Second pass: 4x4 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 ;calculate Vfilter location - vld1.u32 {d31}, [r3] ;load second_pass filter - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d31[4] - - vmull.u8 q1, d28, d0 - vmull.u8 q2, d29, d0 - - vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step] - vext.8 d27, d29, d30, #4 - - vmlal.u8 q1, d26, d1 - vmlal.u8 q2, d27, d1 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - - vst1.32 {d2[0]}, [r4] ;store result - vst1.32 {d2[1]}, [r0] - vst1.32 {d3[0]}, [r1] - vst1.32 {d3[1]}, [r2] - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - - vld1.32 {d28[0]}, [r0], r1 ;load src data - vld1.32 {d28[1]}, [r0], r1 - vld1.32 {d29[0]}, [r0], r1 - vld1.32 {d29[1]}, [r0], r1 - vld1.32 {d30[0]}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.32 {d28[0]}, [r4], lr ;store result - vst1.32 {d28[1]}, [r4], lr - vst1.32 {d29[0]}, [r4], lr - vst1.32 {d29[1]}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter4_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm deleted file mode 100644 index 41f5c45ffe7..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm +++ /dev/null @@ -1,135 +0,0 @@ -; -; Copyright (c) 2010 The WebM project 
authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict8x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict8x4_neon| PROC - push {r4, lr} - - adr r12, bifilter8x4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (5x8) - add r2, r12, r2, lsl #3 ;calculate filter location - - vld1.u8 {q1}, [r0], r1 ;load src data - vld1.u32 {d31}, [r2] ;load first_pass filter - vld1.u8 {q2}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {q3}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {q4}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vld1.u8 {q5}, [r0], r1 - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - vext.8 d11, d10, d11, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - vmlal.u8 q10, d11, d1 - - vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d23, q7, #7 - vqrshrn.u16 d24, q8, #7 - vqrshrn.u16 d25, q9, #7 - vqrshrn.u16 d26, q10, #7 - -;Second pass: 4x8 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, 
r12, r3, lsl #3 - add r0, r4, lr - - vld1.u32 {d31}, [r3] ;load second_pass filter - add r1, r0, lr - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q2, d23, d0 - vmull.u8 q3, d24, d0 - vmull.u8 q4, d25, d0 - - vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) - vmlal.u8 q2, d24, d1 - vmlal.u8 q3, d25, d1 - vmlal.u8 q4, d26, d1 - - add r2, r1, lr - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - - vst1.u8 {d2}, [r4] ;store result - vst1.u8 {d3}, [r0] - vst1.u8 {d4}, [r1] - vst1.u8 {d5}, [r2] - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - vld1.u8 {d22}, [r0], r1 ;load src data - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.u8 {d22}, [r4], lr ;store result - vst1.u8 {d23}, [r4], lr - vst1.u8 {d24}, [r4], lr - vst1.u8 {d25}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter8x4_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm deleted file mode 100644 index c4711bc4d4a..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm +++ /dev/null @@ -1,183 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. 
All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict8x8_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict8x8_neon| PROC - push {r4, lr} - - adr r12, bifilter8_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (9x8) - add r2, r12, r2, lsl #3 ;calculate filter location - - vld1.u8 {q1}, [r0], r1 ;load src data - vld1.u32 {d31}, [r2] ;load first_pass filter - vld1.u8 {q2}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {q3}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {q4}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - - vld1.u8 {q1}, [r0], r1 ;load src data - vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 - vld1.u8 {q2}, [r0], r1 - vqrshrn.u16 d23, q7, #7 - vld1.u8 {q3}, [r0], r1 - vqrshrn.u16 d24, q8, #7 - vld1.u8 {q4}, [r0], r1 - vqrshrn.u16 d25, q9, #7 - - ;first_pass filtering on the rest 5-line data - vld1.u8 {q5}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - vext.8 d11, d10, d11, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] 
* vp8_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - vmlal.u8 q10, d11, d1 - - vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d27, q7, #7 - vqrshrn.u16 d28, q8, #7 - vqrshrn.u16 d29, q9, #7 - vqrshrn.u16 d30, q10, #7 - -;Second pass: 8x8 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 - add r0, r4, lr - - vld1.u32 {d31}, [r3] ;load second_pass filter - add r1, r0, lr - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q2, d23, d0 - vmull.u8 q3, d24, d0 - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) - vmlal.u8 q2, d24, d1 - vmlal.u8 q3, d25, d1 - vmlal.u8 q4, d26, d1 - vmlal.u8 q5, d27, d1 - vmlal.u8 q6, d28, d1 - vmlal.u8 q7, d29, d1 - vmlal.u8 q8, d30, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2}, [r4] ;store result - vst1.u8 {d3}, [r0] - vst1.u8 {d4}, [r1], lr - vst1.u8 {d5}, [r1], lr - vst1.u8 {d6}, [r1], lr - vst1.u8 {d7}, [r1], lr - vst1.u8 {d8}, [r1], lr - vst1.u8 {d9}, [r1], lr - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - vld1.u8 {d22}, [r0], r1 ;load src data - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - vld1.u8 {d27}, [r0], r1 - vld1.u8 {d28}, [r0], r1 - vld1.u8 {d29}, [r0], r1 - vld1.u8 {d30}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.u8 {d22}, [r4], lr ;store result - vst1.u8 {d23}, [r4], lr - vst1.u8 {d24}, [r4], lr - vst1.u8 {d25}, [r4], lr - vst1.u8 {d26}, [r4], lr - 
vst1.u8 {d27}, [r4], lr - vst1.u8 {d28}, [r4], lr - vst1.u8 {d29}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter8_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c new file mode 100644 index 00000000000..e1c3c2be7df --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +static const uint16_t bifilter4_coeff[8][2] = { + {128, 0}, + {112, 16}, + { 96, 32}, + { 80, 48}, + { 64, 64}, + { 48, 80}, + { 32, 96}, + { 16, 112} +}; + +void vp8_bilinear_predict4x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8; + uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8; + uint32x2_t d28u32, d29u32, d30u32; + uint8x16_t q1u8, q2u8; + uint16x8_t q1u16, q2u16; + uint16x8_t q7u16, q8u16, q9u16; + uint64x2_t q4u64, q5u64; + uint64x1_t d12u64; + uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2; + + if (xoffset == 0) { // skip_1stpass_filter + d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0); + src_ptr += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1); + src_ptr += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0); + src_ptr += 
src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1); + src_ptr += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0); + d28u8 = vreinterpret_u8_u32(d28u32); + d29u8 = vreinterpret_u8_u32(d29u32); + d30u8 = vreinterpret_u8_u32(d30u32); + } else { + d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d6u8 = vld1_u8(src_ptr); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + + d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); + + q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8); + q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8); + d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)), + vreinterpret_u32_u8(vget_high_u8(q1u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)), + vreinterpret_u32_u8(vget_high_u8(q2u8))); + d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)), + vreinterpret_u32_u64(vget_high_u64(q4u64))); + d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), + vreinterpret_u32_u64(vget_high_u64(q5u64))); + + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + q9u16 = vmull_u8(d6u8, d0u8); + + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8); + q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8); + + d28u8 = vqrshrn_n_u16(q7u16, 7); + d29u8 = vqrshrn_n_u16(q8u16, 7); + d30u8 = vqrshrn_n_u16(q9u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0); + dst_ptr += dst_pitch; + 
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d28u8, d0u8); + q2u16 = vmull_u8(d29u8, d0u8); + + d26u8 = vext_u8(d28u8, d29u8, 4); + d27u8 = vext_u8(d29u8, d30u8, 4); + + q1u16 = vmlal_u8(q1u16, d26u8, d1u8); + q2u16 = vmlal_u8(q2u16, d27u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + } + return; +} + +void vp8_bilinear_predict8x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8; + uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + 
q5u8 = vld1q_u8(src_ptr); + + d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + d26u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += 
dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); + } + return; +} + +void vp8_bilinear_predict8x8_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d30u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + + d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, 
d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + + // first_pass filtering on the rest 5-line data + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d26u8 = vqrshrn_n_u16(q6u16, 7); + d27u8 = vqrshrn_n_u16(q7u16, 7); + d28u8 = vqrshrn_n_u16(q8u16, 7); + d29u8 = vqrshrn_n_u16(q9u16, 7); + d30u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d29u8); + } 
else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + q5u16 = vmull_u8(d26u8, d0u8); + q6u16 = vmull_u8(d27u8, d0u8); + q7u16 = vmull_u8(d28u8, d0u8); + q8u16 = vmull_u8(d29u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + q5u16 = vmlal_u8(q5u16, d27u8, d1u8); + q6u16 = vmlal_u8(q6u16, d28u8, d1u8); + q7u16 = vmlal_u8(q7u16, d29u8, d1u8); + q8u16 = vmlal_u8(q8u16, d30u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d9u8); + } + return; +} + +void vp8_bilinear_predict16x16_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + int i; + unsigned char tmp[272]; + unsigned char *tmpp; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; + uint8x8_t d19u8, d20u8, d21u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8; + uint16x8_t q1u16, q2u16, q3u16, 
q4u16, q5u16, q6u16, q7u16, q8u16; + uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; + + if (xoffset == 0) { // secondpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q11u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += 
dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; + } + return; + } + + if (yoffset == 0) { // firstpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + for (i = 4; i > 0 ; i--) { + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + 
q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 =vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch; + } + return; + } + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + // First Pass: output_height lines x output_width columns (17x16) + tmpp = tmp; + for (i = 3; i > 0; i--) { + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = 
vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16; + } + + // First-pass filtering for rest 5 lines + d14u8 = vld1_u8(src_ptr); + d15u8 = vld1_u8(src_ptr + 8); + d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q9u16 = vmull_u8(d2u8, d0u8); + q10u16 = vmull_u8(d3u8, d0u8); + q11u16 = vmull_u8(d5u8, d0u8); + q12u16 = vmull_u8(d6u8, d0u8); + q13u16 = vmull_u8(d8u8, d0u8); + q14u16 = vmull_u8(d9u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + + q9u16 = vmlal_u8(q9u16, d2u8, d1u8); + q11u16 = vmlal_u8(q11u16, d5u8, d1u8); + q13u16 = vmlal_u8(q13u16, d8u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + + q10u16 = vmlal_u8(q10u16, d3u8, d1u8); + q12u16 = vmlal_u8(q12u16, d6u8, d1u8); + q14u16 = vmlal_u8(q14u16, d9u8, d1u8); + + q1u16 = vmull_u8(d11u8, d0u8); + q2u16 = vmull_u8(d12u8, d0u8); + q3u16 = vmull_u8(d14u8, d0u8); + q4u16 = vmull_u8(d15u8, d0u8); + + 
d11u8 = vext_u8(d11u8, d12u8, 1); + d14u8 = vext_u8(d14u8, d15u8, 1); + + q1u16 = vmlal_u8(q1u16, d11u8, d1u8); + q3u16 = vmlal_u8(q3u16, d14u8, d1u8); + + d12u8 = vext_u8(d12u8, d13u8, 1); + d15u8 = vext_u8(d15u8, d16u8, 1); + + q2u16 = vmlal_u8(q2u16, d12u8, d1u8); + q4u16 = vmlal_u8(q4u16, d15u8, d1u8); + + d10u8 = vqrshrn_n_u16(q9u16, 7); + d11u8 = vqrshrn_n_u16(q10u16, 7); + d12u8 = vqrshrn_n_u16(q11u16, 7); + d13u8 = vqrshrn_n_u16(q12u16, 7); + d14u8 = vqrshrn_n_u16(q13u16, 7); + d15u8 = vqrshrn_n_u16(q14u16, 7); + d16u8 = vqrshrn_n_u16(q1u16, 7); + d17u8 = vqrshrn_n_u16(q2u16, 7); + d18u8 = vqrshrn_n_u16(q3u16, 7); + d19u8 = vqrshrn_n_u16(q4u16, 7); + + q5u8 = vcombine_u8(d10u8, d11u8); + q6u8 = vcombine_u8(d12u8, d13u8); + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + + vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + + // secondpass_filter + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + tmpp = tmp; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(tmpp); tmpp += 16; + q13u8 = vld1q_u8(tmpp); tmpp += 16; + q14u8 = vld1q_u8(tmpp); tmpp += 16; + q15u8 = vld1q_u8(tmpp); tmpp += 16; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, 
vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; + } + return; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm index e3ea91fe6c0..a8730aa04ef 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm @@ -26,6 +26,7 @@ |vp8_build_intra_predictors_mby_neon_func| PROC push {r4-r8, lr} + vpush {d8-d15} cmp r3, #0 beq case_dc_pred @@ -37,8 +38,8 @@ beq case_tm_pred case_dc_pred - ldr r4, [sp, #24] ; Up - ldr r5, [sp, #28] ; Left + ldr r4, [sp, #88] ; Up + ldr r5, [sp, #92] ; Left ; Default the DC average to 128 mov r12, #128 @@ -143,6 +144,7 @@ skip_dc_pred_up_left vst1.u8 {q0}, [r1]! vst1.u8 {q0}, [r1]! + vpop {d8-d15} pop {r4-r8,pc} case_v_pred ; Copy down above row @@ -165,6 +167,7 @@ case_v_pred vst1.u8 {q0}, [r1]! vst1.u8 {q0}, [r1]! vst1.u8 {q0}, [r1]! 
+ vpop {d8-d15} pop {r4-r8,pc} case_h_pred @@ -224,6 +227,7 @@ case_h_pred vst1.u8 {q2}, [r1]! vst1.u8 {q3}, [r1]! + vpop {d8-d15} pop {r4-r8,pc} case_tm_pred @@ -293,6 +297,7 @@ case_tm_pred_loop subs r12, r12, #1 bne case_tm_pred_loop + vpop {d8-d15} pop {r4-r8,pc} ENDP @@ -307,6 +312,7 @@ case_tm_pred_loop |vp8_build_intra_predictors_mby_s_neon_func| PROC push {r4-r8, lr} + vpush {d8-d15} mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor; @@ -320,8 +326,8 @@ case_tm_pred_loop beq case_tm_pred_s case_dc_pred_s - ldr r4, [sp, #24] ; Up - ldr r5, [sp, #28] ; Left + ldr r4, [sp, #88] ; Up + ldr r5, [sp, #92] ; Left ; Default the DC average to 128 mov r12, #128 @@ -426,6 +432,7 @@ skip_dc_pred_up_left_s vst1.u8 {q0}, [r1], r2 vst1.u8 {q0}, [r1], r2 + vpop {d8-d15} pop {r4-r8,pc} case_v_pred_s ; Copy down above row @@ -448,6 +455,8 @@ case_v_pred_s vst1.u8 {q0}, [r1], r2 vst1.u8 {q0}, [r1], r2 vst1.u8 {q0}, [r1], r2 + + vpop {d8-d15} pop {r4-r8,pc} case_h_pred_s @@ -507,6 +516,7 @@ case_h_pred_s vst1.u8 {q2}, [r1], r2 vst1.u8 {q3}, [r1], r2 + vpop {d8-d15} pop {r4-r8,pc} case_tm_pred_s @@ -576,6 +586,7 @@ case_tm_pred_loop_s subs r12, r12, #1 bne case_tm_pred_loop_s + vpop {d8-d15} pop {r4-r8,pc} ENDP diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm deleted file mode 100644 index bda4b965442..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm +++ /dev/null @@ -1,59 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_copy_mem16x16_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_copy_mem16x16_neon| PROC - - vld1.u8 {q0}, [r0], r1 - vld1.u8 {q1}, [r0], r1 - vld1.u8 {q2}, [r0], r1 - vst1.u8 {q0}, [r2], r3 - vld1.u8 {q3}, [r0], r1 - vst1.u8 {q1}, [r2], r3 - vld1.u8 {q4}, [r0], r1 - vst1.u8 {q2}, [r2], r3 - vld1.u8 {q5}, [r0], r1 - vst1.u8 {q3}, [r2], r3 - vld1.u8 {q6}, [r0], r1 - vst1.u8 {q4}, [r2], r3 - vld1.u8 {q7}, [r0], r1 - vst1.u8 {q5}, [r2], r3 - vld1.u8 {q8}, [r0], r1 - vst1.u8 {q6}, [r2], r3 - vld1.u8 {q9}, [r0], r1 - vst1.u8 {q7}, [r2], r3 - vld1.u8 {q10}, [r0], r1 - vst1.u8 {q8}, [r2], r3 - vld1.u8 {q11}, [r0], r1 - vst1.u8 {q9}, [r2], r3 - vld1.u8 {q12}, [r0], r1 - vst1.u8 {q10}, [r2], r3 - vld1.u8 {q13}, [r0], r1 - vst1.u8 {q11}, [r2], r3 - vld1.u8 {q14}, [r0], r1 - vst1.u8 {q12}, [r2], r3 - vld1.u8 {q15}, [r0], r1 - vst1.u8 {q13}, [r2], r3 - vst1.u8 {q14}, [r2], r3 - vst1.u8 {q15}, [r2], r3 - - mov pc, lr - - ENDP ; |vp8_copy_mem16x16_neon| - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm deleted file mode 100644 index 35c0f6708a5..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm +++ /dev/null @@ -1,34 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_copy_mem8x4_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_copy_mem8x4_neon| PROC - vld1.u8 {d0}, [r0], r1 - vld1.u8 {d1}, [r0], r1 - vst1.u8 {d0}, [r2], r3 - vld1.u8 {d2}, [r0], r1 - vst1.u8 {d1}, [r2], r3 - vld1.u8 {d3}, [r0], r1 - vst1.u8 {d2}, [r2], r3 - vst1.u8 {d3}, [r2], r3 - - mov pc, lr - - ENDP ; |vp8_copy_mem8x4_neon| - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm deleted file mode 100644 index 1f5b9411bb5..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm +++ /dev/null @@ -1,43 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_copy_mem8x8_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_copy_mem8x8_neon| PROC - - vld1.u8 {d0}, [r0], r1 - vld1.u8 {d1}, [r0], r1 - vst1.u8 {d0}, [r2], r3 - vld1.u8 {d2}, [r0], r1 - vst1.u8 {d1}, [r2], r3 - vld1.u8 {d3}, [r0], r1 - vst1.u8 {d2}, [r2], r3 - vld1.u8 {d4}, [r0], r1 - vst1.u8 {d3}, [r2], r3 - vld1.u8 {d5}, [r0], r1 - vst1.u8 {d4}, [r2], r3 - vld1.u8 {d6}, [r0], r1 - vst1.u8 {d5}, [r2], r3 - vld1.u8 {d7}, [r0], r1 - vst1.u8 {d6}, [r2], r3 - vst1.u8 {d7}, [r2], r3 - - mov pc, lr - - ENDP ; |vp8_copy_mem8x8_neon| - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem_neon.c new file mode 100644 index 00000000000..deced115c14 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/copymem_neon.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +void vp8_copy_mem8x4_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 4; r++) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem8x8_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 8; r++) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem16x16_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + int r; + uint8x16_t qtmp; + + for (r = 0; r < 16; r++) { + qtmp = vld1q_u8(src); + vst1q_u8(dst, qtmp); + src += src_stride; + dst += dst_stride; + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm deleted file mode 100644 index 79ff02c6940..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm +++ /dev/null @@ -1,54 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
-; - - - EXPORT |vp8_dc_only_idct_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, -; int pred_stride, unsigned char *dst_ptr, -; int dst_stride) - -; r0 input_dc -; r1 pred_ptr -; r2 pred_stride -; r3 dst_ptr -; sp dst_stride - -|vp8_dc_only_idct_add_neon| PROC - add r0, r0, #4 - asr r0, r0, #3 - ldr r12, [sp] - vdup.16 q0, r0 - - vld1.32 {d2[0]}, [r1], r2 - vld1.32 {d2[1]}, [r1], r2 - vld1.32 {d4[0]}, [r1], r2 - vld1.32 {d4[1]}, [r1] - - vaddw.u8 q1, q0, d2 - vaddw.u8 q2, q0, d4 - - vqmovun.s16 d2, q1 - vqmovun.s16 d4, q2 - - vst1.32 {d2[0]}, [r3], r12 - vst1.32 {d2[1]}, [r3], r12 - vst1.32 {d4[0]}, [r3], r12 - vst1.32 {d4[1]}, [r3] - - bx lr - - ENDP - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c new file mode 100644 index 00000000000..ad5f41d7dee --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +void vp8_dc_only_idct_add_neon( + int16_t input_dc, + unsigned char *pred_ptr, + int pred_stride, + unsigned char *dst_ptr, + int dst_stride) { + int i; + uint16_t a1 = ((input_dc + 4) >> 3); + uint32x2_t d2u32 = vdup_n_u32(0); + uint8x8_t d2u8; + uint16x8_t q1u16; + uint16x8_t qAdd; + + qAdd = vdupq_n_u16(a1); + + for (i = 0; i < 2; i++) { + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0); + pred_ptr += pred_stride; + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_stride; + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm deleted file mode 100644 index 602cce67697..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm +++ /dev/null @@ -1,131 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_dequant_idct_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_dequant_idct_add_neon(short *input, short *dq, -; unsigned char *dest, int stride) -; r0 short *input, -; r1 short *dq, -; r2 unsigned char *dest -; r3 int stride - -|vp8_dequant_idct_add_neon| PROC - vld1.16 {q3, q4}, [r0] - vld1.16 {q5, q6}, [r1] - - add r1, r2, r3 ; r1 = dest + stride - lsl r3, #1 ; 2x stride - - vld1.32 {d14[0]}, [r2], r3 - vld1.32 {d14[1]}, [r1], r3 - vld1.32 {d15[0]}, [r2] - vld1.32 {d15[1]}, [r1] - - adr r12, cospi8sqrt2minus1 ; pointer to the first constant - - vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon - vmul.i16 q2, q4, q6 - -;|short_idct4x4llm_neon| PROC - vld1.16 {d0}, [r12] - vswp d3, d4 ;q2(vp[4] vp[12]) - - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 - vqadd.s16 q4, q4, q2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - -; memset(input, 0, 32) -- 32bytes - vmov.i16 q14, #0 - - vswp d3, d4 - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vmov q15, q14 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 - vqadd.s16 q4, q4, q2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vst1.16 {q14, q15}, [r0] - - vrshr.s16 d2, d2, #3 - vrshr.s16 d3, d3, #3 - vrshr.s16 d4, d4, #3 - vrshr.s16 d5, d5, #3 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - - vaddw.u8 q1, q1, d14 - vaddw.u8 q2, q2, d15 - - sub r2, r2, r3 - sub r1, r1, r3 - - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 
- - vst1.32 {d0[0]}, [r2], r3 - vst1.32 {d0[1]}, [r1], r3 - vst1.32 {d1[0]}, [r2] - vst1.32 {d1[1]}, [r1] - - bx lr - - ENDP ; |vp8_dequant_idct_add_neon| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x4e7b4e7b -sinpi8sqrt2 DCD 0x8a8c8a8c - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c new file mode 100644 index 00000000000..58e11922c76 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequant_idct_neon.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 35468; + +void vp8_dequant_idct_add_neon( + int16_t *input, + int16_t *dq, + unsigned char *dst, + int stride) { + unsigned char *dst0; + int32x2_t d14, d15; + int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; + int16x8_t q1, q2, q3, q4, q5, q6; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x2x2_t d2tmp0, d2tmp1; + int16x4x2_t d2tmp2, d2tmp3; + + d14 = d15 = vdup_n_s32(0); + + // load input + q3 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + input += 8; + q4 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + + // load dq + q5 = vld1q_s16(dq); + dq += 8; + q6 = vld1q_s16(dq); + + // load src from dst + dst0 = dst; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0); + dst0 += stride; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1); + + q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3), + vreinterpretq_u16_s16(q5))); + q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4), + vreinterpretq_u16_s16(q6))); + + d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2)); + d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2)); + + q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2)); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + q3 = vshrq_n_s16(q3, 1); + q4 = vshrq_n_s16(q4, 1); + + q3 = vqaddq_s16(q3, q2); + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = 
vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + // loop 2 + q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); + d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); + + q3 = vshrq_n_s16(q3, 1); + q4 = vshrq_n_s16(q4, 1); + + q3 = vqaddq_s16(q3, q2); + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2 = vrshr_n_s16(d2, 3); + d3 = vrshr_n_s16(d3, 3); + d4 = vrshr_n_s16(d4, 3); + d5 = vrshr_n_s16(d5, 3); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]); + q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]); + + q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), + vreinterpret_u8_s32(d14))); + q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), + vreinterpret_u8_s32(d15))); + + d14 = vreinterpret_s32_u8(vqmovun_s16(q1)); + d15 = vreinterpret_s32_u8(vqmovun_s16(q2)); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d14, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d14, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d15, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d15, 1); + return; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm 
b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm deleted file mode 100644 index c8e0c31f29c..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm +++ /dev/null @@ -1,34 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_dequantize_b_loop_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 short *Q, -; r1 short *DQC -; r2 short *DQ -|vp8_dequantize_b_loop_neon| PROC - vld1.16 {q0, q1}, [r0] - vld1.16 {q2, q3}, [r1] - - vmul.i16 q4, q0, q2 - vmul.i16 q5, q1, q3 - - vst1.16 {q4, q5}, [r2] - - bx lr - - ENDP - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c new file mode 100644 index 00000000000..54e709dd3c3 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/dequantizeb_neon.c @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "vp8/common/blockd.h" + +void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) { + int16x8x2_t qQ, qDQC, qDQ; + + qQ = vld2q_s16(d->qcoeff); + qDQC = vld2q_s16(DQC); + + qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]); + qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]); + + vst2q_s16(d->dqcoeff, qDQ); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm index 6c29c55860d..3a3921081c4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm @@ -22,6 +22,7 @@ ; r3 stride |idct_dequant_0_2x_neon| PROC push {r4, r5} + vpush {d8-d15} add r12, r2, #4 vld1.32 {d2[0]}, [r2], r3 @@ -72,6 +73,7 @@ vst1.32 {d4[1]}, [r2] vst1.32 {d10[1]}, [r0] + vpop {d8-d15} pop {r4, r5} bx lr diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm index d5dce63f6bd..8da0fa0b7ea 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm @@ -22,6 +22,8 @@ ; r2 *dst ; r3 stride |idct_dequant_full_2x_neon| PROC + vpush {d8-d15} + vld1.16 {q0, q1}, [r1] ; dq (same l/r) vld1.16 {q2, q3}, [r0] ; l q add r0, r0, #32 @@ -184,6 +186,7 @@ vst1.32 {d3[0]}, [r2] vst1.32 {d3[1]}, [r1] + vpop {d8-d15} bx lr ENDP ; |idct_dequant_full_2x_neon| diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.asm deleted file mode 100644 index e8ea2a61976..00000000000 --- 
a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.asm +++ /dev/null @@ -1,87 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - EXPORT |vp8_short_inv_walsh4x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code - -;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff) -|vp8_short_inv_walsh4x4_neon| PROC - - ; read in all four lines of values: d0->d3 - vld1.i16 {q0-q1}, [r0@128] - - ; first for loop - vadd.s16 d4, d0, d3 ;a = [0] + [12] - vadd.s16 d6, d1, d2 ;b = [4] + [8] - vsub.s16 d5, d0, d3 ;d = [0] - [12] - vsub.s16 d7, d1, d2 ;c = [4] - [8] - - vadd.s16 q0, q2, q3 ; a+b d+c - vsub.s16 q1, q2, q3 ; a-b d-c - - vtrn.32 d0, d2 ;d0: 0 1 8 9 - ;d2: 2 3 10 11 - vtrn.32 d1, d3 ;d1: 4 5 12 13 - ;d3: 6 7 14 15 - - vtrn.16 d0, d1 ;d0: 0 4 8 12 - ;d1: 1 5 9 13 - vtrn.16 d2, d3 ;d2: 2 6 10 14 - ;d3: 3 7 11 15 - - ; second for loop - - vadd.s16 d4, d0, d3 ;a = [0] + [3] - vadd.s16 d6, d1, d2 ;b = [1] + [2] - vsub.s16 d5, d0, d3 ;d = [0] - [3] - vsub.s16 d7, d1, d2 ;c = [1] - [2] - - vmov.i16 q8, #3 - - vadd.s16 q0, q2, q3 ; a+b d+c - vsub.s16 q1, q2, q3 ; a-b d-c - - vadd.i16 q0, q0, q8 ;e/f += 3 - vadd.i16 q1, q1, q8 ;g/h += 3 - - vshr.s16 q0, q0, #3 ;e/f >> 3 - vshr.s16 q1, q1, #3 ;g/h >> 3 - - mov r2, #64 - add r3, r1, #32 - - vst1.i16 d0[0], [r1],r2 - vst1.i16 d1[0], [r3],r2 - vst1.i16 d2[0], [r1],r2 - vst1.i16 d3[0], [r3],r2 - - vst1.i16 d0[1], [r1],r2 - vst1.i16 d1[1], [r3],r2 - vst1.i16 d2[1], [r1],r2 - vst1.i16 d3[1], [r3],r2 - - vst1.i16 d0[2], [r1],r2 - vst1.i16 d1[2], [r3],r2 - vst1.i16 d2[2], [r1],r2 - vst1.i16 d3[2], 
[r3],r2 - - vst1.i16 d0[3], [r1],r2 - vst1.i16 d1[3], [r3],r2 - vst1.i16 d2[3], [r1] - vst1.i16 d3[3], [r3] - - bx lr - ENDP ; |vp8_short_inv_walsh4x4_neon| - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c new file mode 100644 index 00000000000..6ea9dd712aa --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/iwalsh_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +void vp8_short_inv_walsh4x4_neon( + int16_t *input, + int16_t *mb_dqcoeff) { + int16x8_t q0s16, q1s16, q2s16, q3s16; + int16x4_t d4s16, d5s16, d6s16, d7s16; + int16x4x2_t v2tmp0, v2tmp1; + int32x2x2_t v2tmp2, v2tmp3; + int16x8_t qAdd3; + + q0s16 = vld1q_s16(input); + q1s16 = vld1q_s16(input + 8); + + // 1st for loop + d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)), + vreinterpret_s32_s16(vget_low_s16(q1s16))); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)), + vreinterpret_s32_s16(vget_high_s16(q1s16))); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), + vreinterpret_s16_s32(v2tmp3.val[0])); + v2tmp1 = 
vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), + vreinterpret_s16_s32(v2tmp3.val[1])); + + // 2nd for loop + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); + d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); + d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); + d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + qAdd3 = vdupq_n_s16(3); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + q0s16 = vaddq_s16(q0s16, qAdd3); + q1s16 = vaddq_s16(q1s16, qAdd3); + + q0s16 = vshrq_n_s16(q0s16, 3); + q1s16 = vshrq_n_s16(q1s16, 3); + + // store + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3); + mb_dqcoeff += 16; + return; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm 
b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm index e44be0a1e34..c4f09c7753b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfilter_neon.asm @@ -24,10 +24,12 @@ ; sp unsigned char thresh, |vp8_loop_filter_horizontal_edge_y_neon| PROC push {lr} + vpush {d8-d15} + vdup.u8 q0, r2 ; duplicate blimit vdup.u8 q1, r3 ; duplicate limit sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r3, [sp, #4] ; load thresh + ldr r3, [sp, #68] ; load thresh add r12, r2, r1 add r1, r1, r1 @@ -52,6 +54,7 @@ vst1.u8 {q7}, [r2@128], r1 ; store oq0 vst1.u8 {q8}, [r12@128], r1 ; store oq1 + vpop {d8-d15} pop {pc} ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| @@ -64,10 +67,12 @@ ; sp+4 unsigned char *v |vp8_loop_filter_horizontal_edge_uv_neon| PROC push {lr} + vpush {d8-d15} + vdup.u8 q0, r2 ; duplicate blimit vdup.u8 q1, r3 ; duplicate limit - ldr r12, [sp, #4] ; load thresh - ldr r2, [sp, #8] ; load v ptr + ldr r12, [sp, #68] ; load thresh + ldr r2, [sp, #72] ; load v ptr vdup.u8 q2, r12 ; duplicate thresh sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines @@ -104,6 +109,7 @@ vst1.u8 {d16}, [r0@64] ; store u oq1 vst1.u8 {d17}, [r2@64] ; store v oq1 + vpop {d8-d15} pop {pc} ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon| @@ -120,11 +126,13 @@ |vp8_loop_filter_vertical_edge_y_neon| PROC push {lr} + vpush {d8-d15} + vdup.u8 q0, r2 ; duplicate blimit vdup.u8 q1, r3 ; duplicate limit sub r2, r0, #4 ; src ptr down by 4 columns add r1, r1, r1 - ldr r3, [sp, #4] ; load thresh + ldr r3, [sp, #68] ; load thresh add r12, r2, r1, asr #1 vld1.u8 {d6}, [r2], r1 @@ -194,6 +202,7 @@ vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12] + vpop {d8-d15} pop {pc} ENDP ; |vp8_loop_filter_vertical_edge_y_neon| @@ -210,9 +219,11 @@ ; sp+4 unsigned char *v |vp8_loop_filter_vertical_edge_uv_neon| 
PROC push {lr} + vpush {d8-d15} + vdup.u8 q0, r2 ; duplicate blimit sub r12, r0, #4 ; move u pointer down by 4 columns - ldr r2, [sp, #8] ; load v ptr + ldr r2, [sp, #72] ; load v ptr vdup.u8 q1, r3 ; duplicate limit sub r3, r2, #4 ; move v pointer down by 4 columns @@ -233,7 +244,7 @@ vld1.u8 {d20}, [r12] vld1.u8 {d21}, [r3] - ldr r12, [sp, #4] ; load thresh + ldr r12, [sp, #68] ; load thresh ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -281,6 +292,7 @@ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0] vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] + vpop {d8-d15} pop {pc} ENDP ; |vp8_loop_filter_vertical_edge_uv_neon| diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm deleted file mode 100644 index adf848b9c34..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm +++ /dev/null @@ -1,117 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon| - EXPORT |vp8_loop_filter_bhs_neon| - EXPORT |vp8_loop_filter_mbhs_neon| - ARM - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *s, PRESERVE -; r1 int p, PRESERVE -; q1 limit, PRESERVE - -|vp8_loop_filter_simple_horizontal_edge_neon| PROC - - sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines - - vld1.u8 {q7}, [r0@128], r1 ; q0 - vld1.u8 {q5}, [r3@128], r1 ; p0 - vld1.u8 {q8}, [r0@128] ; q1 - vld1.u8 {q6}, [r3@128] ; p1 - - vabd.u8 q15, q6, q7 ; abs(p0 - q0) - vabd.u8 q14, q5, q8 ; abs(p1 - q1) - - vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 - vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 - vmov.u8 q0, #0x80 ; 0x80 - vmov.s16 q13, #3 - vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - - veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value - veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value - veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value - veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value - - vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1 - - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q3, d15, d13 - - vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1) - - vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0) - vmul.s16 q3, q3, q13 - - vmov.u8 q10, #0x03 ; 0x03 - vmov.u8 q9, #0x04 ; 0x04 - - vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0) - vaddw.s8 q3, q3, d9 - - vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d9, q3 - - vand q14, q4, q15 ; vp8_filter &= mask - - vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) - vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q4, q3, #3 ; Filter1 >>= 3 - - sub r0, r0, r1 - - ;calculate output - vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) - vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 
- Filter1) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 = u^0x80 - - vst1.u8 {q6}, [r3@128] ; store op0 - vst1.u8 {q7}, [r0@128] ; store oq0 - - bx lr - ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp8_loop_filter_bhs_neon| PROC - push {r4, lr} - ldrb r3, [r2] ; load blim from mem - vdup.s8 q1, r3 ; duplicate blim - - add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride - bl vp8_loop_filter_simple_horizontal_edge_neon - ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1 - add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride - bl vp8_loop_filter_simple_horizontal_edge_neon - add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride - pop {r4, lr} - b vp8_loop_filter_simple_horizontal_edge_neon - ENDP ;|vp8_loop_filter_bhs_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp8_loop_filter_mbhs_neon| PROC - ldrb r3, [r2] ; load blim from mem - vdup.s8 q1, r3 ; duplicate mblim - b vp8_loop_filter_simple_horizontal_edge_neon - ENDP ;|vp8_loop_filter_bhs_neon| - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c new file mode 100644 index 00000000000..b25686ffb88 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "./vpx_config.h" + +static INLINE void vp8_loop_filter_simple_horizontal_edge_neon( + unsigned char *s, + int p, + const unsigned char *blimit) { + uint8_t *sp; + uint8x16_t qblimit, q0u8; + uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8; + int16x8_t q2s16, q3s16, q13s16; + int8x8_t d8s8, d9s8; + int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8; + + qblimit = vdupq_n_u8(*blimit); + + sp = s - (p << 1); + q5u8 = vld1q_u8(sp); + sp += p; + q6u8 = vld1q_u8(sp); + sp += p; + q7u8 = vld1q_u8(sp); + sp += p; + q8u8 = vld1q_u8(sp); + + q15u8 = vabdq_u8(q6u8, q7u8); + q14u8 = vabdq_u8(q5u8, q8u8); + + q15u8 = vqaddq_u8(q15u8, q15u8); + q14u8 = vshrq_n_u8(q14u8, 1); + q0u8 = vdupq_n_u8(0x80); + q13s16 = vdupq_n_s16(3); + q15u8 = vqaddq_u8(q15u8, q14u8); + + q5u8 = veorq_u8(q5u8, q0u8); + q6u8 = veorq_u8(q6u8, q0u8); + q7u8 = veorq_u8(q7u8, q0u8); + q8u8 = veorq_u8(q8u8, q0u8); + + q15u8 = vcgeq_u8(qblimit, q15u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)), + vget_low_s8(vreinterpretq_s8_u8(q6u8))); + q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)), + vget_high_s8(vreinterpretq_s8_u8(q6u8))); + + q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8), + vreinterpretq_s8_u8(q8u8)); + + q2s16 = vmulq_s16(q2s16, q13s16); + q3s16 = vmulq_s16(q3s16, q13s16); + + q10u8 = vdupq_n_u8(3); + q9u8 = vdupq_n_u8(4); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8)); + q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8)); + + d8s8 = vqmovn_s16(q2s16); + d9s8 = vqmovn_s16(q3s16); + q4s8 = vcombine_s8(d8s8, d9s8); + + q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8)); + + q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8)); + q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q3s8 = vshrq_n_s8(q3s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8); + + q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + q7u8 = 
veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + + vst1q_u8(s, q7u8); + s -= p; + vst1q_u8(s, q6u8); + return; +} + +void vp8_loop_filter_bhs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + return; +} + +void vp8_loop_filter_mbhs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + return; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm index e690df2f7de..78d13c895aa 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm @@ -9,7 +9,6 @@ ; - ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon| EXPORT |vp8_loop_filter_bvs_neon| EXPORT |vp8_loop_filter_mbvs_neon| ARM @@ -22,6 +21,8 @@ ; q1 limit, PRESERVE |vp8_loop_filter_simple_vertical_edge_neon| PROC + vpush {d8-d15} + sub r0, r0, #2 ; move src pointer down by 2 columns add r12, r1, r1 add r3, r0, r1 @@ -120,6 +121,7 @@ vst2.8 {d14[6], d15[6]}, [r0], r12 vst2.8 {d14[7], d15[7]}, [r3] + vpop {d8-d15} bx lr ENDP ; |vp8_loop_filter_simple_vertical_edge_neon| diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm deleted file mode 100644 index f41c156df8b..00000000000 --- 
a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm +++ /dev/null @@ -1,469 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon| - EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon| - EXPORT |vp8_mbloop_filter_vertical_edge_y_neon| - EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh) -; r0 unsigned char *src, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp8_mbloop_filter_horizontal_edge_y_neon| PROC - push {lr} - add r1, r1, r1 ; double stride - ldr r12, [sp, #4] ; load thresh - sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines - vdup.u8 q2, r12 ; thresh - add r12, r0, r1, lsr #1 ; move src pointer up by 1 line - - vld1.u8 {q3}, [r0@128], r1 ; p3 - vld1.u8 {q4}, [r12@128], r1 ; p2 - vld1.u8 {q5}, [r0@128], r1 ; p1 - vld1.u8 {q6}, [r12@128], r1 ; p0 - vld1.u8 {q7}, [r0@128], r1 ; q0 - vld1.u8 {q8}, [r12@128], r1 ; q1 - vld1.u8 {q9}, [r0@128], r1 ; q2 - vld1.u8 {q10}, [r12@128], r1 ; q3 - - bl vp8_mbloop_filter_neon - - sub r12, r12, r1, lsl #2 - add r0, r12, r1, lsr #1 - - vst1.u8 {q4}, [r12@128],r1 ; store op2 - vst1.u8 {q5}, [r0@128],r1 ; store op1 - vst1.u8 {q6}, [r12@128], r1 ; store op0 - vst1.u8 {q7}, [r0@128],r1 ; store oq0 - vst1.u8 {q8}, [r12@128] ; store oq1 - vst1.u8 {q9}, [r0@128] ; store oq2 - - pop {pc} - ENDP ; 
|vp8_mbloop_filter_horizontal_edge_y_neon| - -; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v - -|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC - push {lr} - ldr r12, [sp, #4] ; load thresh - sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines - vdup.u8 q2, r12 ; thresh - ldr r12, [sp, #8] ; load v ptr - sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines - - vld1.u8 {d6}, [r0@64], r1 ; p3 - vld1.u8 {d7}, [r12@64], r1 ; p3 - vld1.u8 {d8}, [r0@64], r1 ; p2 - vld1.u8 {d9}, [r12@64], r1 ; p2 - vld1.u8 {d10}, [r0@64], r1 ; p1 - vld1.u8 {d11}, [r12@64], r1 ; p1 - vld1.u8 {d12}, [r0@64], r1 ; p0 - vld1.u8 {d13}, [r12@64], r1 ; p0 - vld1.u8 {d14}, [r0@64], r1 ; q0 - vld1.u8 {d15}, [r12@64], r1 ; q0 - vld1.u8 {d16}, [r0@64], r1 ; q1 - vld1.u8 {d17}, [r12@64], r1 ; q1 - vld1.u8 {d18}, [r0@64], r1 ; q2 - vld1.u8 {d19}, [r12@64], r1 ; q2 - vld1.u8 {d20}, [r0@64], r1 ; q3 - vld1.u8 {d21}, [r12@64], r1 ; q3 - - bl vp8_mbloop_filter_neon - - sub r0, r0, r1, lsl #3 - sub r12, r12, r1, lsl #3 - - add r0, r0, r1 - add r12, r12, r1 - - vst1.u8 {d8}, [r0@64], r1 ; store u op2 - vst1.u8 {d9}, [r12@64], r1 ; store v op2 - vst1.u8 {d10}, [r0@64], r1 ; store u op1 - vst1.u8 {d11}, [r12@64], r1 ; store v op1 - vst1.u8 {d12}, [r0@64], r1 ; store u op0 - vst1.u8 {d13}, [r12@64], r1 ; store v op0 - vst1.u8 {d14}, [r0@64], r1 ; store u oq0 - vst1.u8 {d15}, [r12@64], r1 ; store v oq0 - vst1.u8 {d16}, [r0@64], r1 ; store u oq1 - vst1.u8 {d17}, [r12@64], r1 ; store v oq1 - vst1.u8 {d18}, [r0@64], r1 ; store u oq2 - vst1.u8 {d19}, [r12@64], r1 ; store v oq2 - - pop {pc} - ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon| - -; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int 
pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh) -; r0 unsigned char *src, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp8_mbloop_filter_vertical_edge_y_neon| PROC - push {lr} - ldr r12, [sp, #4] ; load thresh - sub r0, r0, #4 ; move src pointer down by 4 columns - vdup.s8 q2, r12 ; thresh - add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines - - vld1.u8 {d6}, [r0], r1 ; load first 8-line src data - vld1.u8 {d7}, [r12], r1 ; load second 8-line src data - vld1.u8 {d8}, [r0], r1 - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d10}, [r0], r1 - vld1.u8 {d11}, [r12], r1 - vld1.u8 {d12}, [r0], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d14}, [r0], r1 - vld1.u8 {d15}, [r12], r1 - vld1.u8 {d16}, [r0], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d18}, [r0], r1 - vld1.u8 {d19}, [r12], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r12], r1 - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - sub r0, r0, r1, lsl #3 - - bl vp8_mbloop_filter_neon - - sub r12, r12, r1, lsl #3 - - ;transpose to 16x8 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - ;store op2, op1, op0, oq0, oq1, oq2 - vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r12], r1 - vst1.8 {d8}, [r0], r1 - vst1.8 {d9}, [r12], r1 - vst1.8 {d10}, [r0], r1 - vst1.8 {d11}, [r12], r1 - vst1.8 {d12}, [r0], r1 - vst1.8 {d13}, [r12], r1 - vst1.8 {d14}, [r0], r1 - vst1.8 {d15}, [r12], r1 - vst1.8 {d16}, [r0], r1 - vst1.8 {d17}, [r12], r1 - vst1.8 {d18}, [r0], r1 - vst1.8 {d19}, [r12], r1 - vst1.8 {d20}, [r0] - vst1.8 {d21}, [r12] - - pop {pc} - ENDP ; 
|vp8_mbloop_filter_vertical_edge_y_neon| - -; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 unsigned char *v -|vp8_mbloop_filter_vertical_edge_uv_neon| PROC - push {lr} - ldr r12, [sp, #4] ; load thresh - sub r0, r0, #4 ; move u pointer down by 4 columns - vdup.u8 q2, r12 ; thresh - ldr r12, [sp, #8] ; load v ptr - sub r12, r12, #4 ; move v pointer down by 4 columns - - vld1.u8 {d6}, [r0], r1 ;load u data - vld1.u8 {d7}, [r12], r1 ;load v data - vld1.u8 {d8}, [r0], r1 - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d10}, [r0], r1 - vld1.u8 {d11}, [r12], r1 - vld1.u8 {d12}, [r0], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d14}, [r0], r1 - vld1.u8 {d15}, [r12], r1 - vld1.u8 {d16}, [r0], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d18}, [r0], r1 - vld1.u8 {d19}, [r12], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r12], r1 - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - sub r0, r0, r1, lsl #3 - - bl vp8_mbloop_filter_neon - - sub r12, r12, r1, lsl #3 - - ;transpose to 16x8 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - ;store op2, op1, op0, oq0, oq1, oq2 - vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r12], r1 - vst1.8 {d8}, [r0], r1 - vst1.8 {d9}, [r12], r1 - vst1.8 {d10}, [r0], r1 - vst1.8 {d11}, [r12], r1 - vst1.8 {d12}, [r0], r1 - vst1.8 {d13}, [r12], r1 - vst1.8 {d14}, [r0], r1 - vst1.8 {d15}, [r12], r1 - vst1.8 {d16}, [r0], r1 - vst1.8 {d17}, 
[r12], r1 - vst1.8 {d18}, [r0], r1 - vst1.8 {d19}, [r12], r1 - vst1.8 {d20}, [r0] - vst1.8 {d21}, [r12] - - pop {pc} - ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon| - -; void vp8_mbloop_filter_neon() -; This is a helper function for the macroblock loopfilters. The individual -; functions do the necessary load, transpose (if necessary), preserve (if -; necessary) and store. - -; r0,r1 PRESERVE -; r2 mblimit -; r3 limit - -; q2 thresh -; q3 p3 PRESERVE -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 PRESERVE - -|vp8_mbloop_filter_neon| PROC - - ; vp8_filter_mask - vabd.u8 q11, q3, q4 ; abs(p3 - p2) - vabd.u8 q12, q4, q5 ; abs(p2 - p1) - vabd.u8 q13, q5, q6 ; abs(p1 - p0) - vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q1, q9, q8 ; abs(q2 - q1) - vabd.u8 q0, q10, q9 ; abs(q3 - q2) - - vmax.u8 q11, q11, q12 - vmax.u8 q12, q13, q14 - vmax.u8 q1, q1, q0 - vmax.u8 q15, q11, q12 - - vabd.u8 q12, q6, q7 ; abs(p0 - q0) - - ; vp8_hevmask - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1 - vmax.u8 q15, q15, q1 - - vdup.u8 q1, r3 ; limit - vdup.u8 q2, r2 ; mblimit - - vmov.u8 q0, #0x80 ; 0x80 - - vcge.u8 q15, q1, q15 - - vabd.u8 q1, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2 - vmov.u16 q11, #3 ; #3 - - ; vp8_filter - ; convert to signed - veor q7, q7, q0 ; qs0 - vshr.u8 q1, q1, #1 ; a = a / 2 - veor q6, q6, q0 ; ps0 - veor q5, q5, q0 ; ps1 - - vqadd.u8 q12, q12, q1 ; a = b + a - - veor q8, q8, q0 ; qs1 - veor q4, q4, q0 ; ps2 - veor q9, q9, q0 ; qs2 - - vorr q14, q13, q14 ; vp8_hevmask - - vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 - - vsubl.s8 q2, d14, d12 ; qs0 - ps0 - vsubl.s8 q13, d15, d13 - - vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) - - vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0) - - vand q15, q15, q12 ; vp8_filter_mask - - vmul.i16 q13, q13, q11 - - vmov.u8 q12, #3 ; #3 - - vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) - vaddw.s8 q13, q13, d3 - - 
vmov.u8 q11, #4 ; #4 - - ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q13 - - vand q1, q1, q15 ; vp8_filter &= mask - - vmov.u16 q15, #63 ; #63 - - vand q13, q1, q14 ; Filter2 &= hev - - vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4) - vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3) - - vmov q0, q15 - - vshr.s8 q2, q2, #3 ; Filter1 >>= 3 - vshr.s8 q13, q13, #3 ; Filter2 >>= 3 - - vmov q11, q15 - vmov q12, q15 - - vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1) - - vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2) - - vbic q1, q1, q14 ; vp8_filter &= ~hev - - ; roughly 1/7th difference across boundary - ; roughly 2/7th difference across boundary - ; roughly 3/7th difference across boundary - - vmov.u8 d5, #9 ; #9 - vmov.u8 d4, #18 ; #18 - - vmov q13, q15 - vmov q14, q15 - - vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9 - vmlal.s8 q11, d3, d5 - vmov.u8 d5, #27 ; #27 - vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18 - vmlal.s8 q13, d3, d4 - vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27 - vmlal.s8 q15, d3, d5 - - vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7) - vqshrn.s16 d1, q11, #7 - vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7) - vqshrn.s16 d25, q13, #7 - vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7) - vqshrn.s16 d29, q15, #7 - - vmov.u8 q1, #0x80 ; 0x80 - - vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u) - vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u) - vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u) - vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u) - vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u) - vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u) - - veor q9, q11, q1 ; *oq2 = s^0x80 - veor q4, q0, q1 ; *op2 = s^0x80 - veor q8, q13, q1 ; *oq1 = s^0x80 - veor q5, q12, q1 ; *op2 = s^0x80 - veor q7, q15, q1 ; *oq0 = s^0x80 - veor q6, q14, q1 ; *op0 = s^0x80 - - bx lr - ENDP ; |vp8_mbloop_filter_neon| - -;----------------- - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c 
b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c new file mode 100644 index 00000000000..5351f4be665 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c @@ -0,0 +1,625 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vpx_config.h" + +static INLINE void vp8_mbloop_filter_neon( + uint8x16_t qblimit, // mblimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p2 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q4r, // p1 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r, // q1 + uint8x16_t *q9r) { // q1 + uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8; + uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16; + int8x16_t q0s8, q12s8, q14s8, q15s8; + int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q1u8 = vabdq_u8(q9, q8); + q0u8 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q1u8 = vmaxq_u8(q1u8, q0u8); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q12u8 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q1u8); + + q15u8 = 
vcgeq_u8(qlimit, q15u8); + + q1u8 = vabdq_u8(q5, q8); + q12u8 = vqaddq_u8(q12u8, q12u8); + + // vp8_filter() function + // convert to signed + q0u8 = vdupq_n_u8(0x80); + q9 = veorq_u8(q9, q0u8); + q8 = veorq_u8(q8, q0u8); + q7 = veorq_u8(q7, q0u8); + q6 = veorq_u8(q6, q0u8); + q5 = veorq_u8(q5, q0u8); + q4 = veorq_u8(q4, q0u8); + + q1u8 = vshrq_n_u8(q1u8, 1); + q12u8 = vqaddq_u8(q12u8, q1u8); + + q14u8 = vorrq_u8(q13u8, q14u8); + q12u8 = vcgeq_u8(qblimit, q12u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), + vreinterpretq_s8_u8(q8)); + + q11s16 = vdupq_n_s16(3); + q2s16 = vmulq_s16(q2s16, q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q15u8 = vandq_u8(q15u8, q12u8); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8)); + + q12u8 = vdupq_n_u8(3); + q11u8 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2 = vqmovn_s16(q2s16); + d3 = vqmovn_s16(q13s16); + q1s8 = vcombine_s8(d2, d3); + q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8)); + q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8)); + q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q13s8 = vshrq_n_s8(q13s8, 3); + + q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8); + q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8); + + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63); + d5 = vdup_n_s8(9); + d4 = vdup_n_s8(18); + + q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5); + q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5); + d5 = vdup_n_s8(27); + q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4); + q13s16 = 
vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4); + q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5); + q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5); + + d0 = vqshrn_n_s16(q0s16 , 7); + d1 = vqshrn_n_s16(q11s16, 7); + d24 = vqshrn_n_s16(q12s16, 7); + d25 = vqshrn_n_s16(q13s16, 7); + d28 = vqshrn_n_s16(q14s16, 7); + d29 = vqshrn_n_s16(q15s16, 7); + + q0s8 = vcombine_s8(d0, d1); + q12s8 = vcombine_s8(d24, d25); + q14s8 = vcombine_s8(d28, d29); + + q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8); + q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8); + q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8); + q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8); + q15s8 = vqsubq_s8((q7s8), q14s8); + q14s8 = vqaddq_s8((q6s8), q14s8); + + q1u8 = vdupq_n_u8(0x80); + *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8); + *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8); + *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8); + return; +} + +void vp8_mbloop_filter_horizontal_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + src -= (pitch * 6); + vst1q_u8(src, q4); + src += pitch; + 
vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + src += pitch; + vst1q_u8(src, q9); + return; +} + +void vp8_mbloop_filter_horizontal_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + u -= (pitch * 6); + v -= (pitch * 6); + vst1_u8(u, vget_low_u8(q4)); + u += pitch; + vst1_u8(v, vget_high_u8(q4)); + v += pitch; + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(u, vget_low_u8(q8)); + u += 
pitch; + vst1_u8(v, vget_high_u8(q8)); + v += pitch; + vst1_u8(u, vget_low_u8(q9)); + vst1_u8(v, vget_high_u8(q9)); + return; +} + +void vp8_mbloop_filter_vertical_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s1, *s2; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s1 = src - 4; + s2 = s1 + 8 * pitch; + d6 = vld1_u8(s1); + s1 += pitch; + d7 = vld1_u8(s2); + s2 += pitch; + d8 = vld1_u8(s1); + s1 += pitch; + d9 = vld1_u8(s2); + s2 += pitch; + d10 = vld1_u8(s1); + s1 += pitch; + d11 = vld1_u8(s2); + s2 += pitch; + d12 = vld1_u8(s1); + s1 += pitch; + d13 = vld1_u8(s2); + s2 += pitch; + d14 = vld1_u8(s1); + s1 += pitch; + d15 = vld1_u8(s2); + s2 += pitch; + d16 = vld1_u8(s1); + s1 += pitch; + d17 = vld1_u8(s2); + s2 += pitch; + d18 = vld1_u8(s1); + s1 += pitch; + d19 = vld1_u8(s2); + s2 += pitch; + d20 = vld1_u8(s1); + d21 = vld1_u8(s2); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = 
vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = 
vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + s1 -= 7 * pitch; + s2 -= 7 * pitch; + + vst1_u8(s1, vget_low_u8(q3)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q3)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q4)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q4)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q5)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q5)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q6)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q6)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q7)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q7)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q8)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q8)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q9)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q9)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q10)); + vst1_u8(s2, vget_high_u8(q10)); + return; +} + +void vp8_mbloop_filter_vertical_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + vs = v - 4; + d6 = vld1_u8(us); + us += pitch; + d7 = vld1_u8(vs); + vs += pitch; + d8 = vld1_u8(us); + us += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d10 = vld1_u8(us); + us += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d12 = vld1_u8(us); + us += pitch; + d13 = 
vld1_u8(vs); + vs += pitch; + d14 = vld1_u8(us); + us += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d16 = vld1_u8(us); + us += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d18 = vld1_u8(us); + us += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d20 = vld1_u8(us); + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + 
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + ud = u - 4; + vst1_u8(ud, vget_low_u8(q3)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q4)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q5)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q6)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q7)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q8)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q9)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q10)); + + vd = v - 4; + vst1_u8(vd, vget_high_u8(q3)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q4)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q5)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q6)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q7)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q8)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q9)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q10)); + return; +} diff --git 
a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad16_neon.asm deleted file mode 100644 index d7c590e15a2..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad16_neon.asm +++ /dev/null @@ -1,207 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sad16x16_neon| - EXPORT |vp8_sad16x8_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int src_stride -; r2 unsigned char *ref_ptr -; r3 int ref_stride -|vp8_sad16x16_neon| PROC -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabdl.u8 q12, d0, d8 - vabdl.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 
- - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0] - vld1.8 {q7}, [r2] - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vadd.u16 q0, q12, q13 - - vpaddl.u16 q1, q0 - vpaddl.u32 q0, q1 - - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - -;============================== -;unsigned int vp8_sad16x8_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -|vp8_sad16x8_neon| PROC - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabdl.u8 q12, d0, d8 - vabdl.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vadd.u16 q0, q12, q13 - - vpaddl.u16 q1, q0 - vpaddl.u32 q0, q1 - - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad8_neon.asm 
b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad8_neon.asm deleted file mode 100644 index 23ba6df93a4..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad8_neon.asm +++ /dev/null @@ -1,209 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sad8x8_neon| - EXPORT |vp8_sad8x16_neon| - EXPORT |vp8_sad4x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; unsigned int vp8_sad8x8_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) - -|vp8_sad8x8_neon| PROC - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabdl.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q12, d6, d14 - - vpaddl.u16 q1, q12 - vpaddl.u32 q0, q1 - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - -;============================ -;unsigned int vp8_sad8x16_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) - -|vp8_sad8x16_neon| PROC - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabdl.u8 q12, d0, d8 - - vld1.8 {d4}, 
[r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q12, d6, d14 - - vpaddl.u16 q1, q12 - vpaddl.u32 q0, q1 - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - -;=========================== -;unsigned int vp8_sad4x4_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) - -|vp8_sad4x4_neon| PROC - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabdl.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q12, d6, d14 - - vpaddl.u16 d1, d24 - vpaddl.u32 d0, d1 - vmov.32 r0, d0[0] - - bx lr - - ENDP - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c new file mode 100644 index 00000000000..6595ac0519b --- /dev/null +++ 
b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +unsigned int vp8_sad8x8_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 7; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +unsigned int vp8_sad8x16_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 15; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +unsigned int vp8_sad4x4_neon( + unsigned char *src_ptr, + int 
src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x2_t d1; + uint64x1_t d3; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 3; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + d1 = vpaddl_u16(vget_low_u16(q12)); + d3 = vpaddl_u32(d1); + + return vget_lane_u32(vreinterpret_u32_u64(d3), 0); +} + +unsigned int vp8_sad16x16_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x16_t q0, q4; + uint16x8_t q12, q13; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + + for (i = 0; i < 15; i++) { + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } + + q12 = vaddq_u16(q12, q13); + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +unsigned int vp8_sad16x8_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x16_t q0, q4; + uint16x8_t q12, q13; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + + for (i = 0; i < 7; i++) { + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = 
vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } + + q12 = vaddq_u16(q12, q13); + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/save_reg_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/save_reg_neon.asm deleted file mode 100644 index fd7002e7a9e..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/save_reg_neon.asm +++ /dev/null @@ -1,36 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_push_neon| - EXPORT |vp8_pop_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -|vp8_push_neon| PROC - vst1.i64 {d8, d9, d10, d11}, [r0]! - vst1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - -|vp8_pop_neon| PROC - vld1.i64 {d8, d9, d10, d11}, [r0]! - vld1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - - END - diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm deleted file mode 100644 index 67d2ab0150d..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm +++ /dev/null @@ -1,139 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_short_idct4x4llm_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;************************************************************* -;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, -; unsigned char *dst, int stride) -;r0 short * input -;r1 short * pred -;r2 int pitch -;r3 unsigned char dst -;sp int stride -;************************************************************* - -; static const int cospi8sqrt2minus1=20091; -; static const int sinpi8sqrt2 =35468; -; static const int rounding = 0; - -; Optimization note: The resulted data from dequantization are signed -; 13-bit data that is in the range of [-4096, 4095]. This allows to -; use "vqdmulh"(neon) instruction since it won't go out of range -; (13+16+1=30bits<32bits). This instruction gives the high half -; result of the multiplication that is needed in IDCT. 
- -|vp8_short_idct4x4llm_neon| PROC - adr r12, idct_coeff - vld1.16 {q1, q2}, [r0] - vld1.16 {d0}, [r12] - - vswp d3, d4 ;q2(vp[4] vp[12]) - ldr r0, [sp] ; stride - - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) - vqadd.s16 q4, q4, q2 - - ;d6 - c1:temp1 - ;d7 - d1:temp2 - ;d8 - d1:temp1 - ;d9 - c1:temp2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - - vswp d3, d4 - - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) - vqadd.s16 q4, q4, q2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vrshr.s16 d2, d2, #3 - vrshr.s16 d3, d3, #3 - vrshr.s16 d4, d4, #3 - vrshr.s16 d5, d5, #3 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - - ; load prediction data - vld1.32 d6[0], [r1], r2 - vld1.32 d6[1], [r1], r2 - vld1.32 d7[0], [r1], r2 - vld1.32 d7[1], [r1], r2 - - ; add prediction and residual - vaddw.u8 q1, q1, d6 - vaddw.u8 q2, q2, d7 - - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - - ; store to destination - vst1.32 d1[0], [r3], r0 - vst1.32 d1[1], [r3], r0 - vst1.32 d2[0], [r3], r0 - vst1.32 d2[1], [r3], r0 - - bx lr - - ENDP - -;----------------- - -idct_coeff - DCD 0x4e7b4e7b, 0x8a8c8a8c - -;20091, 20091, 35468, 35468 - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c 
/* Diff header text that shares this physical line with the C file; kept verbatim:
b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c new file mode 100644 index 00000000000..373afa6ed35 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c @@ -0,0 +1,123 @@
*/

/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <string.h>

#include <arm_neon.h>

/*
 * IDCT multipliers in Q16 fixed point:
 *   cospi8sqrt2minus1 = (sqrt(2) * cos(pi / 8) - 1) * 65536 = 20091
 *   sinpi8sqrt2       =  sqrt(2) * sin(pi / 8)      * 65536 = 35468
 *
 * 35468 does not fit in int16_t, so it is stored as 35468 - 65536 (-30068),
 * which is the value the previous out-of-range initializer wrapped to.
 * The code below relies on that:
 *   vqdmulh(x, c - 65536) = ((2 * x * c) >> 16) - 2 * x
 *   >> 1                  = ((x * c) >> 16) - x
 *   + x                   =  (x * c) >> 16
 * The same shift/add sequence applied to cospi8sqrt2minus1 yields
 * x + ((x * 20091) >> 16), i.e. x * sqrt(2) * cos(pi / 8).
 */
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 35468 - 65536;

/*
 * 4x4 inverse DCT of `input`, added to a prediction block and written out.
 *
 *   input        16 coefficients, loaded as four vectors of four.
 *   pred_ptr     prediction block: four rows of four bytes, rows
 *                pred_stride bytes apart.
 *   pred_stride  distance in bytes between prediction rows.
 *   dst_ptr      output block: four rows of four bytes, rows dst_stride
 *                bytes apart.
 *   dst_stride   distance in bytes between output rows.
 *
 * Each output byte is prediction + residual, saturated to [0, 255].
 * Neither pred_ptr nor dst_ptr needs any particular alignment: rows are
 * moved with memcpy rather than through uint32_t pointers.
 */
void vp8_short_idct4x4llm_neon(
    int16_t *input,
    unsigned char *pred_ptr,
    int pred_stride,
    unsigned char *dst_ptr,
    int dst_stride) {
  int i;
  uint32_t row;  // one 4-pixel row moved between memory and a vector lane
  uint32x2_t d6u32 = vdup_n_u32(0);
  uint8x8_t d1u8;
  int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
  uint16x8_t q1u16;
  int16x8_t q1s16, q2s16, q3s16, q4s16;
  int32x2x2_t v2tmp0, v2tmp1;
  int16x4x2_t v2tmp2, v2tmp3;

  d2 = vld1_s16(input);
  d3 = vld1_s16(input + 4);
  d4 = vld1_s16(input + 8);
  d5 = vld1_s16(input + 12);

  // First 1-D pass: q1 holds vectors 0 and 2, q2 holds vectors 1 and 3.
  q1s16 = vcombine_s16(d2, d4);
  q2s16 = vcombine_s16(d3, d5);

  q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
  q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);

  d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
  d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1

  // Undo the doubling of vqdmulh, then add the operand back (see the
  // comment on the constants above).
  q3s16 = vshrq_n_s16(q3s16, 1);
  q4s16 = vshrq_n_s16(q4s16, 1);

  q3s16 = vqaddq_s16(q3s16, q2s16);
  q4s16 = vqaddq_s16(q4s16, q2s16);

  d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
  d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1

  d2 = vqadd_s16(d12, d11);
  d3 = vqadd_s16(d13, d10);
  d4 = vqsub_s16(d13, d10);
  d5 = vqsub_s16(d12, d11);

  // Transpose the 4x4 intermediate so the second pass runs along the
  // other dimension.
  v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
  v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
  v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
                    vreinterpret_s16_s32(v2tmp1.val[0]));
  v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
                    vreinterpret_s16_s32(v2tmp1.val[1]));

  // Second 1-D pass: same butterfly on the transposed data.
  q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
  q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);

  q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
  q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);

  d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
  d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1

  q3s16 = vshrq_n_s16(q3s16, 1);
  q4s16 = vshrq_n_s16(q4s16, 1);

  q3s16 = vqaddq_s16(q3s16, q2s16);
  q4s16 = vqaddq_s16(q4s16, q2s16);

  d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
  d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1

  d2 = vqadd_s16(d12, d11);
  d3 = vqadd_s16(d13, d10);
  d4 = vqsub_s16(d13, d10);
  d5 = vqsub_s16(d12, d11);

  // Final scaling: (x + 4) >> 3.
  d2 = vrshr_n_s16(d2, 3);
  d3 = vrshr_n_s16(d3, 3);
  d4 = vrshr_n_s16(d4, 3);
  d5 = vrshr_n_s16(d5, 3);

  // Transpose back so each 4-lane vector is one output row.
  v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
  v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
  v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
                    vreinterpret_s16_s32(v2tmp1.val[0]));
  v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
                    vreinterpret_s16_s32(v2tmp1.val[1]));

  q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);  // rows 0 and 1
  q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);  // rows 2 and 3

  // Add the residual to the prediction and store, two rows per iteration
  // (the second iteration picks up rows 2 and 3 from q2s16).
  for (i = 0; i < 2; i++, q1s16 = q2s16) {
    memcpy(&row, pred_ptr, sizeof(row));
    d6u32 = vset_lane_u32(row, d6u32, 0);
    pred_ptr += pred_stride;
    memcpy(&row, pred_ptr, sizeof(row));
    d6u32 = vset_lane_u32(row, d6u32, 1);
    pred_ptr += pred_stride;

    // Widening add in 16 bits, then saturate the signed sum to u8.
    q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
                     vreinterpret_u8_u32(d6u32));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));

    row = vget_lane_u32(vreinterpret_u32_u8(d1u8), 0);
    memcpy(dst_ptr, &row, sizeof(row));
    dst_ptr += dst_stride;
    row = vget_lane_u32(vreinterpret_u32_u8(d1u8), 1);
    memcpy(dst_ptr, &row, sizeof(row));
    dst_ptr += dst_stride;
  }
}

/* Start of the next (deleted) diff hunk that shares this physical line with the C file; kept verbatim:
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm deleted file mode 100644 index 9fdafd3609e..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm +++ /dev/null @@ -1,490 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict16x16_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter16_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -;Note: To take advantage of 8-bit mulplication instruction in NEON. First apply abs() to -; filter coeffs to make them u8. Then, use vmlsl for negtive coeffs. After multiplication, -; the result can be negtive. So, I treat the result as s16.
*/
But, since it is also possible -; that the result can be a large positive number (> 2^15-1), which could be confused as a -; negtive number. To avoid that error, apply filter coeffs in the order of 0, 1, 4 ,5 ,2, -; which ensures that the result stays in s16 range. Finally, saturated add the result by -; applying 3rd filter coeff. Same applys to other filter functions. - -|vp8_sixtap_predict16x16_neon| PROC - push {r4-r5, lr} - - adr r12, filter16_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter16x16_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter16x16_only - - sub sp, sp, #336 ;reserve space on stack for temporary storage - mov lr, sp - - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #7 ;loop counter - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - sub r0, r0, r1, lsl #1 - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First Pass: output_height lines x output_width columns (21x16) -filt_blk2d_fp16x16_loop_neon - vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data - vld1.u8 {d9, d10, d11}, [r0], r1 - vld1.u8 {d12, d13, d14}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q9, d7, d0 - vmull.u8 q10, d9, d0 - vmull.u8 q11, d10, d0 - vmull.u8 q12, d12, d0 - vmull.u8 q13, d13, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d9, d10, #1 - vext.8 d30, d12, d13, #1 - - vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q10, d29, d1 - vmlsl.u8 q12, d30, d1 - - vext.8 d28, d7, d8, #1 - vext.8 d29, d10, d11, #1 - vext.8 d30, d13, d14, #1 - - vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * 
vp8_filter[1]) - vmlsl.u8 q11, d29, d1 - vmlsl.u8 q13, d30, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d9, d10, #4 - vext.8 d30, d12, d13, #4 - - vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q10, d29, d4 - vmlsl.u8 q12, d30, d4 - - vext.8 d28, d7, d8, #4 - vext.8 d29, d10, d11, #4 - vext.8 d30, d13, d14, #4 - - vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q11, d29, d4 - vmlsl.u8 q13, d30, d4 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d9, d10, #5 - vext.8 d30, d12, d13, #5 - - vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q10, d29, d5 - vmlal.u8 q12, d30, d5 - - vext.8 d28, d7, d8, #5 - vext.8 d29, d10, d11, #5 - vext.8 d30, d13, d14, #5 - - vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q11, d29, d5 - vmlal.u8 q13, d30, d5 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d9, d10, #2 - vext.8 d30, d12, d13, #2 - - vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q10, d29, d2 - vmlal.u8 q12, d30, d2 - - vext.8 d28, d7, d8, #2 - vext.8 d29, d10, d11, #2 - vext.8 d30, d13, d14, #2 - - vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q11, d29, d2 - vmlal.u8 q13, d30, d2 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d9, d10, #3 - vext.8 d30, d12, d13, #3 - - vext.8 d15, d7, d8, #3 - vext.8 d31, d10, d11, #3 - vext.8 d6, d13, d14, #3 - - vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q5, d29, d3 - vmull.u8 q6, d30, d3 - - vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters) - vqadd.s16 q10, q5 - vqadd.s16 q12, q6 - - vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q7, d31, d3 - vmull.u8 q3, d6, d3 - - subs r2, r2, #1 - - vqadd.s16 q9, q6 - vqadd.s16 q11, q7 - vqadd.s16 q13, q3 - - vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q9, #7 - vqrshrun.s16 d8, q10, #7 - vqrshrun.s16 d9, q11, #7 - vqrshrun.s16 d10, q12, #7 - vqrshrun.s16 d11, q13, #7 - - 
vst1.u8 {d6, d7, d8}, [lr]! ;store result - vst1.u8 {d9, d10, d11}, [lr]! - - bne filt_blk2d_fp16x16_loop_neon - -;Second pass: 16x16 -;secondpass_filter - do first 8-columns and then second 8-columns - add r3, r12, r3, lsl #5 - sub lr, lr, #336 - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - mov r3, #2 ;loop counter - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - mov r2, #16 - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - -filt_blk2d_sp16x16_outloop_neon - vld1.u8 {d18}, [lr], r2 ;load src data - vld1.u8 {d19}, [lr], r2 - vld1.u8 {d20}, [lr], r2 - vld1.u8 {d21}, [lr], r2 - mov r12, #4 ;loop counter - vld1.u8 {d22}, [lr], r2 - -secondpass_inner_loop_neon - vld1.u8 {d23}, [lr], r2 ;load src data - vld1.u8 {d24}, [lr], r2 - vld1.u8 {d25}, [lr], r2 - vld1.u8 {d26}, [lr], r2 - - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r12, r12, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vmov q9, q11 - vst1.u8 {d7}, 
[r4], r5 - vmov q10, q12 - vst1.u8 {d8}, [r4], r5 - vmov d22, d26 - vst1.u8 {d9}, [r4], r5 - - bne secondpass_inner_loop_neon - - subs r3, r3, #1 - sub lr, lr, #336 - add lr, lr, #8 - - sub r4, r4, r5, lsl #4 - add r4, r4, #8 - - bne filt_blk2d_sp16x16_outloop_neon - - add sp, sp, #336 - pop {r4-r5,pc} - -;-------------------- -firstpass_filter16x16_only - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #8 ;loop counter - sub r0, r0, #2 ;move srcptr back to (column-2) - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First Pass: output_height lines x output_width columns (16x16) -filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data - vld1.u8 {d9, d10, d11}, [r0], r1 - - pld [r0] - pld [r0, r1] - - vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q7, d7, d0 - vmull.u8 q8, d9, d0 - vmull.u8 q9, d10, d0 - - vext.8 d20, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d21, d9, d10, #1 - vext.8 d22, d7, d8, #1 - vext.8 d23, d10, d11, #1 - vext.8 d24, d6, d7, #4 ;construct src_ptr[2] - vext.8 d25, d9, d10, #4 - vext.8 d26, d7, d8, #4 - vext.8 d27, d10, d11, #4 - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d9, d10, #5 - - vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d21, d1 - vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q9, d23, d1 - vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d25, d4 - vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q9, d27, d4 - vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q8, d29, d5 - - vext.8 d20, d7, d8, #5 - vext.8 d21, d10, d11, #5 - vext.8 d22, d6, d7, #2 ;construct src_ptr[0] - vext.8 d23, d9, d10, #2 - vext.8 d24, d7, d8, #2 - vext.8 d25, d10, d11, #2 - - vext.8 d26, d6, d7, #3 ;construct src_ptr[1] - vext.8 d27, d9, d10, #3 - vext.8 d28, d7, d8, #3 - vext.8 d29, d10, d11, #3 - - vmlal.u8 q7, d20, d5 
;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q9, d21, d5 - vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d23, d2 - vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q9, d25, d2 - - vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q11, d27, d3 - vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q15, d29, d3 - - vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q11 - vqadd.s16 q7, q12 - vqadd.s16 q9, q15 - - subs r2, r2, #1 - - vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q7, #7 - vqrshrun.s16 d8, q8, #7 - vqrshrun.s16 d9, q9, #7 - - vst1.u8 {q3}, [r4], r5 ;store result - vst1.u8 {q4}, [r4], r5 - - bne filt_blk2d_fpo16x16_loop_neon - - pop {r4-r5,pc} - -;-------------------- -secondpass_filter16x16_only -;Second pass: 16x16 - add r3, r12, r3, lsl #5 - sub r0, r0, r1, lsl #1 - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - mov r3, #2 ;loop counter - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - -filt_blk2d_spo16x16_outloop_neon - vld1.u8 {d18}, [r0], r1 ;load src data - vld1.u8 {d19}, [r0], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r0], r1 - mov r12, #4 ;loop counter - vld1.u8 {d22}, [r0], r1 - -secondpass_only_inner_loop_neon - vld1.u8 {d23}, [r0], r1 ;load src data - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, 
d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r12, r12, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vmov q9, q11 - vst1.u8 {d7}, [r4], r5 - vmov q10, q12 - vst1.u8 {d8}, [r4], r5 - vmov d22, d26 - vst1.u8 {d9}, [r4], r5 - - bne secondpass_only_inner_loop_neon - - subs r3, r3, #1 - sub r0, r0, r1, lsl #4 - sub r0, r0, r1, lsl #2 - sub r0, r0, r1 - add r0, r0, #8 - - sub r4, r4, r5, lsl #4 - add r4, r4, #8 - - bne filt_blk2d_spo16x16_outloop_neon - - pop {r4-r5,pc} - - ENDP - -;----------------- - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm deleted file mode 100644 index a4222bc62c5..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm +++ /dev/null @@ -1,422 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_sixtap_predict4x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter4_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_sixtap_predict4x4_neon| PROC - push {r4, lr} - - adr r12, filter4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter4x4_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter4x4_only - - vabs.s32 q12, q14 ;get abs(filer_parameters) - vabs.s32 q13, q15 - - sub r0, r0, #2 ;go back 2 columns of src data - sub r0, r0, r1, lsl #1 ;go back 2 lines of src data - -;First pass: output_height lines x output_width columns (9x4) - vld1.u8 {q3}, [r0], r1 ;load first 4-line src data - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vld1.u8 {q4}, [r0], r1 - vdup.8 d1, d24[4] - vld1.u8 {q5}, [r0], r1 - vdup.8 d2, d25[0] - vld1.u8 {q6}, [r0], r1 - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vext.8 d18, d6, d7, #5 ;construct src_ptr[3] - vext.8 d19, d8, d9, #5 - vext.8 d20, d10, d11, #5 - vext.8 d21, d12, d13, #5 - - vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done - vswp d11, d12 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) - vzip.32 d20, d21 - vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) - vmull.u8 q8, d20, d5 - - vmov q4, q3 ;keep original src data in q4 
q6 - vmov q6, q5 - - vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together - vzip.32 d10, d11 - vshr.u64 q9, q4, #8 ;construct src_ptr[-1] - vshr.u64 q10, q6, #8 - vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) - vmlal.u8 q8, d10, d0 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #32 ;construct src_ptr[2] - vshr.u64 q5, q6, #32 - vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d20, d1 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) - vzip.32 d10, d11 - vshr.u64 q9, q4, #16 ;construct src_ptr[0] - vshr.u64 q10, q6, #16 - vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d10, d4 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #24 ;construct src_ptr[1] - vshr.u64 q5, q6, #24 - vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d20, d2 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q10, d10, d3 - - vld1.u8 {q3}, [r0], r1 ;load rest 5-line src data - vld1.u8 {q4}, [r0], r1 - - vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q10 - - vld1.u8 {q5}, [r0], r1 - vld1.u8 {q6}, [r0], r1 - - vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d28, q8, #7 - - ;First Pass on rest 5-line data - vld1.u8 {q11}, [r0], r1 - - vext.8 d18, d6, d7, #5 ;construct src_ptr[3] - vext.8 d19, d8, d9, #5 - vext.8 d20, d10, d11, #5 - vext.8 d21, d12, d13, #5 - - vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done - vswp d11, d12 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) - vzip.32 d20, d21 - vext.8 d31, d22, d23, #5 ;construct src_ptr[3] - vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) - vmull.u8 q8, d20, d5 - vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5]) - - vmov q4, q3 ;keep original src data in q4 q6 - vmov q6, q5 - - 
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together - vzip.32 d10, d11 - vshr.u64 q9, q4, #8 ;construct src_ptr[-1] - vshr.u64 q10, q6, #8 - - vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) - vmlal.u8 q8, d10, d0 - vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #32 ;construct src_ptr[2] - vshr.u64 q5, q6, #32 - vext.8 d31, d22, d23, #1 ;construct src_ptr[-1] - - vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d20, d1 - vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1]) - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) - vzip.32 d10, d11 - vshr.u64 q9, q4, #16 ;construct src_ptr[0] - vshr.u64 q10, q6, #16 - vext.8 d31, d22, d23, #4 ;construct src_ptr[2] - - vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d10, d4 - vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4]) - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #24 ;construct src_ptr[1] - vshr.u64 q5, q6, #24 - vext.8 d31, d22, d23, #2 ;construct src_ptr[0] - - vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d20, d2 - vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2]) - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - vext.8 d31, d22, d23, #3 ;construct src_ptr[1] - vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q10, d10, d3 - vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3]) - - add r3, r12, r3, lsl #5 - - vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q10 - vqadd.s16 q12, q11 - - vext.8 d23, d27, d28, #4 - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - - vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d30, q8, #7 - vqrshrun.s16 d31, q12, #7 - -;Second pass: 4x4 - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vext.8 d24, d28, d29, #4 - vext.8 d25, d29, d30, #4 - vext.8 
d26, d30, d31, #4 - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - - vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d28, d0 - - vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5]) - vmull.u8 q6, d26, d5 - - vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d30, d4 - - vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q6, d24, d1 - - vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d29, d2 - - vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3]) - vmlal.u8 q6, d25, d3 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q6, q4 - - vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d4, q6, #7 - - vst1.32 {d3[0]}, [r4] ;store result - vst1.32 {d3[1]}, [r0] - vst1.32 {d4[0]}, [r1] - vst1.32 {d4[1]}, [r2] - - pop {r4, pc} - - -;--------------------- -firstpass_filter4x4_only - vabs.s32 q12, q14 ;get abs(filer_parameters) - vabs.s32 q13, q15 - - sub r0, r0, #2 ;go back 2 columns of src data - -;First pass: output_height lines x output_width columns (4x4) - vld1.u8 {q3}, [r0], r1 ;load first 4-line src data - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vld1.u8 {q4}, [r0], r1 - vdup.8 d1, d24[4] - vld1.u8 {q5}, [r0], r1 - vdup.8 d2, d25[0] - vld1.u8 {q6}, [r0], r1 - - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - - vext.8 d18, d6, d7, #5 ;construct src_ptr[3] - vext.8 d19, d8, d9, #5 - vext.8 d20, d10, d11, #5 - vext.8 d21, d12, d13, #5 - - vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done - vswp d11, d12 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) - vzip.32 d20, d21 - vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) - vmull.u8 q8, d20, d5 - - vmov q4, q3 ;keep original src data in q4 q6 - vmov q6, q5 - - vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line 
data together - vzip.32 d10, d11 - vshr.u64 q9, q4, #8 ;construct src_ptr[-1] - vshr.u64 q10, q6, #8 - vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) - vmlal.u8 q8, d10, d0 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #32 ;construct src_ptr[2] - vshr.u64 q5, q6, #32 - vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d20, d1 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) - vzip.32 d10, d11 - vshr.u64 q9, q4, #16 ;construct src_ptr[0] - vshr.u64 q10, q6, #16 - vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d10, d4 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #24 ;construct src_ptr[1] - vshr.u64 q5, q6, #24 - vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d20, d2 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q10, d10, d3 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q10 - - vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d28, q8, #7 - - vst1.32 {d27[0]}, [r4] ;store result - vst1.32 {d27[1]}, [r0] - vst1.32 {d28[0]}, [r1] - vst1.32 {d28[1]}, [r2] - - pop {r4, pc} - - -;--------------------- -secondpass_filter4x4_only - sub r0, r0, r1, lsl #1 - add r3, r12, r3, lsl #5 - - vld1.32 {d27[0]}, [r0], r1 ;load src data - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vld1.32 {d27[1]}, [r0], r1 - vabs.s32 q7, q5 - vld1.32 {d28[0]}, [r0], r1 - vabs.s32 q8, q6 - vld1.32 {d28[1]}, [r0], r1 - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vld1.32 {d29[0]}, [r0], r1 - vdup.8 d1, d14[4] - vld1.32 {d29[1]}, [r0], r1 - vdup.8 d2, d15[0] - vld1.32 {d30[0]}, [r0], r1 - vdup.8 d3, d15[4] - vld1.32 {d30[1]}, [r0], r1 - vdup.8 d4, d16[0] - vld1.32 {d31[0]}, [r0], r1 - vdup.8 d5, 
d16[4] - - vext.8 d23, d27, d28, #4 - vext.8 d24, d28, d29, #4 - vext.8 d25, d29, d30, #4 - vext.8 d26, d30, d31, #4 - - vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d28, d0 - - vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5]) - vmull.u8 q6, d26, d5 - - vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d30, d4 - - vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q6, d24, d1 - - vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d29, d2 - - vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3]) - vmlal.u8 q6, d25, d3 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q6, q4 - - vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d4, q6, #7 - - vst1.32 {d3[0]}, [r4] ;store result - vst1.32 {d3[1]}, [r0] - vst1.32 {d4[0]}, [r1] - vst1.32 {d4[1]}, [r2] - - pop {r4, pc} - - ENDP - -;----------------- - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm deleted file mode 100644 index a57ec015f2c..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm +++ /dev/null @@ -1,473 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_sixtap_predict8x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter8_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_sixtap_predict8x4_neon| PROC - push {r4-r5, lr} - - adr r12, filter8_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter8x4_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter8x4_only - - sub sp, sp, #32 ;reserve space on stack for temporary storage - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - mov lr, sp - sub r0, r0, r1, lsl #1 - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - -;First pass: output_height lines x output_width columns (9x8) - vld1.u8 {q3}, [r0], r1 ;load src data - vdup.8 d3, d25[4] - vld1.u8 {q4}, [r0], r1 - vdup.8 d4, d26[0] - vld1.u8 {q5}, [r0], r1 - vdup.8 d5, d26[4] - vld1.u8 {q6}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 
d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vld1.u8 {q3}, [r0], r1 ;load src data - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vld1.u8 {q4}, [r0], r1 - vst1.u8 {d22}, [lr]! ;store result - vld1.u8 {q5}, [r0], r1 - vst1.u8 {d23}, [lr]! - vld1.u8 {q6}, [r0], r1 - vst1.u8 {d24}, [lr]! - vld1.u8 {q7}, [r0], r1 - vst1.u8 {d25}, [lr]! 
- - ;first_pass filtering on the rest 5-line data - vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - vmull.u8 q11, d12, d0 - vmull.u8 q12, d14, d0 - - vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d28, d8, d9, #1 - vext.8 d29, d10, d11, #1 - vext.8 d30, d12, d13, #1 - vext.8 d31, d14, d15, #1 - - vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q9, d28, d1 - vmlsl.u8 q10, d29, d1 - vmlsl.u8 q11, d30, d1 - vmlsl.u8 q12, d31, d1 - - vext.8 d27, d6, d7, #4 ;construct src_ptr[2] - vext.8 d28, d8, d9, #4 - vext.8 d29, d10, d11, #4 - vext.8 d30, d12, d13, #4 - vext.8 d31, d14, d15, #4 - - vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q9, d28, d4 - vmlsl.u8 q10, d29, d4 - vmlsl.u8 q11, d30, d4 - vmlsl.u8 q12, d31, d4 - - vext.8 d27, d6, d7, #2 ;construct src_ptr[0] - vext.8 d28, d8, d9, #2 - vext.8 d29, d10, d11, #2 - vext.8 d30, d12, d13, #2 - vext.8 d31, d14, d15, #2 - - vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q9, d28, d2 - vmlal.u8 q10, d29, d2 - vmlal.u8 q11, d30, d2 - vmlal.u8 q12, d31, d2 - - vext.8 d27, d6, d7, #5 ;construct src_ptr[3] - vext.8 d28, d8, d9, #5 - vext.8 d29, d10, d11, #5 - vext.8 d30, d12, d13, #5 - vext.8 d31, d14, d15, #5 - - vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q9, d28, d5 - vmlal.u8 q10, d29, d5 - vmlal.u8 q11, d30, d5 - vmlal.u8 q12, d31, d5 - - vext.8 d27, d6, d7, #3 ;construct src_ptr[1] - vext.8 d28, d8, d9, #3 - vext.8 d29, d10, d11, #3 - vext.8 d30, d12, d13, #3 - vext.8 d31, d14, d15, #3 - - vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q4, d28, d3 - vmull.u8 q5, d29, d3 - vmull.u8 q6, d30, d3 - vmull.u8 q7, d31, d3 - - vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q9, q4 - vqadd.s16 q10, q5 - vqadd.s16 q11, q6 - vqadd.s16 q12, q7 - - vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d27, q9, #7 - vqrshrun.s16 d28, q10, #7 - vqrshrun.s16 d29, 
q11, #7 - vqrshrun.s16 d30, q12, #7 - -;Second pass: 8x4 -;secondpass_filter - add r3, r12, r3, lsl #5 - sub lr, lr, #32 - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vld1.u8 {q11}, [lr]! ;load intermediate data from stack - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vld1.u8 {q12}, [lr]! - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - - vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d23, d0 - vmull.u8 q5, d24, d0 - vmull.u8 q6, d25, d0 - - vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q4, d24, d1 - vmlsl.u8 q5, d25, d1 - vmlsl.u8 q6, d26, d1 - - vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d27, d4 - vmlsl.u8 q5, d28, d4 - vmlsl.u8 q6, d29, d4 - - vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d25, d2 - vmlal.u8 q5, d26, d2 - vmlal.u8 q6, d27, d2 - - vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q4, d28, d5 - vmlal.u8 q5, d29, d5 - vmlal.u8 q6, d30, d5 - - vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q8, d26, d3 - vmull.u8 q9, d27, d3 - vmull.u8 q10, d28, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vst1.u8 {d7}, [r4], r5 - vst1.u8 {d8}, [r4], r5 - vst1.u8 {d9}, [r4], r5 - - add sp, sp, #32 - pop {r4-r5,pc} - -;-------------------- -firstpass_filter8x4_only - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - vld1.u8 {q3}, [r0], r1 ;load src data - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vld1.u8 {q4}, [r0], r1 - vdup.8 d1, d24[4] - vld1.u8 {q5}, [r0], r1 - vdup.8 d2, d25[0] - vld1.u8 {q6}, [r0], r1 - vdup.8 d3, d25[4] - 
vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First pass: output_height lines x output_width columns (4x8) - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vst1.u8 {d22}, [r4], r5 ;store result - vst1.u8 {d23}, [r4], r5 - vst1.u8 {d24}, [r4], r5 - vst1.u8 {d25}, [r4], r5 - - pop {r4-r5,pc} - -;--------------------- -secondpass_filter8x4_only 
-;Second pass: 8x4 - add r3, r12, r3, lsl #5 - sub r0, r0, r1, lsl #1 - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vld1.u8 {d22}, [r0], r1 - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vld1.u8 {d25}, [r0], r1 - vdup.8 d1, d14[4] - vld1.u8 {d26}, [r0], r1 - vdup.8 d2, d15[0] - vld1.u8 {d27}, [r0], r1 - vdup.8 d3, d15[4] - vld1.u8 {d28}, [r0], r1 - vdup.8 d4, d16[0] - vld1.u8 {d29}, [r0], r1 - vdup.8 d5, d16[4] - vld1.u8 {d30}, [r0], r1 - - vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d23, d0 - vmull.u8 q5, d24, d0 - vmull.u8 q6, d25, d0 - - vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q4, d24, d1 - vmlsl.u8 q5, d25, d1 - vmlsl.u8 q6, d26, d1 - - vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d27, d4 - vmlsl.u8 q5, d28, d4 - vmlsl.u8 q6, d29, d4 - - vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d25, d2 - vmlal.u8 q5, d26, d2 - vmlal.u8 q6, d27, d2 - - vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q4, d28, d5 - vmlal.u8 q5, d29, d5 - vmlal.u8 q6, d30, d5 - - vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q8, d26, d3 - vmull.u8 q9, d27, d3 - vmull.u8 q10, d28, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vst1.u8 {d7}, [r4], r5 - vst1.u8 {d8}, [r4], r5 - vst1.u8 {d9}, [r4], r5 - - pop {r4-r5,pc} - - ENDP - -;----------------- - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm deleted file mode 100644 index 00ed5aeefe3..00000000000 --- 
a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm +++ /dev/null @@ -1,524 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict8x8_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter8_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_sixtap_predict8x8_neon| PROC - push {r4-r5, lr} - - adr r12, filter8_coeff - - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter8x8_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter8x8_only - - sub sp, sp, #64 ;reserve space on stack for temporary storage - mov lr, sp - - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #2 ;loop counter - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - sub r0, r0, r1, lsl #1 - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - -;First pass: output_height lines x output_width columns (13x8) - vld1.u8 {q3}, [r0], r1 ;load src data - vdup.8 d3, 
d25[4] - vld1.u8 {q4}, [r0], r1 - vdup.8 d4, d26[0] - vld1.u8 {q5}, [r0], r1 - vdup.8 d5, d26[4] - vld1.u8 {q6}, [r0], r1 - -filt_blk2d_fp8x8_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - - subs r2, r2, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vld1.u8 {q3}, [r0], r1 ;load src data - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vst1.u8 {d22}, [lr]! 
;store result - vld1.u8 {q4}, [r0], r1 - vst1.u8 {d23}, [lr]! - vld1.u8 {q5}, [r0], r1 - vst1.u8 {d24}, [lr]! - vld1.u8 {q6}, [r0], r1 - vst1.u8 {d25}, [lr]! - - bne filt_blk2d_fp8x8_loop_neon - - ;first_pass filtering on the rest 5-line data - ;vld1.u8 {q3}, [r0], r1 ;load src data - ;vld1.u8 {q4}, [r0], r1 - ;vld1.u8 {q5}, [r0], r1 - ;vld1.u8 {q6}, [r0], r1 - vld1.u8 {q7}, [r0], r1 - - vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - vmull.u8 q11, d12, d0 - vmull.u8 q12, d14, d0 - - vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d28, d8, d9, #1 - vext.8 d29, d10, d11, #1 - vext.8 d30, d12, d13, #1 - vext.8 d31, d14, d15, #1 - - vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q9, d28, d1 - vmlsl.u8 q10, d29, d1 - vmlsl.u8 q11, d30, d1 - vmlsl.u8 q12, d31, d1 - - vext.8 d27, d6, d7, #4 ;construct src_ptr[2] - vext.8 d28, d8, d9, #4 - vext.8 d29, d10, d11, #4 - vext.8 d30, d12, d13, #4 - vext.8 d31, d14, d15, #4 - - vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q9, d28, d4 - vmlsl.u8 q10, d29, d4 - vmlsl.u8 q11, d30, d4 - vmlsl.u8 q12, d31, d4 - - vext.8 d27, d6, d7, #2 ;construct src_ptr[0] - vext.8 d28, d8, d9, #2 - vext.8 d29, d10, d11, #2 - vext.8 d30, d12, d13, #2 - vext.8 d31, d14, d15, #2 - - vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q9, d28, d2 - vmlal.u8 q10, d29, d2 - vmlal.u8 q11, d30, d2 - vmlal.u8 q12, d31, d2 - - vext.8 d27, d6, d7, #5 ;construct src_ptr[3] - vext.8 d28, d8, d9, #5 - vext.8 d29, d10, d11, #5 - vext.8 d30, d12, d13, #5 - vext.8 d31, d14, d15, #5 - - vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q9, d28, d5 - vmlal.u8 q10, d29, d5 - vmlal.u8 q11, d30, d5 - vmlal.u8 q12, d31, d5 - - vext.8 d27, d6, d7, #3 ;construct src_ptr[1] - vext.8 d28, d8, d9, #3 - vext.8 d29, d10, d11, #3 - vext.8 d30, d12, d13, #3 - vext.8 d31, d14, d15, #3 - - vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q4, d28, d3 - 
vmull.u8 q5, d29, d3 - vmull.u8 q6, d30, d3 - vmull.u8 q7, d31, d3 - - vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q9, q4 - vqadd.s16 q10, q5 - vqadd.s16 q11, q6 - vqadd.s16 q12, q7 - - add r3, r12, r3, lsl #5 - - vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 - sub lr, lr, #64 - vqrshrun.s16 d27, q9, #7 - vld1.u8 {q9}, [lr]! ;load intermediate data from stack - vqrshrun.s16 d28, q10, #7 - vld1.u8 {q10}, [lr]! - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - - vqrshrun.s16 d29, q11, #7 - vld1.u8 {q11}, [lr]! - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vqrshrun.s16 d30, q12, #7 - vld1.u8 {q12}, [lr]! - -;Second pass: 8x8 - mov r3, #2 ;loop counter - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - -filt_blk2d_sp8x8_loop_neon - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r3, r3, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vmov q9, q11 - vst1.u8 {d6}, [r4], r5 ;store result - vmov q10, q12 - vst1.u8 {d7}, [r4], 
r5 - vmov q11, q13 - vst1.u8 {d8}, [r4], r5 - vmov q12, q14 - vst1.u8 {d9}, [r4], r5 - vmov d26, d30 - - bne filt_blk2d_sp8x8_loop_neon - - add sp, sp, #64 - pop {r4-r5,pc} - -;--------------------- -firstpass_filter8x8_only - ;add r2, r12, r2, lsl #5 ;calculate filter location - ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #2 ;loop counter - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First pass: output_height lines x output_width columns (8x8) -filt_blk2d_fpo8x8_loop_neon - vld1.u8 {q3}, [r0], r1 ;load src data - vld1.u8 {q4}, [r0], r1 - vld1.u8 {q5}, [r0], r1 - vld1.u8 {q6}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 
q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - ; - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - subs r2, r2, #1 - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vst1.u8 {d22}, [r4], r5 ;store result - vst1.u8 {d23}, [r4], r5 - vst1.u8 {d24}, [r4], r5 - vst1.u8 {d25}, [r4], r5 - - bne filt_blk2d_fpo8x8_loop_neon - - pop {r4-r5,pc} - -;--------------------- -secondpass_filter8x8_only - sub r0, r0, r1, lsl #1 - add r3, r12, r3, lsl #5 - - vld1.u8 {d18}, [r0], r1 ;load src data - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vld1.u8 {d19}, [r0], r1 - vabs.s32 q7, q5 - vld1.u8 {d20}, [r0], r1 - vabs.s32 q8, q6 - vld1.u8 {d21}, [r0], r1 - mov r3, #2 ;loop counter - vld1.u8 {d22}, [r0], r1 - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vld1.u8 {d23}, [r0], r1 - vdup.8 d1, d14[4] - vld1.u8 {d24}, [r0], r1 - vdup.8 d2, d15[0] - vld1.u8 {d25}, [r0], r1 - vdup.8 d3, d15[4] - vld1.u8 {d26}, [r0], r1 - vdup.8 d4, d16[0] - vld1.u8 {d27}, [r0], r1 - vdup.8 d5, d16[4] - vld1.u8 {d28}, [r0], r1 - vld1.u8 {d29}, [r0], r1 - vld1.u8 {d30}, [r0], r1 - -;Second pass: 8x8 -filt_blk2d_spo8x8_loop_neon - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, 
d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r3, r3, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vmov q9, q11 - vst1.u8 {d6}, [r4], r5 ;store result - vmov q10, q12 - vst1.u8 {d7}, [r4], r5 - vmov q11, q13 - vst1.u8 {d8}, [r4], r5 - vmov q12, q14 - vst1.u8 {d9}, [r4], r5 - vmov d26, d30 - - bne filt_blk2d_spo8x8_loop_neon - - pop {r4-r5,pc} - - ENDP - -;----------------- - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c new file mode 100644 index 00000000000..7a4d9e05128 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -0,0 +1,1752 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#ifdef _MSC_VER +#define __builtin_prefetch(x) +#endif + +static const int8_t vp8_sub_pel_filters[8][8] = { + {0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positions are */ + {0, -6, 123, 12, -1, 0, 0, 0}, /* just as per alpha -0.5 bicubic */ + {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */ + {0, -9, 93, 50, -6, 0, 0, 0}, + {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */ + {0, -6, 50, 93, -9, 0, 0, 0}, + {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */ + {0, -1, 12, 123, -6, 0, 0, 0}, +}; + +void vp8_sixtap_predict4x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8; + uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint32x2_t d27u32, d28u32, d29u32, d30u32, d31u32; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8; + uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64; + uint32x2x2_t d0u32x2, d1u32x2; + + if (xoffset == 0) { // secondpass_filter4x4_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - 
src_pixels_per_line * 2; + d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0); + src += src_pixels_per_line; + d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1); + src += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0); + src += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1); + src += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0); + src += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1); + src += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0); + src += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1); + src += src_pixels_per_line; + d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0); + + d27u8 = vreinterpret_u8_u32(d27u32); + d28u8 = vreinterpret_u8_u32(d28u32); + d29u8 = vreinterpret_u8_u32(d29u32); + d30u8 = vreinterpret_u8_u32(d30u32); + d31u8 = vreinterpret_u8_u32(d31u32); + + d23u8 = vext_u8(d27u8, d28u8, 4); + d24u8 = vext_u8(d28u8, d29u8, 4); + d25u8 = vext_u8(d29u8, d30u8, 4); + d26u8 = vext_u8(d30u8, d31u8, 4); + + q3u16 = vmull_u8(d27u8, d0u8); + q4u16 = vmull_u8(d28u8, d0u8); + q5u16 = vmull_u8(d25u8, d5u8); + q6u16 = vmull_u8(d26u8, d5u8); + + q3u16 = vmlsl_u8(q3u16, d29u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d30u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d1u8); + + q3u16 = vmlal_u8(q3u16, d28u8, d2u8); + q4u16 = vmlal_u8(q4u16, d29u8, d2u8); + q5u16 = vmlal_u8(q5u16, d24u8, d3u8); + q6u16 = vmlal_u8(q6u16, d25u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + + q5s16 = vqaddq_s16(q5s16, q3s16); + q6s16 = vqaddq_s16(q6s16, q4s16); + + d3u8 = vqrshrun_n_s16(q5s16, 7); + d4u8 = vqrshrun_n_s16(q6s16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += 
dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1); + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + + d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + // vswp here + q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8)); + q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8)); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19 + vreinterpret_u32_u8(d19u8)); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21 + vreinterpret_u32_u8(d21u8)); + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8); + + // keep original src data in q4 q6 + q4u64 = 
vreinterpretq_u64_u8(q3u8); + q6u64 = vreinterpretq_u64_u8(q5u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7 + vreinterpret_u32_u8(vget_high_u8(q3u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11 + vreinterpret_u32_u8(vget_high_u8(q5u8))); + q9u64 = vshrq_n_u64(q4u64, 8); + q10u64 = vshrq_n_u64(q6u64, 8); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 32); + q5u64 = vshrq_n_u64(q6u64, 32); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + q9u64 = vshrq_n_u64(q4u64, 16); + q10u64 = vshrq_n_u64(q6u64, 16); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 24); + q5u64 = vshrq_n_u64(q6u64, 24); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + 
d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8); + q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8); + + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q7s16 = vqaddq_s16(q7s16, q9s16); + q8s16 = vqaddq_s16(q8s16, q10s16); + + d27u8 = vqrshrun_n_s16(q7s16, 7); + d28u8 = vqrshrun_n_s16(q8s16, 7); + + if (yoffset == 0) { // firstpass_filter4x4_only + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1); + return; + } + + // First Pass on rest 5-line data + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q11u8 = vld1q_u8(src); + + d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + // vswp here + q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8)); + q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8)); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19 + vreinterpret_u32_u8(d19u8)); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21 + vreinterpret_u32_u8(d21u8)); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5); + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), 
d5u8); + q12u16 = vmull_u8(d31u8, d5u8); + + q4u64 = vreinterpretq_u64_u8(q3u8); + q6u64 = vreinterpretq_u64_u8(q5u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7 + vreinterpret_u32_u8(vget_high_u8(q3u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11 + vreinterpret_u32_u8(vget_high_u8(q5u8))); + q9u64 = vshrq_n_u64(q4u64, 8); + q10u64 = vshrq_n_u64(q6u64, 8); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 32); + q5u64 = vshrq_n_u64(q6u64, 32); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + q9u64 = vshrq_n_u64(q4u64, 16); + q10u64 = vshrq_n_u64(q6u64, 16); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = 
vshrq_n_u64(q4u64, 24); + q5u64 = vshrq_n_u64(q6u64, 24); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3); + q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8); + q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8); + q11u16 = vmull_u8(d31u8, d3u8); + + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q7s16 = vqaddq_s16(q7s16, q9s16); + q8s16 = vqaddq_s16(q8s16, q10s16); + q12s16 = vqaddq_s16(q12s16, q11s16); + + d29u8 = vqrshrun_n_s16(q7s16, 7); + d30u8 = vqrshrun_n_s16(q8s16, 7); + d31u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 4x4 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + d23u8 = vext_u8(d27u8, d28u8, 4); + d24u8 = vext_u8(d28u8, d29u8, 4); + d25u8 = vext_u8(d29u8, d30u8, 4); + d26u8 = vext_u8(d30u8, d31u8, 4); + + q3u16 = vmull_u8(d27u8, d0u8); + q4u16 = vmull_u8(d28u8, d0u8); + 
q5u16 = vmull_u8(d25u8, d5u8); + q6u16 = vmull_u8(d26u8, d5u8); + + q3u16 = vmlsl_u8(q3u16, d29u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d30u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d1u8); + + q3u16 = vmlal_u8(q3u16, d28u8, d2u8); + q4u16 = vmlal_u8(q4u16, d29u8, d2u8); + q5u16 = vmlal_u8(q5u16, d24u8, d3u8); + q6u16 = vmlal_u8(q6u16, d25u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + + q5s16 = vqaddq_s16(q5s16, q3s16); + q6s16 = vqaddq_s16(q6s16, q4s16); + + d3u8 = vqrshrun_n_s16(q5s16, 7); + d4u8 = vqrshrun_n_s16(q6s16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1); + return; +} + +void vp8_sixtap_predict8x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8; + + if (xoffset == 0) { // secondpass_filter8x4_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 
4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = 
vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + 
q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = 
vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + return; + } + + // First Pass on rest 5-line data + src += src_pixels_per_line; + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, 
d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = 
vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x4 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = 
vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + return; +} + +void vp8_sixtap_predict8x8_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src, *tmpp; + unsigned char tmp[64]; + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8; + uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = 
vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = 
vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + + tmpp = tmp; + for (i = 2; i > 0; i--) { + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), 
d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = 
vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + dst_ptr += dst_pitch; + } else { + vst1_u8(tmpp, d22u8); + tmpp += 8; + vst1_u8(tmpp, d23u8); + tmpp += 8; + vst1_u8(tmpp, d24u8); + tmpp += 8; + vst1_u8(tmpp, d25u8); + tmpp += 8; + } + } + if (yoffset == 0) + return; + + // First Pass on rest 5-line data + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), 
vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), 
vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x8 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + tmpp = tmp; + q9u8 = vld1q_u8(tmpp); + tmpp += 16; + q10u8 = vld1q_u8(tmpp); + tmpp += 16; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + q12u8 = vld1q_u8(tmpp); + + d18u8 = vget_low_u8(q9u8); + d19u8 = vget_high_u8(q9u8); + d20u8 = vget_low_u8(q10u8); + d21u8 = vget_high_u8(q10u8); + d22u8 = vget_low_u8(q11u8); + d23u8 = vget_high_u8(q11u8); + d24u8 = vget_low_u8(q12u8); + d25u8 = vget_high_u8(q12u8); + + for (i = 2; i > 
0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } + return; +} + +void vp8_sixtap_predict16x16_neon( + unsigned char *src_ptr, + int 
src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src, *src_tmp, *dst, *tmpp; + unsigned char tmp[336]; + int i, j; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8; + uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8; + uint8x8_t d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint8x16_t q3u8, q4u8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16; + uint16x8_t q11u16, q12u16, q13u16, q15u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16; + int16x8_t q11s16, q12s16, q13s16, q15s16; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src_tmp = src_ptr - src_pixels_per_line * 2; + for (i = 0; i < 2; i++) { + src = src_tmp + i * 8; + dst = dst_ptr + i * 8; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + for (j = 0; j < 4; j++) { + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + + 
q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = 
vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) { // firstpass_filter4x4_only + src = src_ptr - 2; + dst = dst_ptr; + for (i = 0; i < 8; i++) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + + q6u16 = vmull_u8(d6u8, d0u8); + q7u16 = vmull_u8(d7u8, d0u8); + q8u16 = vmull_u8(d9u8, d0u8); + q9u16 = vmull_u8(d10u8, d0u8); + + d20u8 = vext_u8(d6u8, d7u8, 1); + d21u8 = vext_u8(d9u8, d10u8, 1); + d22u8 = vext_u8(d7u8, d8u8, 1); + d23u8 = vext_u8(d10u8, d11u8, 1); + d24u8 = vext_u8(d6u8, d7u8, 4); + d25u8 = vext_u8(d9u8, d10u8, 4); + d26u8 = vext_u8(d7u8, d8u8, 4); + d27u8 = vext_u8(d10u8, d11u8, 4); + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + + q6u16 = vmlsl_u8(q6u16, d20u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d21u8, d1u8); + q7u16 = vmlsl_u8(q7u16, d22u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d25u8, d4u8); + q7u16 = vmlsl_u8(q7u16, d26u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d27u8, d4u8); + q6u16 = vmlal_u8(q6u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + + d20u8 = vext_u8(d7u8, d8u8, 5); + d21u8 = vext_u8(d10u8, d11u8, 5); + d22u8 = vext_u8(d6u8, d7u8, 2); + d23u8 = vext_u8(d9u8, d10u8, 2); + d24u8 = vext_u8(d7u8, d8u8, 2); + d25u8 = vext_u8(d10u8, d11u8, 2); + d26u8 = vext_u8(d6u8, d7u8, 3); + d27u8 = vext_u8(d9u8, d10u8, 3); 
+ d28u8 = vext_u8(d7u8, d8u8, 3); + d29u8 = vext_u8(d10u8, d11u8, 3); + + q7u16 = vmlal_u8(q7u16, d20u8, d5u8); + q9u16 = vmlal_u8(q9u16, d21u8, d5u8); + q6u16 = vmlal_u8(q6u16, d22u8, d2u8); + q8u16 = vmlal_u8(q8u16, d23u8, d2u8); + q7u16 = vmlal_u8(q7u16, d24u8, d2u8); + q9u16 = vmlal_u8(q9u16, d25u8, d2u8); + + q10u16 = vmull_u8(d26u8, d3u8); + q11u16 = vmull_u8(d27u8, d3u8); + q12u16 = vmull_u8(d28u8, d3u8); + q15u16 = vmull_u8(d29u8, d3u8); + + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q15s16 = vreinterpretq_s16_u16(q15u16); + + q6s16 = vqaddq_s16(q6s16, q10s16); + q8s16 = vqaddq_s16(q8s16, q11s16); + q7s16 = vqaddq_s16(q7s16, q12s16); + q9s16 = vqaddq_s16(q9s16, q15s16); + + d6u8 = vqrshrun_n_s16(q6s16, 7); + d7u8 = vqrshrun_n_s16(q7s16, 7); + d8u8 = vqrshrun_n_s16(q8s16, 7); + d9u8 = vqrshrun_n_s16(q9s16, 7); + + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + vst1q_u8(dst, q3u8); + dst += dst_pitch; + vst1q_u8(dst, q4u8); + dst += dst_pitch; + } + return; + } + + src = src_ptr - 2 - src_pixels_per_line * 2; + tmpp = tmp; + for (i = 0; i < 7; i++) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d12u8 = vld1_u8(src); + d13u8 = vld1_u8(src + 8); + d14u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q8u16 = vmull_u8(d6u8, d0u8); + q9u16 = vmull_u8(d7u8, d0u8); + q10u16 = vmull_u8(d9u8, d0u8); + q11u16 = vmull_u8(d10u8, d0u8); + q12u16 = vmull_u8(d12u8, d0u8); + q13u16 = vmull_u8(d13u8, d0u8); + 
+ d28u8 = vext_u8(d6u8, d7u8, 1); + d29u8 = vext_u8(d9u8, d10u8, 1); + d30u8 = vext_u8(d12u8, d13u8, 1); + q8u16 = vmlsl_u8(q8u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d1u8); + d28u8 = vext_u8(d7u8, d8u8, 1); + d29u8 = vext_u8(d10u8, d11u8, 1); + d30u8 = vext_u8(d13u8, d14u8, 1); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d1u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d1u8); + + d28u8 = vext_u8(d6u8, d7u8, 4); + d29u8 = vext_u8(d9u8, d10u8, 4); + d30u8 = vext_u8(d12u8, d13u8, 4); + q8u16 = vmlsl_u8(q8u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d4u8); + d28u8 = vext_u8(d7u8, d8u8, 4); + d29u8 = vext_u8(d10u8, d11u8, 4); + d30u8 = vext_u8(d13u8, d14u8, 4); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d4u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d4u8); + + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + d30u8 = vext_u8(d12u8, d13u8, 5); + q8u16 = vmlal_u8(q8u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q12u16 = vmlal_u8(q12u16, d30u8, d5u8); + d28u8 = vext_u8(d7u8, d8u8, 5); + d29u8 = vext_u8(d10u8, d11u8, 5); + d30u8 = vext_u8(d13u8, d14u8, 5); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q11u16 = vmlal_u8(q11u16, d29u8, d5u8); + q13u16 = vmlal_u8(q13u16, d30u8, d5u8); + + d28u8 = vext_u8(d6u8, d7u8, 2); + d29u8 = vext_u8(d9u8, d10u8, 2); + d30u8 = vext_u8(d12u8, d13u8, 2); + q8u16 = vmlal_u8(q8u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q12u16 = vmlal_u8(q12u16, d30u8, d2u8); + d28u8 = vext_u8(d7u8, d8u8, 2); + d29u8 = vext_u8(d10u8, d11u8, 2); + d30u8 = vext_u8(d13u8, d14u8, 2); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q11u16 = vmlal_u8(q11u16, d29u8, d2u8); + q13u16 = vmlal_u8(q13u16, d30u8, d2u8); + + d28u8 = vext_u8(d6u8, d7u8, 3); + d29u8 = vext_u8(d9u8, d10u8, 3); + d30u8 = vext_u8(d12u8, d13u8, 3); + d15u8 = vext_u8(d7u8, d8u8, 3); + d31u8 = 
vext_u8(d10u8, d11u8, 3); + d6u8 = vext_u8(d13u8, d14u8, 3); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q12s16 = vqaddq_s16(q12s16, q6s16); + + q6u16 = vmull_u8(d15u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + q3u16 = vmull_u8(d6u8, d3u8); + q3s16 = vreinterpretq_s16_u16(q3u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q13s16 = vreinterpretq_s16_u16(q13u16); + q9s16 = vqaddq_s16(q9s16, q6s16); + q11s16 = vqaddq_s16(q11s16, q7s16); + q13s16 = vqaddq_s16(q13s16, q3s16); + + d6u8 = vqrshrun_n_s16(q8s16, 7); + d7u8 = vqrshrun_n_s16(q9s16, 7); + d8u8 = vqrshrun_n_s16(q10s16, 7); + d9u8 = vqrshrun_n_s16(q11s16, 7); + d10u8 = vqrshrun_n_s16(q12s16, 7); + d11u8 = vqrshrun_n_s16(q13s16, 7); + + vst1_u8(tmpp, d6u8); + tmpp += 8; + vst1_u8(tmpp, d7u8); + tmpp += 8; + vst1_u8(tmpp, d8u8); + tmpp += 8; + vst1_u8(tmpp, d9u8); + tmpp += 8; + vst1_u8(tmpp, d10u8); + tmpp += 8; + vst1_u8(tmpp, d11u8); + tmpp += 8; + } + + // Second pass: 16x16 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + for (i = 0; i < 2; i++) { + dst = 
dst_ptr + 8 * i; + tmpp = tmp + 8 * i; + d18u8 = vld1_u8(tmpp); + tmpp += 16; + d19u8 = vld1_u8(tmpp); + tmpp += 16; + d20u8 = vld1_u8(tmpp); + tmpp += 16; + d21u8 = vld1_u8(tmpp); + tmpp += 16; + d22u8 = vld1_u8(tmpp); + tmpp += 16; + for (j = 0; j < 4; j++) { + d23u8 = vld1_u8(tmpp); + tmpp += 16; + d24u8 = vld1_u8(tmpp); + tmpp += 16; + d25u8 = vld1_u8(tmpp); + tmpp += 16; + d26u8 = vld1_u8(tmpp); + tmpp += 16; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = 
d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } + return; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.asm deleted file mode 100644 index e3b48327d3f..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.asm +++ /dev/null @@ -1,276 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_variance16x16_neon| - EXPORT |vp8_variance16x8_neon| - EXPORT |vp8_variance8x16_neon| - EXPORT |vp8_variance8x8_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance16x16_neon| PROC - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #8 - -variance16x16_neon_loop - vld1.8 {q0}, [r0], r1 ;Load up source and reference - vld1.8 {q2}, [r2], r3 - vld1.8 {q1}, [r0], r1 - vld1.8 {q3}, [r2], r3 - - vsubl.u8 q11, d0, d4 ;calculate diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - ;VPADAL adds adjacent pairs of elements of a vector, and accumulates - ;the results into the elements of the destination vector. The explanation - ;in ARM guide is wrong. 
- vpadal.s16 q8, q11 ;calculate sum - vmlal.s16 q9, d22, d22 ;calculate sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne variance16x16_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - ldr r12, [sp] ;load *sse from stack - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - ;vmov.32 r0, d0[0] ;this instruction costs a lot - ;vmov.32 r1, d1[0] - ;mul r0, r0, r0 - ;str r1, [r12] - ;sub r0, r1, r0, lsr #8 - - ; while sum is signed, sum * sum is always positive and must be treated as - ; unsigned to avoid propagating the sign bit. - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r12] ;store sse - vshr.u32 d10, d10, #8 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - bx lr - - ENDP - -;================================ -;unsigned int vp8_variance16x8_c( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *sse) -|vp8_variance16x8_neon| PROC - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #4 - -variance16x8_neon_loop - vld1.8 {q0}, [r0], r1 ;Load up source and reference - vld1.8 {q2}, [r2], r3 - vld1.8 {q1}, [r0], r1 - vld1.8 {q3}, [r2], r3 - - vsubl.u8 q11, d0, d4 ;calculate diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vpadal.s16 q8, q11 ;calculate sum - vmlal.s16 q9, d22, d22 ;calculate sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne variance16x8_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate 
sum - - ldr r12, [sp] ;load *sse from stack - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r12] ;store sse - vshr.u32 d10, d10, #7 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - bx lr - - ENDP - -;================================= -;unsigned int vp8_variance8x16_c( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *sse) - -|vp8_variance8x16_neon| PROC - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #8 - -variance8x16_neon_loop - vld1.8 {d0}, [r0], r1 ;Load up source and reference - vld1.8 {d4}, [r2], r3 - vld1.8 {d2}, [r0], r1 - vld1.8 {d6}, [r2], r3 - - vsubl.u8 q11, d0, d4 ;calculate diff - vsubl.u8 q12, d2, d6 - - vpadal.s16 q8, q11 ;calculate sum - vmlal.s16 q9, d22, d22 ;calculate sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - - bne variance8x16_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - ldr r12, [sp] ;load *sse from stack - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r12] ;store sse - vshr.u32 d10, d10, #7 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - bx lr - - ENDP - -;================================== -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance8x8_neon| PROC - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #2 - -variance8x8_neon_loop - vld1.8 {d0}, [r0], r1 ;Load up source and reference - vld1.8 {d4}, [r2], r3 - vld1.8 {d1}, [r0], r1 - vld1.8 {d5}, [r2], r3 - vld1.8 {d2}, [r0], r1 - vld1.8 {d6}, [r2], r3 - vld1.8 {d3}, [r0], r1 - vld1.8 {d7}, [r2], r3 - - vsubl.u8 q11, d0, d4 ;calculate diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - 
vsubl.u8 q14, d3, d7 - - vpadal.s16 q8, q11 ;calculate sum - vmlal.s16 q9, d22, d22 ;calculate sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne variance8x8_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - ldr r12, [sp] ;load *sse from stack - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r12] ;store sse - vshr.u32 d10, d10, #6 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - bx lr - - ENDP - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c new file mode 100644 index 00000000000..afd2dc3d1e2 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#ifdef _MSC_VER +#define __builtin_prefetch(x) +#endif + +unsigned int vp8_variance16x16_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = 
vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance16x8_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { // variance16x8_neon_loop + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = 
vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance8x16_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d2u8, d4u8, d6u8; + int16x4_t d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint16x8_t q11u16, q12u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + 
q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { // variance8x16_neon_loop + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d2u8, d6u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance8x8_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = 
vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 2; i++) { // variance8x8_neon_loop + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d1u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d3u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d5u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d7u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d1u8, d5u8); + q13u16 = vsubl_u8(d2u8, d6u8); + q14u16 = vsubl_u8(d3u8, d7u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = 
vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm index 9d22c52521c..adc5b7e3a78 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm @@ -31,11 +31,12 @@ bilinear_taps_coeff |vp8_sub_pixel_variance16x16_neon_func| PROC push {r4-r6, lr} + vpush {d8-d15} adr r12, bilinear_taps_coeff - ldr r4, [sp, #16] ;load *dst_ptr from stack - ldr r5, [sp, #20] ;load dst_pixels_per_line from stack - ldr r6, [sp, #24] ;load *sse from stack + ldr r4, [sp, #80] ;load *dst_ptr from stack + ldr r5, [sp, #84] ;load dst_pixels_per_line from stack + ldr r6, [sp, #88] ;load *sse from stack cmp r2, #0 ;skip first_pass filter if xoffset=0 beq secondpass_bfilter16x16_only @@ -416,6 +417,7 @@ sub_pixel_variance16x16_neon_loop add sp, sp, #528 vmov.32 r0, d0[0] ;return + vpop {d8-d15} pop {r4-r6,pc} ENDP diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm index 155be4fc54b..b0829af7547 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm @@ -31,9 +31,10 @@ ;================================================ |vp8_variance_halfpixvar16x16_h_neon| PROC push {lr} + vpush 
{d8-d15} mov r12, #4 ;loop counter - ldr lr, [sp, #4] ;load *sse from stack + ldr lr, [sp, #68] ;load *sse from stack vmov.i8 q8, #0 ;q8 - sum vmov.i8 q9, #0 ;q9, q10 - sse vmov.i8 q10, #0 @@ -116,6 +117,8 @@ vp8_filt_fpo16x16s_4_0_loop_neon vsub.u32 d0, d1, d10 vmov.32 r0, d0[0] ;return + + vpop {d8-d15} pop {pc} ENDP @@ -131,11 +134,12 @@ vp8_filt_fpo16x16s_4_0_loop_neon ;================================================ |vp8_variance_halfpixvar16x16_v_neon| PROC push {lr} + vpush {d8-d15} mov r12, #4 ;loop counter vld1.u8 {q0}, [r0], r1 ;load src data - ldr lr, [sp, #4] ;load *sse from stack + ldr lr, [sp, #68] ;load *sse from stack vmov.i8 q8, #0 ;q8 - sum vmov.i8 q9, #0 ;q9, q10 - sse @@ -212,6 +216,8 @@ vp8_filt_spo16x16s_0_4_loop_neon vsub.u32 d0, d1, d10 vmov.32 r0, d0[0] ;return + + vpop {d8-d15} pop {pc} ENDP @@ -227,10 +233,11 @@ vp8_filt_spo16x16s_0_4_loop_neon ;================================================ |vp8_variance_halfpixvar16x16_hv_neon| PROC push {lr} + vpush {d8-d15} vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data - ldr lr, [sp, #4] ;load *sse from stack + ldr lr, [sp, #68] ;load *sse from stack vmov.i8 q13, #0 ;q8 - sum vext.8 q1, q0, q1, #1 ;construct src_ptr[1] @@ -331,6 +338,8 @@ vp8_filt16x16s_4_4_loop_neon vsub.u32 d0, d1, d10 vmov.32 r0, d0[0] ;return + + vpop {d8-d15} pop {pc} ENDP @@ -349,10 +358,11 @@ vp8_filt16x16s_4_4_loop_neon |vp8_sub_pixel_variance16x16s_neon| PROC push {r4, lr} + vpush {d8-d15} - ldr r4, [sp, #8] ;load *dst_ptr from stack - ldr r12, [sp, #12] ;load dst_pixels_per_line from stack - ldr lr, [sp, #16] ;load *sse from stack + ldr r4, [sp, #72] ;load *dst_ptr from stack + ldr r12, [sp, #76] ;load dst_pixels_per_line from stack + ldr lr, [sp, #80] ;load *sse from stack cmp r2, #0 ;skip first_pass filter if xoffset=0 beq secondpass_bfilter16x16s_only @@ -566,6 +576,7 @@ sub_pixel_variance16x16s_neon_loop add sp, sp, #256 vmov.32 r0, d0[0] ;return + vpop {d8-d15} pop {r4, pc} ENDP diff --git 
a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm index f6b6847537f..9d9f9e0772a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm @@ -26,11 +26,12 @@ |vp8_sub_pixel_variance8x8_neon| PROC push {r4-r5, lr} + vpush {d8-d15} adr r12, bilinear_taps_coeff - ldr r4, [sp, #12] ;load *dst_ptr from stack - ldr r5, [sp, #16] ;load dst_pixels_per_line from stack - ldr lr, [sp, #20] ;load *sse from stack + ldr r4, [sp, #76] ;load *dst_ptr from stack + ldr r5, [sp, #80] ;load dst_pixels_per_line from stack + ldr lr, [sp, #84] ;load *sse from stack cmp r2, #0 ;skip first_pass filter if xoffset=0 beq skip_firstpass_filter @@ -210,6 +211,8 @@ sub_pixel_variance8x8_neon_loop vsub.u32 d0, d1, d10 vmov.32 r0, d0[0] ;return + + vpop {d8-d15} pop {r4-r5, pc} ENDP |