diff options
Diffstat (limited to 'chromium/third_party/ffmpeg/libavcodec/arm/mlpdsp_armv5te.S')
-rw-r--r-- | chromium/third_party/ffmpeg/libavcodec/arm/mlpdsp_armv5te.S | 655 |
1 files changed, 655 insertions, 0 deletions
diff --git a/chromium/third_party/ffmpeg/libavcodec/arm/mlpdsp_armv5te.S b/chromium/third_party/ffmpeg/libavcodec/arm/mlpdsp_armv5te.S
new file mode 100644
index 00000000000..8355cdf3a2b
--- /dev/null
+++ b/chromium/third_party/ffmpeg/libavcodec/arm/mlpdsp_armv5te.S
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+// Layout constants used below to compute strides and offsets:
+//   4 * MAX_CHANNELS                     byte stride between samples of one channel
+//   4 * MAX_FIR_ORDER                    byte offset of the IIR coefficients from PCO
+//   4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)  byte offset of the IIR state from PST
+// NOTE(review): these presumably mirror the constants of the C decoder and
+// must be kept in sync with it - confirm against the C headers.
+#define MAX_CHANNELS 8
+#define MAX_FIR_ORDER 8
+#define MAX_IIR_ORDER 4
+#define MAX_RATEFACTOR 4
+#define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR)
+
+// Register aliases for ff_mlp_filter_channel_arm.
+// a1-a4 carry the first four arguments (state, coeff, firorder, iirorder).
+// a3/a4 are only reused as the accumulator once the jump tables have
+// consumed firorder/iirorder.
+PST     .req    a1      // state pointer; pre-decremented by one word per sample
+PCO     .req    a2      // coefficient pointer
+AC0     .req    a3      // accumulator, low word (or the whole 32-bit sum for shift 0)
+AC1     .req    a4      // accumulator, high word
+CO0     .req    v1      // CO0-CO3: rotating bank of coefficient registers
+CO1     .req    v2
+CO2     .req    v3
+CO3     .req    v4
+ST0     .req    v5      // ST0-ST3: rotating bank of state registers;
+ST1     .req    v6      // ST0/ST1 hold filter_shift/mask on entry to the loops
+ST2     .req    sl
+ST3     .req    fp
+I       .req    ip      // number of samples left to process
+PSAMP   .req    lr      // pointer into sample_buffer
+
+
+// Some macros that do loads/multiplies where the register number is determined
+// from an assembly-time expression. Boy is GNU assembler's syntax ugly...
+
+// load: emit "ldr <group><index>, [<base>, #<offset>]".
+// index may be an assembly-time expression: .altmacro is enabled so that
+// %(expr) is evaluated to a number before load_ pastes it onto the group name.
+.macro load group, index, base, offset
+        .altmacro
+        load_ \group, %(\index), \base, \offset
+        .noaltmacro
+.endm
+
+// load_: worker for load; index must already be a literal number.
+.macro load_ group, index, base, offset
+        ldr     \group\index, [\base, #\offset]
+.endm
+
+// loadd: as load, but loads the register pair <group><index>, <group><index+1>
+// from two consecutive words.
+.macro loadd group, index, base, offset
+        .altmacro
+        loadd_ \group, %(\index), %(\index+1), \base, \offset
+        .noaltmacro
+.endm
+
+// loadd_: worker for loadd.
+// Lines prefixed A/T are presumably ARM-only/Thumb-only (the prefixes come
+// from asm.S, which is not shown here). On the A path, offsets of 256 or more
+// are split into two ldr instructions instead of one ldrd.
+.macro loadd_ group, index0, index1, base, offset
+A .if \offset >= 256
+A       ldr     \group\index0, [\base, #\offset]
+A       ldr     \group\index1, [\base, #(\offset) + 4]
+A .else
+        ldrd    \group\index0, \group\index1, [\base, #\offset]
+A .endif
+.endm
+
+// multiply: multiply CO<index> by ST<index> into the accumulator.
+//   accumulate  non-zero to add to the existing accumulator value
+//   long        non-zero for a 64-bit result in AC1:AC0, zero for 32-bit in AC0
+.macro multiply index, accumulate, long
+        .altmacro
+        multiply_ %(\index), \accumulate, \long
+        .noaltmacro
+.endm
+
+// multiply_: worker for multiply; index must already be a literal number.
+.macro multiply_ index, accumulate, long
+ .if \long
+  .if \accumulate
+        smlal   AC0, AC1, CO\index, ST\index
+  .else
+        smull   AC0, AC1, CO\index, ST\index
+  .endif
+ .else
+  .if \accumulate
+        mla     AC0, CO\index, ST\index, AC0
+  .else
+        mul     AC0, CO\index, ST\index
+  .endif
+ .endif
+.endm
+
+// A macro to update the load register number and load offsets
+//
+// Advances LOAD_REG (modulo 4) and both offsets by \howmany words. While FIR
+// taps remain they are counted down, and when the last one is consumed the
+// offsets jump to the IIR coefficients/state; otherwise IIR taps are counted.
+
+.macro inc howmany
+  .set LOAD_REG, (LOAD_REG + \howmany) & 3
+  .set OFFSET_CO, OFFSET_CO + 4 * \howmany
+  .set OFFSET_ST, OFFSET_ST + 4 * \howmany
+  .if FIR_REMAIN > 0
+    .set FIR_REMAIN, FIR_REMAIN - \howmany
+    .if FIR_REMAIN == 0
+      .set OFFSET_CO, 4 * MAX_FIR_ORDER
+      .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
+    .endif
+  .elseif IIR_REMAIN > 0
+    .set IIR_REMAIN, IIR_REMAIN - \howmany
+  .endif
+.endm
+
+// Macro to implement the inner loop for one specific combination of parameters
+//
+// Parameters (all assembly-time constants):
+//   mask_minus1  non-zero when the mask is -1, so the AND with MASK is omitted
+//   shift_0      non-zero when filter_shift is 0: 32-bit mul/mla is used and
+//                no accumulator shift is emitted
+//   shift_8      non-zero when filter_shift is 8: the shift is an immediate
+//   iir_taps     number of IIR taps
+//   fir_taps     number of FIR taps
+// On entry filter_shift is in ST0, mask in ST1, the sample count in I and the
+// sample pointer in PSAMP. Every expansion ends with a branch to label 99
+// (the function epilogue), so consecutive expansions never fall through.
+// NOTE(review): the loop tests I only after the first iteration, so it
+// appears to assume blocksize >= 1 - confirm against the caller.
+
+.macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps
+  .set TOTAL_TAPS, \iir_taps + \fir_taps
+
+  // Deal with register allocation...
+  // DEFINED_x: alias x was created and must be .unreq'd at the end
+  // SHUFFLE_x: x is moved out of ST0/ST1 into another register before the loop
+  // SPILL_x:   x is reloaded from the stacked arguments on every iteration
+  .set DEFINED_SHIFT, 0
+  .set DEFINED_MASK, 0
+  .set SHUFFLE_SHIFT, 0
+  .set SHUFFLE_MASK, 0
+  .set SPILL_SHIFT, 0
+  .set SPILL_MASK, 0
+  .if TOTAL_TAPS == 0
+    // Little register pressure in this case - just keep MASK where it was
+    .if !\mask_minus1
+      MASK .req ST1
+      .set DEFINED_MASK, 1
+    .endif
+  .else
+    .if \shift_0
+      .if !\mask_minus1
+        // AC1 is unused with shift 0
+        MASK .req AC1
+        .set DEFINED_MASK, 1
+        .set SHUFFLE_MASK, 1
+      .endif
+    .elseif \shift_8
+      .if !\mask_minus1
+        .if TOTAL_TAPS <= 4
+        // All coefficients are preloaded (so pointer not needed)
+          MASK .req PCO
+          .set DEFINED_MASK, 1
+          .set SHUFFLE_MASK, 1
+        .else
+          .set SPILL_MASK, 1
+        .endif
+      .endif
+    .else // shift not 0 or 8
+      .if TOTAL_TAPS <= 3
+        // All coefficients are preloaded, and at least one CO register is unused
+        .if \fir_taps & 1
+          SHIFT .req CO0
+          .set DEFINED_SHIFT, 1
+          .set SHUFFLE_SHIFT, 1
+        .else
+          SHIFT .req CO3
+          .set DEFINED_SHIFT, 1
+          .set SHUFFLE_SHIFT, 1
+        .endif
+        .if !\mask_minus1
+          MASK .req PCO
+          .set DEFINED_MASK, 1
+          .set SHUFFLE_MASK, 1
+        .endif
+      .elseif TOTAL_TAPS == 4
+        // All coefficients are preloaded
+        SHIFT .req PCO
+        .set DEFINED_SHIFT, 1
+        .set SHUFFLE_SHIFT, 1
+        .if !\mask_minus1
+          .set SPILL_MASK, 1
+        .endif
+      .else
+        .set SPILL_SHIFT, 1
+        .if !\mask_minus1
+          .set SPILL_MASK, 1
+        .endif
+      .endif
+    .endif
+  .endif
+  // Spilled values are reloaded into ST0/ST1 after the multiplies of each
+  // iteration, once those state registers are free again
+  .if SPILL_SHIFT
+    SHIFT .req ST0
+    .set DEFINED_SHIFT, 1
+  .endif
+  .if SPILL_MASK
+    MASK .req ST1
+    .set DEFINED_MASK, 1
+  .endif
+
+        // Preload coefficients if possible
+        // (FIR coefficients start at offset 0, IIR ones at 4 * MAX_FIR_ORDER;
+        // the starting register matches the one chosen inside the loop)
+  .if TOTAL_TAPS <= 4
+    .set OFFSET_CO, 0
+    .if \fir_taps & 1
+      .set LOAD_REG, 1
+    .else
+      .set LOAD_REG, 0
+    .endif
+    .rept \fir_taps
+        load    CO, LOAD_REG, PCO, OFFSET_CO
+      .set LOAD_REG, (LOAD_REG + 1) & 3
+      .set OFFSET_CO, OFFSET_CO + 4
+    .endr
+    .set OFFSET_CO, 4 * MAX_FIR_ORDER
+    .rept \iir_taps
+        load    CO, LOAD_REG, PCO, OFFSET_CO
+      .set LOAD_REG, (LOAD_REG + 1) & 3
+      .set OFFSET_CO, OFFSET_CO + 4
+    .endr
+  .endif
+
+        // Move mask/shift to final positions if necessary
+        // Need to do this after preloading, because in some cases we
+        // reuse the coefficient pointer register
+  .if SHUFFLE_SHIFT
+        mov     SHIFT, ST0
+  .endif
+  .if SHUFFLE_MASK
+        mov     MASK, ST1
+  .endif
+
+        // Begin loop
+01:
+  .if TOTAL_TAPS == 0
+        // Things simplify a lot in this case
+        // In fact this could be pipelined further if it's worth it...
+        // With no taps the (optionally masked) sample is simply written to
+        // both state arrays and back to the sample buffer
+        ldr     ST0, [PSAMP]
+        subs    I, I, #1                // sets the flags tested by bne below
+    .if !\mask_minus1
+        and     ST0, ST0, MASK
+    .endif
+        str     ST0, [PST, #-4]!        // state grows downwards, with writeback
+        str     ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
+        str     ST0, [PSAMP], #4 * MAX_CHANNELS // store, then step to next sample
+        bne     01b
+  .else
+    .if \fir_taps & 1
+      .set LOAD_REG, 1
+    .else
+      .set LOAD_REG, 0
+    .endif
+    .set LOAD_BANK, 0
+    .set FIR_REMAIN, \fir_taps
+    .set IIR_REMAIN, \iir_taps
+    .if FIR_REMAIN == 0 // only IIR terms
+      .set OFFSET_CO, 4 * MAX_FIR_ORDER
+      .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
+    .else
+      .set OFFSET_CO, 0
+      .set OFFSET_ST, 0
+    .endif
+    .set MUL_REG, LOAD_REG
+    .set COUNTER, 0
+    // Unrolled schedule: each step issues at most one load (a coefficient
+    // load only when the coefficients were not preloaded) and, from the
+    // third step on, one multiply of a previously loaded pair. LOAD_BANK
+    // alternates double loads between the CO and ST banks.
+    .rept TOTAL_TAPS + 2
+        // Do load(s)
+     .if FIR_REMAIN != 0 || IIR_REMAIN != 0
+      .if COUNTER == 0
+       .if TOTAL_TAPS > 4
+        load    CO, LOAD_REG, PCO, OFFSET_CO
+       .endif
+        load    ST, LOAD_REG, PST, OFFSET_ST
+        inc     1
+      .elseif COUNTER == 1 && (\fir_taps & 1) == 0
+       .if TOTAL_TAPS > 4
+        load    CO, LOAD_REG, PCO, OFFSET_CO
+       .endif
+        load    ST, LOAD_REG, PST, OFFSET_ST
+        inc     1
+      .elseif LOAD_BANK == 0
+       .if TOTAL_TAPS > 4
+        .if FIR_REMAIN == 0 && IIR_REMAIN == 1
+        load    CO, LOAD_REG, PCO, OFFSET_CO
+        .else
+        loadd   CO, LOAD_REG, PCO, OFFSET_CO
+        .endif
+       .endif
+       .set LOAD_BANK, 1
+      .else
+       .if FIR_REMAIN == 0 && IIR_REMAIN == 1
+        load    ST, LOAD_REG, PST, OFFSET_ST
+        inc     1
+       .else
+        loadd   ST, LOAD_REG, PST, OFFSET_ST
+        inc     2
+       .endif
+       .set LOAD_BANK, 0
+      .endif
+     .endif
+
+        // Do interleaved multiplies, slightly delayed
+        // (the first multiply starts the sum, later ones accumulate; 64-bit
+        // arithmetic is used unless the shift is 0)
+     .if COUNTER >= 2
+        multiply MUL_REG, COUNTER > 2, !\shift_0
+      .set MUL_REG, (MUL_REG + 1) & 3
+     .endif
+     .set COUNTER, COUNTER + 1
+    .endr
+
+        // Post-process the result of the multiplies
+        // The function prologue pushed 9 registers, so the stacked arguments
+        // start at sp + 9*4: filter_shift first, then mask
+    .if SPILL_SHIFT
+        ldr     SHIFT, [sp, #9*4 + 0*4]
+    .endif
+    .if SPILL_MASK
+        ldr     MASK, [sp, #9*4 + 1*4]
+    .endif
+        ldr     ST2, [PSAMP]            // ST2 = current sample
+        subs    I, I, #1                // sets the flags tested by bne below
+    .if \shift_8
+        mov     AC0, AC0, lsr #8        // AC0 = (AC1:AC0 >> 8), low 32 bits
+        orr     AC0, AC0, AC1, lsl #24
+    .elseif !\shift_0
+        rsb     ST3, SHIFT, #32         // ST3 = 32 - SHIFT
+        mov     AC0, AC0, lsr SHIFT     // AC0 = (AC1:AC0 >> SHIFT), low 32 bits
+A       orr     AC0, AC0, AC1, lsl ST3
+T       mov     AC1, AC1, lsl ST3       // T path: shift and orr done separately
+T       orr     AC0, AC0, AC1
+    .endif
+    .if \mask_minus1
+        add     ST3, ST2, AC0           // result = sample + accumulator
+    .else
+        add     ST2, ST2, AC0
+        and     ST3, ST2, MASK          // result = (sample + accumulator) & mask
+        sub     ST2, ST3, AC0           // ST2 = result - accumulator
+    .endif
+        str     ST3, [PST, #-4]!        // result goes at the front of the state
+        str     ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)] // ST2 to the second state array
+        str     ST3, [PSAMP], #4 * MAX_CHANNELS // write result, step to next sample
+        bne     01b
+  .endif
+        b       99f
+
+  // Release the aliases so that the next expansion can redefine them
+  .if DEFINED_SHIFT
+    .unreq SHIFT
+  .endif
+  .if DEFINED_MASK
+    .unreq MASK
+  .endif
+.endm
+
+// switch_on_fir_taps: dispatch on firorder (in a3) to the matching
+// implement_filter expansion for a fixed number of IIR taps.
+// A path: pc reads two instructions ahead, so the word table starts with one
+// padding entry. T path: tbh branches via halfword offsets relative to label 0.
+// Cases for 5 or more FIR taps are only emitted while the total tap count
+// stays at 8 or below. No case falls through: each ends with "b 99f".
+.macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps
+A       ldr     pc, [pc, a3, LSL #2] // firorder is in range 0-(8-iir_taps)
+T       tbh     [pc, a3, lsl #1]
+0:
+A       .word   0, 70f, 71f, 72f, 73f, 74f
+T       .hword  (70f - 0b) / 2, (71f - 0b) / 2, (72f - 0b) / 2, (73f - 0b) / 2, (74f - 0b) / 2
+ .if \iir_taps <= 3
+A       .word   75f
+T       .hword  (75f - 0b) / 2
+  .if \iir_taps <= 2
+A       .word   76f
+T       .hword  (76f - 0b) / 2
+   .if \iir_taps <= 1
+A       .word   77f
+T       .hword  (77f - 0b) / 2
+    .if \iir_taps == 0
+A       .word   78f
+T       .hword  (78f - 0b) / 2
+    .endif
+   .endif
+  .endif
+ .endif
+70:     implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
+71:     implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
+72:     implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
+73:     implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
+74:     implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
+ .if \iir_taps <= 3
+75:     implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
+  .if \iir_taps <= 2
+76:     implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
+   .if \iir_taps <= 1
+77:     implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
+    .if \iir_taps == 0
+78:     implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
+    .endif
+   .endif
+  .endif
+ .endif
+.endm
+
+// switch_on_iir_taps: dispatch on iirorder (in a4) to the matching
+// switch_on_fir_taps expansion, using the same jump table scheme.
+.macro switch_on_iir_taps mask_minus1, shift_0, shift_8
+A       ldr     pc, [pc, a4, LSL #2] // iirorder is in range 0-4
+T       tbh     [pc, a4, lsl #1]
+0:
+A       .word   0, 60f, 61f, 62f, 63f, 64f
+T       .hword  (60f - 0b) / 2, (61f - 0b) / 2, (62f - 0b) / 2, (63f - 0b) / 2, (64f - 0b) / 2
+60:     switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0
+61:     switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1
+62:     switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2
+63:     switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3
+64:     switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 4
+.endm
+
+/* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
+ *                                int firorder, int iirorder,
+ *                                unsigned int filter_shift, int32_t mask,
+ *                                int blocksize, int32_t *sample_buffer);
+ *
+ * state, coeff, firorder and iirorder arrive in a1-a4; the remaining four
+ * arguments are read from the stack. The code is specialised on
+ * (mask == -1), on filter_shift being 0, 8 or anything else, and on both
+ * filter orders; every specialised loop exits through label 99.
+ */
+function ff_mlp_filter_channel_arm, export=1
+        push    {v1-fp,lr}              // 9 registers
+        add     v1, sp, #9*4 // point at arguments on stack
+        ldm     v1, {ST0,ST1,I,PSAMP}   // filter_shift, mask, blocksize, sample_buffer
+        cmp     ST1, #-1
+        bne     30f                     // mask != -1: variants that apply the mask
+        movs    ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
+        bne     20f                     // low 3 bits non-zero: generic shift
+        bcs     10f                     // carry = bit 3 of the shift: shift is 8
+        switch_on_iir_taps 1, 1, 0
+10:     switch_on_iir_taps 1, 0, 1
+20:     switch_on_iir_taps 1, 0, 0
+30:     movs    ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
+        bne     50f
+        bcs     40f
+        switch_on_iir_taps 0, 1, 0
+40:     switch_on_iir_taps 0, 0, 1
+50:     switch_on_iir_taps 0, 0, 0
+99:     pop     {v1-fp,pc}
+endfunc
+
+        .unreq  PST
+        .unreq  PCO
+        .unreq  AC0
+        .unreq  AC1
+        .unreq  CO0
+        .unreq  CO1
+        .unreq  CO2
+        .unreq  CO3
+        .unreq  ST0
+        .unreq  ST1
+        .unreq  ST2
+        .unreq  ST3
+        .unreq  I
+        .unreq  PSAMP
+
+/********************************************************************/
+
+// Register aliases for ff_mlp_rematrix_channel_arm.
+// NOISE, LSB, DCH and MASK share registers with SA0-SA3: they are only
+// (re)loaded once the sample registers they overlay are no longer needed.
+PSA     .req    a1 // samples
+PCO     .req    a2 // coeffs
+PBL     .req    a3 // bypassed_lsbs
+INDEX   .req    a4
+CO0     .req    v1
+CO1     .req    v2
+CO2     .req    v3
+CO3     .req    v4
+SA0     .req    v5
+SA1     .req    v6
+SA2     .req    sl
+SA3     .req    fp
+AC0     .req    ip
+AC1     .req    lr
+NOISE   .req    SA0
+LSB     .req    SA1
+DCH     .req    SA2 // dest_ch
+MASK    .req    SA3
+
+        // INDEX is used as follows:
+        // bits 0..6   index2 (values up to 17, but wider so that we can
+        //               add to index field without needing to mask)
+        // bits 7..14  i (values up to 160)
+        // bit 15      underflow detect for i
+        // bits 25..31 (if access_unit_size_pow2 == 128)  \ index
+        // bits 26..31 (if access_unit_size_pow2 == 64)   /
+
+// implement_rematrix: inner loop for one combination of parameters.
+//   shift        matrix_noise_shift (0 disables the noise term)
+//   index_mask   63 or 127, selecting the width of the index field in INDEX
+//                (ignored when shift is 0)
+//   mask_minus1  non-zero when the mask is -1, so the AND is omitted
+//   maxchan      1, 5 or 7; maxchan + 1 coefficient/sample products are summed
+// Expects NOISE, DCH and MASK as the three words at the top of the stack and
+// the packed counter in INDEX. Ends with a branch to label 98.
+.macro implement_rematrix shift, index_mask, mask_minus1, maxchan
+  .if \maxchan == 1
+        // We can just leave the coefficients in registers in this case
+        ldrd    CO0, CO1, [PCO]
+  .endif
+1:
+  .if \maxchan == 1
+        ldrd    SA0, SA1, [PSA]
+        smull   AC0, AC1, CO0, SA0
+  .elseif \maxchan == 5
+        ldr     CO0, [PCO, #0]
+        ldr     SA0, [PSA, #0]
+        ldr     CO1, [PCO, #4]
+        ldr     SA1, [PSA, #4]
+        ldrd    CO2, CO3, [PCO, #8]
+        smull   AC0, AC1, CO0, SA0
+        ldrd    SA2, SA3, [PSA, #8]
+        smlal   AC0, AC1, CO1, SA1
+        ldrd    CO0, CO1, [PCO, #16]
+        smlal   AC0, AC1, CO2, SA2
+        ldrd    SA0, SA1, [PSA, #16]
+        smlal   AC0, AC1, CO3, SA3
+        smlal   AC0, AC1, CO0, SA0
+  .else // \maxchan == 7
+        ldr     CO2, [PCO, #0]
+        ldr     SA2, [PSA, #0]
+        ldr     CO3, [PCO, #4]
+        ldr     SA3, [PSA, #4]
+        ldrd    CO0, CO1, [PCO, #8]
+        smull   AC0, AC1, CO2, SA2
+        ldrd    SA0, SA1, [PSA, #8]
+        smlal   AC0, AC1, CO3, SA3
+        ldrd    CO2, CO3, [PCO, #16]
+        smlal   AC0, AC1, CO0, SA0
+        ldrd    SA2, SA3, [PSA, #16]
+        smlal   AC0, AC1, CO1, SA1
+        ldrd    CO0, CO1, [PCO, #24]
+        smlal   AC0, AC1, CO2, SA2
+        ldrd    SA0, SA1, [PSA, #24]
+        smlal   AC0, AC1, CO3, SA3
+        smlal   AC0, AC1, CO0, SA0
+  .endif
+        // SA0, SA2 and SA3 are dead now, so their registers are reloaded with
+        // the values stacked by the function; the last product needs only SA1
+        ldm     sp, {NOISE, DCH, MASK}
+        smlal   AC0, AC1, CO1, SA1
+  .if \shift != 0
+    .if \index_mask == 63
+        add     NOISE, NOISE, INDEX, lsr #32-6  // address of noise_buffer[index]
+        ldrb    LSB, [PBL], #MAX_CHANNELS       // bypassed LSB; step to next sample
+        ldrsb   NOISE, [NOISE]
+        add     INDEX, INDEX, INDEX, lsl #32-6  // index += index2, wrapping in the top bits
+    .else // \index_mask == 127
+        add     NOISE, NOISE, INDEX, lsr #32-7
+        ldrb    LSB, [PBL], #MAX_CHANNELS
+        ldrsb   NOISE, [NOISE]
+        add     INDEX, INDEX, INDEX, lsl #32-7
+    .endif
+        sub     INDEX, INDEX, #1<<7             // i -= 1
+        adds    AC0, AC0, NOISE, lsl #\shift + 7 // accumulator += noise << (shift + 7)
+        adc     AC1, AC1, NOISE, asr #31        // carry plus sign extension of noise
+  .else
+        ldrb    LSB, [PBL], #MAX_CHANNELS
+        sub     INDEX, INDEX, #1<<7
+  .endif
+        add     PSA, PSA, #MAX_CHANNELS*4
+        mov     AC0, AC0, lsr #14               // AC0 = (AC1:AC0 >> 14), low 32 bits
+        orr     AC0, AC0, AC1, lsl #18
+  .if !\mask_minus1
+        and     AC0, AC0, MASK
+  .endif
+        add     AC0, AC0, LSB
+        tst     INDEX, #1<<15                   // has i underflowed?
+        str     AC0, [PSA, DCH, lsl #2]  // DCH is precompensated for the early increment of PSA
+        beq     1b
+        b       98f
+.endm
+
+// switch_on_maxchan: pick the implement_rematrix expansion for maxchan, which
+// is in v4 and has already been checked to be 1, 5 or 7.
+.macro switch_on_maxchan shift, index_mask, mask_minus1
+        cmp     v4, #5
+        blo     51f
+        beq     50f
+        implement_rematrix \shift, \index_mask, \mask_minus1, 7
+50:     implement_rematrix \shift, \index_mask, \mask_minus1, 5
+51:     implement_rematrix \shift, \index_mask, \mask_minus1, 1
+.endm
+
+// switch_on_mask: specialise on whether the mask (in sl) is -1.
+.macro switch_on_mask shift, index_mask
+        cmp     sl, #-1
+        bne     40f
+        switch_on_maxchan \shift, \index_mask, 1
+40:     switch_on_maxchan \shift, \index_mask, 0
+.endm
+
+// switch_on_au_size: for a non-zero shift, merge the noise index (in v1) into
+// the top 6 or 7 bits of INDEX depending on access_unit_size_pow2 (in v6, 64
+// or 128), then continue specialising. With shift 0 no noise is read, so the
+// index field is left unset and index_mask is a placeholder.
+.macro switch_on_au_size shift
+  .if \shift == 0
+        switch_on_mask \shift, undefined
+  .else
+        teq     v6, #64
+        bne     30f
+        orr     INDEX, INDEX, v1, lsl #32-6
+        switch_on_mask \shift, 63
+30:     orr     INDEX, INDEX, v1, lsl #32-7
+        switch_on_mask \shift, 127
+  .endif
+.endm
+
+/* void ff_mlp_rematrix_channel_arm(int32_t *samples,
+ *                                  const int32_t *coeffs,
+ *                                  const uint8_t *bypassed_lsbs,
+ *                                  const int8_t *noise_buffer,
+ *                                  int index,
+ *                                  unsigned int dest_ch,
+ *                                  uint16_t blockpos,
+ *                                  unsigned int maxchan,
+ *                                  int matrix_noise_shift,
+ *                                  int access_unit_size_pow2,
+ *                                  int32_t mask);
+ *
+ * Only maxchan of 1, 5 or 7 and access_unit_size_pow2 of 64 or 128 are
+ * handled here; anything else tail-calls ff_mlp_rematrix_channel with the
+ * argument registers and stack untouched.
+ */
+function ff_mlp_rematrix_channel_arm, export=1
+        push    {v1-fp,lr}
+        add     v1, sp, #9*4 // point at arguments on stack
+        // v1 = index, v2 = dest_ch, v3 = blockpos, v4 = maxchan,
+        // v5 = matrix_noise_shift, v6 = access_unit_size_pow2, sl = mask
+        ldm     v1, {v1-sl}
+        teq     v4, #1
+        itt     ne
+        teqne   v4, #5
+        teqne   v4, #7
+        bne     99f
+        teq     v6, #64
+        it      ne
+        teqne   v6, #128
+        bne     99f
+        sub     v2, v2, #MAX_CHANNELS   // precompensate DCH, see implement_rematrix
+        push    {a4,v2,sl}          // initialise NOISE,DCH,MASK; make sp dword-aligned
+        movs    INDEX, v3, lsl #7
+        beq     98f                 // just in case, do nothing if blockpos = 0
+        subs    INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
+        adc     lr, v1, v1          // calculate index2 (C was set by preceding subs)
+        orr     INDEX, INDEX, lr
+        // Switch on matrix_noise_shift: values 0 and 1 are
+        // disproportionately common so do those in a form the branch
+        // predictor can accelerate. Values can only go up to 15.
+        cmp     v5, #1
+        beq     11f
+        blo     10f
+        // Table slots for 0 and 1 are never used; the A table also has the
+        // leading padding word required because pc reads two instructions ahead
+A       ldr     pc, [pc, v5, lsl #2]
+T       tbh     [pc, v5, lsl #1]
+0:
+A       .word   0, 0, 0, 12f, 13f, 14f, 15f, 16f, 17f, 18f, 19f, 20f, 21f, 22f, 23f, 24f, 25f
+T       .hword  0, 0, (12f - 0b) / 2, (13f - 0b) / 2, (14f - 0b) / 2, (15f - 0b) / 2
+T       .hword  (16f - 0b) / 2, (17f - 0b) / 2, (18f - 0b) / 2, (19f - 0b) / 2
+T       .hword  (20f - 0b) / 2, (21f - 0b) / 2, (22f - 0b) / 2, (23f - 0b) / 2, (24f - 0b) / 2, (25f - 0b) / 2
+10:     switch_on_au_size 0
+11:     switch_on_au_size 1
+12:     switch_on_au_size 2
+13:     switch_on_au_size 3
+14:     switch_on_au_size 4
+15:     switch_on_au_size 5
+16:     switch_on_au_size 6
+17:     switch_on_au_size 7
+18:     switch_on_au_size 8
+19:     switch_on_au_size 9
+20:     switch_on_au_size 10
+21:     switch_on_au_size 11
+22:     switch_on_au_size 12
+23:     switch_on_au_size 13
+24:     switch_on_au_size 14
+25:     switch_on_au_size 15
+
+98:     add     sp, sp, #3*4            // drop the stacked NOISE, DCH, MASK
+        pop     {v1-fp,pc}
+99:     // Can't handle these parameters, drop back to C
+        pop     {v1-fp,lr}
+        b       X(ff_mlp_rematrix_channel)
+endfunc
+
+        .unreq  PSA
+        .unreq  PCO
+        .unreq  PBL
+        .unreq  INDEX
+        .unreq  CO0
+        .unreq  CO1
+        .unreq  CO2
+        .unreq  CO3
+        .unreq  SA0
+        .unreq  SA1
+        .unreq  SA2
+        .unreq  SA3
+        .unreq  AC0
+        .unreq  AC1
+        .unreq  NOISE
+        .unreq  LSB
+        .unreq  DCH
+        .unreq  MASK
|