diff options
Diffstat (limited to 'chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S')
-rw-r--r-- | chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S | 473 |
1 files changed, 473 insertions, 0 deletions
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S new file mode 100644 index 00000000000..f348e6a975c --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S @@ -0,0 +1,473 @@ +// +// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// +// This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s +// to support float instead of SC32. +// + +// +// Description: +// Compute a first stage Radix 8 FFT stage for a N point complex signal +// +// + + +// Include standard headers + +#include "dl/api/arm/arm64COMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +// Import symbols required from other files +// (For example tables) + + +// Set debugging level +//DEBUG_ON SETL {TRUE} + + + +// Guarding implementation by the processor name + + + + +// Guarding implementation by the processor name + +//Input Registers + +#define pSrc x0 +#define pDst x1 +#define pTwiddle x2 +#define pSubFFTNum x3 +#define pSubFFTSize x4 + + +//Output Registers + + +//Local Scratch Registers + +#define subFFTNum x5 +#define subFFTSize x6 +#define grpSize x7 +// Reuse grpSize as setCount +#define setCount x7 +#define pointStep x8 +#define outPointStep x8 +#define setStep x9 +#define step1 x10 +#define step2 x11 +#define t0 w12 + + +// Neon Registers + +#define dXr0 v0.2s +#define dXi0 v1.2s +#define dXr1 v2.2s +#define dXi1 v3.2s +#define dXr2 v4.2s +#define dXi2 v5.2s +#define dXr3 v6.2s +#define dXi3 v7.2s +#define dXr4 v8.2s +#define dXi4 v9.2s +#define dXr5 v10.2s +#define dXi5 v11.2s +#define dXr6 v12.2s +#define dXi6 v13.2s +#define dXr7 v14.2s +#define dXi7 v15.2s +#define qX0 v0.4s +#define qX1 v1.4s +#define qX2 v2.4s +#define qX3 v3.4s +#define qX4 v4.4s +#define qX5 v5.4s +#define qX6 v6.4s +#define qX7 v7.4s + +#define dUr0 v16.2s +#define dUi0 v17.2s +#define dUr2 v18.2s +#define dUi2 v19.2s +#define dUr4 v20.2s +#define dUi4 v21.2s +#define dUr6 v22.2s +#define dUi6 v23.2s +#define dUr1 v24.2s +#define dUi1 v25.2s +#define dUr3 v26.2s +#define dUi3 v27.2s +#define dUr5 v28.2s +#define dUi5 v29.2s +// reuse dXr7 and dXi7 +#define dUr7 v30.2s +#define dUi7 v31.2s +#define qU0 v8.4s +#define qU1 v12.4s +#define qU2 v9.4s +#define qU3 v13.4s +#define qU4 v10.4s +#define qU5 v14.4s +#define qU6 v11.4s +#define qU7 v15.4s + + +#define dVr0 v24.2s +#define dVi0 v25.2s +#define dVr2 v26.2s +#define dVi2 v27.2s +#define dVr4 v28.2s +#define dVi4 v29.2s +#define dVr6 v30.2s +#define dVi6 v31.2s +#define dVr1 v16.2s +#define dVi1 v17.2s +#define dVr3 v18.2s +#define dVi3 v19.2s +#define dVr5 v20.2s +#define dVi5 v21.2s +#define dVr7 v22.2s +#define dVi7 v23.2s +#define qV0 v12.4s +#define qV1 v8.4s +#define qV2 v13.4s +#define qV3 v9.4s +#define qV4 v14.4s +#define qV5 v10.4s +#define qV6 v15.4s +#define qV7 v11.4s + +#define dYr0 v16.2s +#define dYi0 v17.2s +#define dYr2 v18.2s +#define dYi2 v19.2s +#define dYr4 v20.2s +#define dYi4 v21.2s +#define dYr6 v22.2s +#define dYi6 v23.2s +#define dYr1 v24.2s +#define dYi1 v25.2s +#define dYr3 v26.2s +#define dYi3 v27.2s +#define dYr5 v28.2s +#define dYi5 v29.2s +#define dYr7 v30.2s +#define dYi7 v31.2s +#define qY0 v8.4s +#define qY1 v12.4s +#define qY2 v9.4s +#define qY3 v13.4s +#define qY4 v10.4s +#define qY5 v14.4s +#define qY6 v11.4s +#define qY7 v15.4s + +#define dT0 v14.2s +#define dT0s v14.s +#define dT1 v15.2s + + .MACRO FFTSTAGE scaled, inverse, name + + // Define stack arguments + + // Move args values into our work registers + ldr subFFTNum, [pSubFFTNum] + ldr subFFTSize, [pSubFFTSize] + + // Update pSubFFTSize and pSubFFTNum regs + // subFFTSize = 1 for the first stage + + movz t0, 0x3f35, lsl #16 // High half word of sqrt(1/2). + movk t0, 0x04f3 // Low half word of sqrt(1/2). + MOV subFFTSize,#8 + + // Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount) + LSR grpSize,subFFTNum,#3 + MOV subFFTNum,grpSize + + + // pT0+1 increments pT0 by 8 bytes + // pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes + // Note: outPointStep = pointStep for firststage + + lsl pointStep,grpSize, #3 + + + // Calculate the step of input data for the next set + //MOV step1,pointStep,LSL #1 // step1 = 2*pointStep + ld2 {dXr0,dXi0},[pSrc],pointStep // data[0] + lsl step1,grpSize, #4 + lsl step2,pointStep, #3 + + ld2 {dXr1,dXi1},[pSrc],pointStep // data[1] + SUB step2,step2,pointStep // step2 = 7*pointStep + // setStep = - 7*pointStep+16 + rsb setStep,step2,#16 + + ld2 {dXr2,dXi2},[pSrc],pointStep // data[2] + ld2 {dXr3,dXi3},[pSrc],pointStep // data[3] + ld2 {dXr4,dXi4},[pSrc],pointStep // data[4] + ld2 {dXr5,dXi5},[pSrc],pointStep // data[5] + ld2 {dXr6,dXi6},[pSrc],pointStep // data[6] + // data[7] & update pSrc for the next set + // setStep = -7*pointStep + 16 + ld2 {dXr7,dXi7},[pSrc],setStep + // grp = 0 a special case since all the twiddle factors are 1 + // Loop on the sets + +radix8fsGrpZeroSetLoop\name : + + // Decrement setcount + SUBS setCount,setCount,#2 + + + // finish first stage of 8 point FFT + + // fadd qU0,qX0,qX4 + // fadd qU2,qX1,qX5 + // fadd qU4,qX2,qX6 + // fadd qU6,qX3,qX7 + fadd dUr0,dXr0,dXr4 + fadd dUr2,dXr1,dXr5 + fadd dUr4,dXr2,dXr6 + fadd dUr6,dXr3,dXr7 + fadd dUi0,dXi0,dXi4 + fadd dUi2,dXi1,dXi5 + fadd dUi4,dXi2,dXi6 + fadd dUi6,dXi3,dXi7 + + // finish second stage of 8 point FFT + + // fadd qV0,qU0,qU4 + // fsub qV2,qU0,qU4 + // fadd qV4,qU2,qU6 + // fsub qV6,qU2,qU6 + fadd dVr0,dUr0,dUr4 + fsub dVr2,dUr0,dUr4 + fadd dVr4,dUr2,dUr6 + fsub dVr6,dUr2,dUr6 + fadd dVi0,dUi0,dUi4 + fsub dVi2,dUi0,dUi4 + fadd dVi4,dUi2,dUi6 + fsub dVi6,dUi2,dUi6 + + // finish third stage of 8 point FFT + + // fadd qY0,qV0,qV4 + // fsub qY4,qV0,qV4 + fadd dYr0,dVr0,dVr4 + fsub dYr4,dVr0,dVr4 + fadd dYi0,dVi0,dVi4 + fsub dYi4,dVi0,dVi4 + + st2 {dYr0,dYi0},[pDst],step1 // store y0 + + .ifeqs "\inverse", "TRUE" + + fsub dYr2,dVr2,dVi6 + fadd dYi2,dVi2,dVr6 + + fadd dYr6,dVr2,dVi6 + st2 {dYr2,dYi2},[pDst],step1 // store y2 + fsub dYi6,dVi2,dVr6 + + // fsub qU1,qX0,qX4 + fsub dUr1,dXr0,dXr4 + fsub dUi1,dXi0,dXi4 + + st2 {dYr4,dYi4},[pDst],step1 // store y4 + + // fsub qU3,qX1,qX5 + // fsub qU5,qX2,qX6 + fsub dUr3,dXr1,dXr5 + fsub dUr5,dXr2,dXr6 + fsub dUi3,dXi1,dXi5 + fsub dUi5,dXi2,dXi6 + + st2 {dYr6,dYi6},[pDst],step1 // store y6 + + .ELSE + + fadd dYr6,dVr2,dVi6 + fsub dYi6,dVi2,dVr6 + + fsub dYr2,dVr2,dVi6 + st2 {dYr6,dYi6},[pDst],step1 // store y2 + fadd dYi2,dVi2,dVr6 + + + // fsub qU1,qX0,qX4 + fsub dUr1,dXr0,dXr4 + fsub dUi1,dXi0,dXi4 + + st2 {dYr4,dYi4},[pDst],step1 // store y4 + + // fsub qU3,qX1,qX5 + // fsub qU5,qX2,qX6 + fsub dUr3,dXr1,dXr5 + fsub dUr5,dXr2,dXr6 + fsub dUi3,dXi1,dXi5 + fsub dUi5,dXi2,dXi6 + + st2 {dYr2,dYi2},[pDst],step1 // store y6 + + + .ENDIF + + // finish first stage of 8 point FFT + + // fsub qU7,qX3,qX7 + fsub dUr7,dXr3,dXr7 + fsub dUi7,dXi3,dXi7 + + mov dT0s[0], t0 + + // finish second stage of 8 point FFT + + fsub dVr1,dUr1,dUi5 + // data[0] for next iteration + ld2 {dXr0,dXi0},[pSrc],pointStep + fadd dVi1,dUi1,dUr5 + fadd dVr3,dUr1,dUi5 + ld2 {dXr1,dXi1},[pSrc],pointStep // data[1] + fsub dVi3,dUi1,dUr5 + + fsub dVr5,dUr3,dUi7 + ld2 {dXr2,dXi2},[pSrc],pointStep // data[2] + fadd dVi5,dUi3,dUr7 + fadd dVr7,dUr3,dUi7 + ld2 {dXr3,dXi3},[pSrc],pointStep // data[3] + fsub dVi7,dUi3,dUr7 + + // finish third stage of 8 point FFT + + .ifeqs "\inverse", "TRUE" + + // calculate a*v5 + fmul dT1,dVr5,dT0[0] // use dVi0 for dT1 + + ld2 {dXr4,dXi4},[pSrc],pointStep // data[4] + fmul dVi5,dVi5,dT0[0] + + ld2 {dXr5,dXi5},[pSrc],pointStep // data[5] + fsub dVr5,dT1,dVi5 // a * V5 + fadd dVi5,dT1,dVi5 + + ld2 {dXr6,dXi6},[pSrc],pointStep // data[6] + + // calculate b*v7 + fmul dT1,dVr7,dT0[0] + fmul dVi7,dVi7,dT0[0] + + // fadd qY1,qV1,qV5 + // fsub qY5,qV1,qV5 + fadd dYr1,dVr1,dVr5 + fsub dYr5,dVr1,dVr5 + fadd dYi1,dVi1,dVi5 + fsub dYi5,dVi1,dVi5 + + fadd dVr7,dT1,dVi7 // b * V7 + fsub dVi7,dVi7,dT1 + SUB pDst, pDst, step2 // set pDst to y1 + + // On the last iteration, this will read past the end of pSrc, + // so skip this read. + BEQ radix8SkipLastUpdateInv\name + ld2 {dXr7,dXi7},[pSrc],setStep // data[7] +radix8SkipLastUpdateInv\name: + + fsub dYr3,dVr3,dVr7 + fsub dYi3,dVi3,dVi7 + st2 {dYr1,dYi1},[pDst],step1 // store y1 + fadd dYr7,dVr3,dVr7 + fadd dYi7,dVi3,dVi7 + + + st2 {dYr3,dYi3},[pDst],step1 // store y3 + st2 {dYr5,dYi5},[pDst],step1 // store y5 + st2 {dYr7,dYi7},[pDst] // store y7 + ADD pDst, pDst, #16 + + .ELSE + + // calculate b*v7 + fmul dT1,dVr7,dT0[0] + ld2 {dXr4,dXi4},[pSrc],pointStep // data[4] + fmul dVi7,dVi7,dT0[0] + + ld2 {dXr5,dXi5},[pSrc],pointStep // data[5] + fadd dVr7,dT1,dVi7 // b * V7 + fsub dVi7,dVi7,dT1 + + ld2 {dXr6,dXi6},[pSrc],pointStep // data[6] + + // calculate a*v5 + fmul dT1,dVr5,dT0[0] // use dVi0 for dT1 + fmul dVi5,dVi5,dT0[0] + + fadd dYr7,dVr3,dVr7 + fadd dYi7,dVi3,dVi7 + SUB pDst, pDst, step2 // set pDst to y1 + + fsub dVr5,dT1,dVi5 // a * V5 + fadd dVi5,dT1,dVi5 + + // On the last iteration, this will read past the end of pSrc, + // so skip this read. + BEQ radix8SkipLastUpdateFwd\name + ld2 {dXr7,dXi7},[pSrc],setStep // data[7] +radix8SkipLastUpdateFwd\name: + + // fsub qY5,qV1,qV5 + fsub dYr5,dVr1,dVr5 + fsub dYi5,dVi1,dVi5 + + fsub dYr3,dVr3,dVr7 + st2 {dYr7,dYi7},[pDst],step1 // store y1 + fsub dYi3,dVi3,dVi7 + + // fadd qY1,qV1,qV5 + fadd dYr1,dVr1,dVr5 + fadd dYi1,dVi1,dVi5 + + st2 {dYr5,dYi5},[pDst],step1 // store y3 + st2 {dYr3,dYi3},[pDst],step1 // store y5 + st2 {dYr1,dYi1},[pDst],#16 // store y7 + + .ENDIF + + + // update pDst for the next set + SUB pDst, pDst, step2 + BGT radix8fsGrpZeroSetLoop\name + + // Save subFFTNum and subFFTSize for next stage + str subFFTNum, [pSubFFTNum] + str subFFTSize, [pSubFFTSize] + + .endm + + + // Allocate stack memory required by the function + + + M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace,,d15 + FFTSTAGE "FALSE","FALSE",FWD + M_END + + + M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace,,d15 + FFTSTAGE "FALSE","TRUE",INV + M_END + + + + .end |