diff options
Diffstat (limited to 'chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S')
-rw-r--r-- | chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S | 371 |
1 files changed, 371 insertions, 0 deletions
//
//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
//
//  Use of this source code is governed by a BSD-style license
//  that can be found in the LICENSE file in the root of the source
//  tree. An additional intellectual property rights grant can be found
//  in the file PATENTS.  All contributing project authors may
//  be found in the AUTHORS file in the root of the source tree.
//
//  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
//  to support float instead of SC32.
//

//
// Description:
// Compute a Radix 4 FFT stage for a N point complex signal
//
// Target / syntax: AArch64 (A64 + Advanced SIMD), GNU assembler syntax,
// run through the C preprocessor (.S).
//
// Exported entry points (both expand the FFTSTAGE macro below):
//   armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace   forward transform
//   armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace   inverse transform
//
// In:
//   x0 = pSrc         input, read 32 bytes at a time with ld4
//   x1 = pDst         output, written with st2
//   x2 = pTwiddle     twiddle table
//   x3 = pSubFFTNum   pointer, one 64-bit load
//   x4 = pSubFFTSize  pointer, one 64-bit load
// Clobbers:
//   x0-x2, x5, x6, x8-x15, condition flags, v0-v24, v26-v31.
//   v8-v15 are callee-saved under AAPCS64.
//   NOTE(review): the d15 argument of M_START presumably makes the
//   prologue/epilogue save and restore d8-d15 -- confirm in arm64COMM_s.h.
// Assumptions to confirm against the caller:
//   * two butterflies are processed per loop pass, so the value loaded
//     through pSubFFTSize is presumably even;
//   * nothing is written back through pSubFFTNum / pSubFFTSize here; the
//     updates of subFFTNum and subFFTSize below stay in registers only.
//


// Include standard headers

#include "dl/api/arm/arm64COMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


// Input Registers

#define pSrc            x0
#define pDst            x1
#define pTwiddle        x2
#define pSubFFTNum      x3
#define pSubFFTSize     x4


// Output Registers


// Local Scratch Registers

#define subFFTNum       x5
#define subFFTSize      x6
#define outPointStep    x8
#define grpCount        x9
#define dstStep         x10
#define grpTwStep       x13
#define stepTwiddle     x14
#define twStep          x15
#define step16          x11
#define step24          x12


// Neon Registers
//
// The dButterfly* names and the dX* names alias the same registers
// (v0-v7): the former describe the data as loaded by ld4, the latter the
// same registers once uzp1/uzp2 have regrouped them.  The *8b names are
// byte views of the same registers, used only for whole-register mov.

#define dButterfly1Real02       v0.2s
#define dButterfly1Real028b     v0.8b
#define dButterfly1Imag02       v1.2s
#define dButterfly1Imag028b     v1.8b
#define dButterfly1Real13       v2.2s
#define dButterfly1Real138b     v2.8b
#define dButterfly1Imag13       v3.2s
#define dButterfly1Imag138b     v3.8b
#define dButterfly2Real02       v4.2s
#define dButterfly2Imag02       v5.2s
#define dButterfly2Real13       v6.2s
#define dButterfly2Imag13       v7.2s
#define dXr0                    v0.2s
#define dXi0                    v1.2s
#define dXr08b                  v0.8b
#define dXi08b                  v1.8b
#define dXr1                    v2.2s
#define dXi1                    v3.2s
#define dXr2                    v4.2s
#define dXi2                    v5.2s
#define dXr3                    v6.2s
#define dXi3                    v7.2s

#define dYr0                    v16.2s
#define dYi0                    v17.2s
#define dYr1                    v18.2s
#define dYi1                    v19.2s
#define dYr2                    v20.2s
#define dYi2                    v21.2s
#define dYr3                    v22.2s
#define dYi3                    v23.2s

#define dW1r                    v8.2s
#define dW1i                    v9.2s
#define dW2r                    v10.2s
#define dW2r8b                  v10.8b
#define dW2i                    v11.2s
#define dW3r                    v12.2s
#define dW3r8b                  v12.8b
#define dW3i                    v13.2s

#define dZr0                    v14.2s
#define dZi0                    v15.2s
#define dZr08b                  v14.8b
#define dZi08b                  v15.8b
#define dZr1                    v26.2s
#define dZi1                    v27.2s
#define dZr2                    v28.2s
#define dZi2                    v29.2s
#define dZr3                    v30.2s
#define dZi3                    v31.2s

// Temporary used to emulate the two-output ARMv7 VZIP / VUZP
#define dZip                    v24.2s
#define dZip8b                  v24.8b


//
// FFTSTAGE scaled, inverse, name
//
//   scaled   not referenced in the macro body
//   inverse  "TRUE" selects the inverse variant, anything else the forward
//   name     suffix appended to every label so that the macro can be
//            expanded more than once in this file
//
// The instruction order below is hand scheduled (loads are interleaved
// with arithmetic) and the branches at the bottom rely on the flags set
// by the single subs.  Nothing between that subs and the final bgt may
// write the condition flags.
//
// NOTE(review): rsb is not an A64 mnemonic; it is presumably supplied as
// a macro by arm64COMM_s.h (dst = imm - src) -- confirm.
//
        .macro FFTSTAGE scaled, inverse, name

        // Move args values into our work registers
        ldr     subFFTNum, [pSubFFTNum]
        ldr     subFFTSize, [pSubFFTSize]

        // One complex float is 8 bytes, so outPointStep is the byte
        // stride that corresponds to subFFTSize complex points.
        lsl     outPointStep, subFFTSize, #3

        // Load the first set of twiddles while the loop constants are
        // being set up.  Register post-increments (step16, step24) are
        // used where the step differs from the number of bytes loaded.
        // Net effect of the five loads below: pTwiddle advances 16 bytes.

        ld2     {dW1r,dW1i}, [pTwiddle]                 // [wi|wr]
        mov     step16, #16
        lsl     grpCount, subFFTSize, #2                // grpCount = 4*subFFTSize

        ld1     {dW2r}, [pTwiddle]                      // [wi|wr]
        mov     subFFTNum, #1                           // after the last stage

        ld1     {dW3r}, [pTwiddle], step16              // [wi|wr]
        mov     stepTwiddle, #0

        ld1     {dW2i}, [pTwiddle], #8                  // [wi|wr]
        sub     grpTwStep, stepTwiddle, #8              // grpTwStep = -8 to start with

        // update subFFTSize for the next stage (kept in the register only)
        mov     subFFTSize, grpCount
        ld1     {dW3i}, [pTwiddle], grpTwStep           // [wi|wr]
        lsl     dstStep, outPointStep, #1

        // AC.r AC.i BD.r BD.i
        ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13}, [pSrc], #32
        add     dstStep, dstStep, outPointStep          // dstStep = 3*outPointStep

        rsb     dstStep, dstStep, #16                   // dstStep = -3*outPointStep + 16
        mov     step24, #24

        // AC.r AC.i BD.r BD.i
        ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13}, [pSrc], #32


        // Process two groups at a time (one per 32-bit lane)

radix4lsGrpLoop\name:

        // VZIP dW2r,dW2i
        zip1    dZip, dW2r, dW2i
        zip2    dW2i, dW2r, dW2i
        mov     dW2r8b, dZip8b

        add     stepTwiddle, stepTwiddle, #16

        // VZIP dW3r,dW3i
        zip1    dZip, dW3r, dW3i
        zip2    dW3i, dW3r, dW3i
        mov     dW3r8b, dZip8b
        add     grpTwStep, stepTwiddle, #4

        // VUZP dButterfly1Real13, dButterfly2Real13        // B.r D.r
        uzp1    dZip, dButterfly1Real13, dButterfly2Real13
        uzp2    dButterfly2Real13, dButterfly1Real13, dButterfly2Real13
        mov     dButterfly1Real138b, dZip8b

        sub     twStep, stepTwiddle, #16                // -16+stepTwiddle

        // VUZP dButterfly1Imag13, dButterfly2Imag13        // B.i D.i
        uzp1    dZip, dButterfly1Imag13, dButterfly2Imag13
        uzp2    dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13
        mov     dButterfly1Imag138b, dZip8b
        lsl     grpTwStep, grpTwStep, #1

        // VUZP dButterfly1Real02, dButterfly2Real02        // A.r C.r
        uzp1    dZip, dButterfly1Real02, dButterfly2Real02
        uzp2    dButterfly2Real02, dButterfly1Real02, dButterfly2Real02
        mov     dButterfly1Real028b, dZip8b
        rsb     grpTwStep, grpTwStep, #0                // -8-2*stepTwiddle

        // VUZP dButterfly1Imag02, dButterfly2Imag02        // A.i C.i
        uzp1    dZip, dButterfly1Imag02, dButterfly2Imag02
        uzp2    dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02
        mov     dButterfly1Imag028b, dZip8b


        // grpCount holds 4*subFFTSize and two butterflies are handled
        // per pass, hence the decrement of 8.  The flags set here drive
        // the bne / beq / bgt further down.
        subs    grpCount, grpCount, #8

        // Twiddle multiplies.  Forward computes W*X, inverse computes
        // conj(W)*X:
        //   forward: Zr = Wr*Xr - Wi*Xi    Zi = Wr*Xi + Wi*Xr
        //   inverse: Zr = Wr*Xr + Wi*Xi    Zi = Wr*Xi - Wi*Xr
        // The twiddle loads for the next pass are interleaved; their
        // post-increments sum to a net advance of 16 bytes per pass.

        .ifeqs  "\inverse", "TRUE"
            fmul    dZr1, dW1r, dXr1
            fmla    dZr1, dW1i, dXi1                    // real part
            fmul    dZi1, dW1r, dXi1
            fmls    dZi1, dW1i, dXr1                    // imag part

        .else

            fmul    dZr1, dW1r, dXr1
            fmls    dZr1, dW1i, dXi1                    // real part
            fmul    dZi1, dW1r, dXi1
            fmla    dZi1, dW1i, dXr1                    // imag part

        .endif

        ld2     {dW1r,dW1i}, [pTwiddle], stepTwiddle    // [wi|wr]

        .ifeqs  "\inverse", "TRUE"
            fmul    dZr2, dW2r, dXr2
            fmla    dZr2, dW2i, dXi2                    // real part
            fmul    dZi2, dW2r, dXi2
            ld1     {dW2r}, [pTwiddle], step16          // [wi|wr]
            fmls    dZi2, dW2i, dXr2                    // imag part

        .else

            fmul    dZr2, dW2r, dXr2
            fmls    dZr2, dW2i, dXi2                    // real part
            fmul    dZi2, dW2r, dXi2
            ld1     {dW2r}, [pTwiddle], step16          // [wi|wr]
            fmla    dZi2, dW2i, dXr2                    // imag part

        .endif


        ld1     {dW2i}, [pTwiddle], twStep              // [wi|wr]

        // Copy X0 into Z0 so that v0/v1 are free to receive the loads
        // for the next iteration.
        // MOV qZ0,qX0
        mov     dZr08b, dXr08b
        mov     dZi08b, dXi08b

        .ifeqs  "\inverse", "TRUE"
            fmul    dZr3, dW3r, dXr3
            fmla    dZr3, dW3i, dXi3                    // real part
            fmul    dZi3, dW3r, dXi3
            ld1     {dW3r}, [pTwiddle], step24
            fmls    dZi3, dW3i, dXr3                    // imag part

        .else

            fmul    dZr3, dW3r, dXr3
            fmls    dZr3, dW3i, dXi3                    // real part
            fmul    dZi3, dW3r, dXi3
            ld1     {dW3r}, [pTwiddle], step24
            fmla    dZi3, dW3i, dXr3                    // imag part

        .endif

        ld1     {dW3i}, [pTwiddle], grpTwStep           // [wi|wr]

        // Do not load on the last iteration (grpCount reached zero) so
        // that the code does not read past the end of pSrc.  pSrc is
        // still advanced by 64 bytes in that case, the same amount the
        // two skipped loads would have added.
        bne     skipIncrement\name
        add     pSrc, pSrc, #64
skipIncrement\name:
        beq     radix4lsSkipRead\name
        // AC.r AC.i BD.r BD.i
        ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13}, [pSrc], #32

        // AC.r AC.i BD.r BD.i
        ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13}, [pSrc], #32
radix4lsSkipRead\name:

        // finish first stage of 4 point FFT

        // fadd    qY0,qZ0,qZ2
        fadd    dYr0, dZr0, dZr2
        fadd    dYi0, dZi0, dZi2
        // fsub    qY2,qZ0,qZ2
        fsub    dYr2, dZr0, dZr2
        fsub    dYi2, dZi0, dZi2
        // fadd    qY1,qZ1,qZ3
        fadd    dYr1, dZr1, dZr3
        fadd    dYi1, dZi1, dZi3
        // fsub    qY3,qZ1,qZ3
        fsub    dYr3, dZr1, dZr3
        fsub    dYi3, dZi1, dZi3


        // finish second stage of 4 point FFT
        //
        // Both variants compute
        //   Z0 = Y2 - Y1                   Z2 = Y2 + Y1
        //   Z1 = (Yr0 - Yi3, Yi0 + Yr3)    Z3 = (Yr0 + Yi3, Yi0 - Yr3)
        // and differ only in the order in which the results are stored:
        //   forward: Z0, Z1, Z2, Z3        inverse: Z0, Z3, Z2, Z1
        // The first three stores step pDst by outPointStep; the fourth
        // steps it by dstStep = 16 - 3*outPointStep, i.e. a net advance
        // of 16 bytes per loop pass.

        .ifeqs  "\inverse", "TRUE"

            // fsub    qZ0,qY2,qY1
            fsub    dZr0, dYr2, dYr1
            fsub    dZi0, dYi2, dYi1
            fadd    dZr3, dYr0, dYi3
            st2     {dZr0,dZi0}, [pDst], outPointStep
            fsub    dZi3, dYi0, dYr3

            // fadd    qZ2,qY2,qY1
            fadd    dZr2, dYr2, dYr1
            fadd    dZi2, dYi2, dYi1

            st2     {dZr3,dZi3}, [pDst], outPointStep

            fsub    dZr1, dYr0, dYi3
            st2     {dZr2,dZi2}, [pDst], outPointStep
            fadd    dZi1, dYi0, dYr3

            // dstStep = -3*outPointStep + 16
            st2     {dZr1,dZi1}, [pDst], dstStep


        .else

            // fsub    qZ0,qY2,qY1
            fsub    dZr0, dYr2, dYr1
            fsub    dZi0, dYi2, dYi1

            fsub    dZr1, dYr0, dYi3
            st2     {dZr0,dZi0}, [pDst], outPointStep
            fadd    dZi1, dYi0, dYr3

            // fadd    qZ2,qY2,qY1
            fadd    dZr2, dYr2, dYr1
            fadd    dZi2, dYi2, dYi1

            st2     {dZr1,dZi1}, [pDst], outPointStep

            fadd    dZr3, dYr0, dYi3
            st2     {dZr2,dZi2}, [pDst], outPointStep
            fsub    dZi3, dYi0, dYr3

            // dstStep = -3*outPointStep + 16
            st2     {dZr3,dZi3}, [pDst], dstStep


        .endif

        // Loop while grpCount (as left by the subs above) is still > 0
        bgt     radix4lsGrpLoop\name

        .endm


        M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
            FFTSTAGE "FALSE","FALSE",fwd
        M_END


        M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
            FFTSTAGE "FALSE","TRUE",inv
        M_END


        .end