diff options
Diffstat (limited to 'chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S')
-rw-r--r-- | chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S | 280 |
1 files changed, 280 insertions, 0 deletions
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S new file mode 100644 index 00000000000..da68314be82 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S @@ -0,0 +1,280 @@ +// +// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// +// This is a modification of +// armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float +// instead of SC32. +// + +// +// Description: +// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT +// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation +// +// + + +// Include standard headers + +#include "dl/api/arm/arm64COMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + + +// Import symbols required from other files +// (For example tables) + + +// Set debugging level +//DEBUG_ON SETL {TRUE} + + + +// Guarding implementation by the processor name + + + + // Guarding implementation by the processor name + + + +//Input Registers + +#define pSrc x0 +#define pTwiddle x1 +#define pOut x2 +#define subFFTNum x3 + +// Output registers + +//Local Scratch Registers + +#define argTwiddle x5 +#define argDst x6 +#define subFFTSize x7 +#define N subFFTNum + +#define pOut1 x13 + +#define size x7 +#define step x8 +#define step1 x9 +#define twStep x10 +#define pTwiddleTmp x11 +#define argTwiddle1 x12 + +// Neon registers + +#define dX0 v0.2s +#define dX0s v0.s +#define dShift v1.2s +#define dX1 v1.2s +#define dX1s v1.s +#define dY0 v2.2s +#define dY08b v2.8b +#define dY1 v3.2s +#define dX0r v0.2s +#define dX0rs v0.s +#define dX0i v1.2s +#define dX1r v2.2s +#define dX1i v3.2s +#define dW0r v4.2s +#define dW0r8b v4.8b +#define dW0i v5.2s +#define dW1r v6.2s +#define dW1r8b v6.8b +#define dW1i v7.2s +#define dT0 v8.2s +#define dT1 v9.2s +#define dT2 v10.2s +#define dT3 v11.2s +#define qT0 v12.2s +#define qT1 v14.2s +#define qT2 v16.2s +#define qT3 v18.2s +#define dY0r v4.2s +#define dY0i v5.2s +#define dY1r v6.2s +#define dY1i v7.2s + +#define dY2 v4.2s +#define dY3 v5.2s +#define dW0 v6.2s +#define dW1 v7.2s +#define dW0Tmp v10.2s +#define dW1Neg v11.2s + +#define dZip v19.2s +#define dZip8b v19.8b +#define half v13.2s + + .MACRO FFTSTAGE scaled, inverse, name + + fmov half, 0.5 + + asr size, subFFTNum, #1 // preserve the contents of N = subFFTNum + lsl step, subFFTNum, #2 // step = N/2 * 8 bytes + + + // Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]} + // Note: W^(k) is stored as negated value and also need to + // conjugate the values from the table + + // Z(0) : no need of twiddle multiply + // Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] } + + ld1 {dX0},[pSrc],step + ADD pOut1,pOut,step // pOut1 = pOut+ N/2*8 bytes + + ld1 {dX1},[pSrc], #8 + // twStep = 3N/8 * 8 bytes pointing to W^1 + SUB twStep,step,size,LSL #1 + + lsl step1,size, #2 // step1 = N/4 * 8 = N/2*4 bytes + SUB step1,step1,#8 // (N/4-1)*8 bytes + + fadd dY0,dX0,dX1 // [b+d | a+c] + fsub dY1,dX0,dX1 // [b-d | a-c] + fmul dY0, dY0, half[0] + fmul dY1, dY1, half[0] + + // dY0= [a-c | a+c] ;dY1= [b-d | b+d] + // VZIP dY0,dY1 + zip1 dZip,dY0,dY1 + zip2 dY1,dY0,dY1 + mov dY08b, dZip8b + + fsub dX0,dY0,dY1 + SUBS size,size,#2 + fadd dX1,dY0,dY1 + + SUB pSrc,pSrc,step + + st1 {dX0s}[0],[pOut1], #4 + ADD pTwiddleTmp,pTwiddle,#8 // W^2 + st1 {dX1s}[1],[pOut1], #4 + ADD argTwiddle1,pTwiddle,twStep // W^1 + + + BLT decrementScale\name + BEQ lastElement\name + + + // Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)] + // Note: W^k is stored as negative values in the table and also + // need to conjugate the values from the table. + // + // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1) + // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1) + + + SUB step,step,#24 +evenOddButterflyLoop\name : + + + ld1 {dW0r},[argTwiddle1],step1 + ld1 {dW1r},[argTwiddle1], #8 + + ld2 {dX0r,dX0i},[pSrc],step + SUB argTwiddle1,argTwiddle1,step1 + ld2 {dX1r,dX1i},[pSrc], #16 + + SUB step1,step1,#8 // (N/4-2)*8 bytes + ld1 {dW0i},[pTwiddleTmp],step1 + ld1 {dW1i},[pTwiddleTmp], #8 + SUB pSrc,pSrc,step + + SUB pTwiddleTmp,pTwiddleTmp,step1 + rev64 dX1r,dX1r + rev64 dX1i,dX1i + SUBS size,size,#4 + + + fsub dT2,dX0r,dX1r // a-c + fadd dT3,dX0i,dX1i // b+d + fadd dT0,dX0r,dX1r // a+c + fsub dT1,dX0i,dX1i // b-d + SUB step1,step1,#8 + + fmul dT2, dT2, half[0] + fmul dT3, dT3, half[0] + + fmul dT0, dT0, half[0] + fmul dT1, dT1, half[0] + + // VZIP dW1r,dW1i + // VZIP dW0r,dW0i + zip1 dZip, dW1r,dW1i + zip2 dW1i,dW1r,dW1i + mov dW1r8b, dZip8b + zip1 dZip,dW0r,dW0i + zip2 dW0i,dW0r,dW0i + mov dW0r8b, dZip8b + + fmul dX1r,dW1r,dT2 + fmul dX1i,dW1r,dT3 + fmul dX0r,dW0r,dT2 + fmul dX0i,dW0r,dT3 + + fmls dX1r,dW1i,dT3 + fmla dX1i,dW1i,dT2 + + fmla dX0r,dW0i,dT3 + fmls dX0i,dW0i,dT2 + + + fadd dY1r,dT0,dX1i // F(N/2 -1) + fsub dY1i,dX1r,dT1 + + rev64 dY1r,dY1r + rev64 dY1i,dY1i + + + fadd dY0r,dT0,dX0i // F(1) + fsub dY0i,dT1,dX0r + + + st2 {dY0r,dY0i},[pOut1],step + st2 {dY1r,dY1i},[pOut1], #16 + SUB pOut1,pOut1,step + SUB step,step,#32 // (N/2-4)*8 bytes + + + BGT evenOddButterflyLoop\name + + + // set both the ptrs to the last element + SUB pSrc,pSrc,#8 + SUB pOut1,pOut1,#8 + + // Last element can be expanded as follows + // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as + // -ve) + // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)] + // 1/2[2a+j0] - j (c-jd) [0+j2b] + // (a+bc, -bd) + // Since (c,d) = (0,1) for the last element, result is just (a,-b) + +lastElement\name : + ld1 {dX0r},[pSrc] + + st1 {dX0rs}[0],[pOut1], #4 + fneg dX0r,dX0r + st1 {dX0rs}[1],[pOut1] + + + +decrementScale\name : + + .endm + + M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15 + FFTSTAGE "FALSE","TRUE",Inv + M_END + + .end |