1 files changed, 280 insertions, 0 deletions
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
new file mode 100644
index 00000000000..da68314be82
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
@@ -0,0 +1,280 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of
+//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
+//  instead of SC32.
+//
+
+//
+// Description:
+// Compute the "preTwiddleRadix2" stage prior to the call to the complex FFT.
+// It computes Z(k) = Feven(k) + j*W^(-k)*FOdd(k) for k = 0, 1, 2, ..., N/2-1.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+      // Guarding implementation by the processor name
+
+
+
+// Input registers (x0-x3)
+// pSrc is read by the loads, pTwiddle is the base of the twiddle loads, pOut is the output base, subFFTNum is N.
+#define pSrc            x0
+#define pTwiddle        x1
+#define	pOut		x2	
+#define	subFFTNum	x3
+
+// Output registers: none are defined; all results are stored through pOut1.
+
+// Local scratch registers
+// argTwiddle, argDst and subFFTSize are not referenced by FFTSTAGE in this file; subFFTSize and size both name x7.
+#define argTwiddle      x5
+#define argDst          x6
+#define subFFTSize      x7
+#define N               subFFTNum
+// pOut1 is the running store pointer; FFTSTAGE initialises it to pOut + step.
+#define pOut1           x13
+// size = loop counter; step = byte stride on pSrc and pOut1; step1 = byte stride on the twiddle pointers; twStep = byte offset added to pTwiddle.
+#define size            x7
+#define step            x8
+#define step1           x9
+#define twStep          x10
+#define pTwiddleTmp     x11
+#define argTwiddle1     x12
+
+// NEON registers (.2s = two 32-bit float lanes, .8b = same 64 bits viewed as bytes, .s = single-lane view used by lane stores)
+// Several names alias one register: dX0r/dX0i = dX0/dX1, dX1r/dX1i = dY0/dY1, dY0r..dY1i = dW0r..dW1i.
+#define dX0     v0.2s
+#define dX0s    v0.s
+#define dShift  v1.2s
+#define dX1     v1.2s
+#define dX1s    v1.s
+#define dY0     v2.2s
+#define dY08b   v2.8b
+#define dY1     v3.2s
+#define dX0r    v0.2s
+#define dX0rs   v0.s
+#define dX0i    v1.2s
+#define dX1r    v2.2s
+#define dX1i    v3.2s
+#define dW0r    v4.2s
+#define dW0r8b  v4.8b
+#define dW0i    v5.2s
+#define dW1r    v6.2s
+#define dW1r8b  v6.8b
+#define dW1i    v7.2s
+#define dT0     v8.2s
+#define dT1     v9.2s
+#define dT2     v10.2s
+#define dT3     v11.2s
+#define qT0     v12.2s
+#define qT1     v14.2s
+#define qT2     v16.2s
+#define qT3     v18.2s
+#define dY0r    v4.2s
+#define dY0i    v5.2s
+#define dY1r    v6.2s
+#define dY1i    v7.2s
+// dShift and qT0-qT3 (above) and dY2, dY3, dW0, dW1, dW0Tmp, dW1Neg (below) are not referenced by FFTSTAGE in this file.
+#define dY2     v4.2s
+#define dY3     v5.2s
+#define dW0     v6.2s
+#define dW1     v7.2s
+#define dW0Tmp  v10.2s
+#define dW1Neg  v11.2s
+// dZip is the temporary for the zip1/zip2/mov sequences; half is loaded with 0.5 at the start of FFTSTAGE.
+#define dZip    v19.2s
+#define dZip8b  v19.8b
+#define half    v13.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        // Pre-twiddle pass. scaled and inverse are not referenced in the body; name is appended to every label.
+        fmov    half, 0.5                     // both lanes = 0.5, the 1/2 factor used below
+
+        asr     size, subFFTNum, #1           // preserve the contents of N = subFFTNum
+        lsl     step, subFFTNum, #2           // step = N/2 * 8 bytes
+        // All stores go through pOut1, which starts at pOut + step (see the ADD below).
+
+        // Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
+        // Note: W^(k) is stored as negated value and also need to
+        // conjugate the values from the table
+
+        // Z(0) : no need of twiddle multiply
+        // Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
+
+        ld1     {dX0},[pSrc],step             // dX0 = F(0) = (a,b); pSrc now at F(N/2)
+        ADD     pOut1,pOut,step               // pOut1 = pOut+ N/2*8 bytes
+
+        ld1     {dX1},[pSrc], #8              // dX1 = F(N/2) = (c,d)
+        // twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,size,LSL #1
+
+        lsl     step1,size, #2                // step1 = N/4 * 8 = N/2*4 bytes
+        SUB     step1,step1,#8                // (N/4-1)*8 bytes
+
+        fadd    dY0,dX0,dX1                   // [b+d | a+c]
+        fsub    dY1,dX0,dX1                   // [b-d | a-c]
+        fmul    dY0, dY0, half[0]             // apply the 1/2 factor
+        fmul    dY1, dY1, half[0]
+
+        // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
+        // VZIP    dY0,dY1  (in-place zip, done here with zip1/zip2 through dZip)
+        zip1    dZip,dY0,dY1
+        zip2    dY1,dY0,dY1
+        mov     dY08b, dZip8b
+
+        fsub   dX0,dY0,dY1
+        SUBS   size,size,#2                   // size = N/2 - 2; these flags are consumed by BLT/BEQ below
+        fadd   dX1,dY0,dY1
+
+        SUB     pSrc,pSrc,step                // pSrc now at F(1)
+
+        st1     {dX0s}[0],[pOut1], #4         // Re Z(0) = ((a+c) - (b+d))/2
+        ADD     pTwiddleTmp,pTwiddle,#8       // W^2
+        st1     {dX1s}[1],[pOut1], #4         // Im Z(0) = ((a-c) + (b-d))/2
+        ADD     argTwiddle1,pTwiddle,twStep   // W^1
+
+
+        BLT     decrementScale\name           // N/2 < 2: nothing left to process
+        BEQ     lastElement\name              // N/2 == 2: only F(1) remains
+
+
+        // Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
+        // Note: W^k is stored as negative values in the table and also
+        // need to conjugate the values from the table.
+        //
+        // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+        // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
+
+
+        SUB     step,step,#24                 // (N/2-3)*8 bytes: distance from F(1) to F(N/2-2)
+evenOddButterflyLoop\name :
+
+
+        ld1     {dW0r},[argTwiddle1],step1    // 8 bytes at argTwiddle1, then 8 bytes step1 further on
+        ld1     {dW1r},[argTwiddle1], #8
+
+        ld2     {dX0r,dX0i},[pSrc],step       // two front elements, de-interleaved into two registers
+        SUB     argTwiddle1,argTwiddle1,step1 // net advance of 8 bytes per iteration
+        ld2     {dX1r,dX1i},[pSrc], #16       // two back elements, de-interleaved into two registers
+
+        SUB     step1,step1,#8                // (N/4-2)*8 bytes on the first pass
+        ld1     {dW0i},[pTwiddleTmp],step1    // 8 bytes at pTwiddleTmp, then 8 bytes step1 further on
+        ld1     {dW1i},[pTwiddleTmp], #8
+        SUB     pSrc,pSrc,step                // net advance of 16 bytes (two elements) per iteration
+
+        SUB     pTwiddleTmp,pTwiddleTmp,step1 // net advance of 8 bytes per iteration
+        rev64   dX1r,dX1r                     // swap lanes so the back pair lines up with the front pair
+        rev64   dX1i,dX1i
+        SUBS    size,size,#4                  // these flags are consumed by BGT at the end of the loop
+
+
+        fsub    dT2,dX0r,dX1r                 // a-c
+        fadd    dT3,dX0i,dX1i                 // b+d
+        fadd    dT0,dX0r,dX1r                 // a+c
+        fsub    dT1,dX0i,dX1i                 // b-d
+        SUB     step1,step1,#8                // second decrement: step1 shrinks by 16 per iteration
+
+        fmul    dT2, dT2, half[0]             // apply the 1/2 factor to all four terms
+        fmul    dT3, dT3, half[0]
+
+        fmul    dT0, dT0, half[0]
+        fmul    dT1, dT1, half[0]
+
+        // VZIP    dW1r,dW1i
+        // VZIP    dW0r,dW0i  (each in-place zip is done with zip1/zip2 through dZip)
+        zip1    dZip, dW1r,dW1i
+        zip2    dW1i,dW1r,dW1i
+        mov     dW1r8b, dZip8b
+        zip1    dZip,dW0r,dW0i
+        zip2    dW0i,dW0r,dW0i
+        mov     dW0r8b, dZip8b
+
+        fmul   dX1r,dW1r,dT2                  // twiddle products; dX0r..dX1i are reused as product registers
+        fmul   dX1i,dW1r,dT3
+        fmul   dX0r,dW0r,dT2
+        fmul   dX0i,dW0r,dT3
+
+        fmls   dX1r,dW1i,dT3
+        fmla   dX1i,dW1i,dT2
+
+        fmla   dX0r,dW0i,dT3
+        fmls   dX0i,dW0i,dT2
+
+
+        fadd    dY1r,dT0,dX1i                 // F(N/2 -1)
+        fsub    dY1i,dX1r,dT1
+
+        rev64   dY1r,dY1r                     // swap lanes back before storing the back pair
+        rev64   dY1i,dY1i
+
+
+        fadd    dY0r,dT0,dX0i                 // F(1)
+        fsub    dY0i,dT1,dX0r
+
+
+        st2     {dY0r,dY0i},[pOut1],step      // front pair, re-interleaved on store
+        st2     {dY1r,dY1i},[pOut1], #16      // back pair, re-interleaved on store
+        SUB     pOut1,pOut1,step              // net advance of 16 bytes per iteration
+        SUB     step,step,#32                 // front-to-back distance shrinks by 4 elements per iteration
+
+
+        BGT     evenOddButterflyLoop\name
+
+
+        // set both the ptrs to the last element
+        SUB     pSrc,pSrc,#8
+        SUB     pOut1,pOut1,#8
+
+        // Last element can be expanded as follows
+        // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
+        // -ve)
+        // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] - j (c-jd) [0+j2b]
+        // (a+bc, -bd)
+        // Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name :
+        ld1     {dX0r},[pSrc]                 // (a, b)
+
+        st1     {dX0rs}[0],[pOut1], #4        // store a
+        fneg    dX0r,dX0r
+        st1     {dX0rs}[1],[pOut1]            // store -b
+
+
+
+decrementScale\name :
+        // Exit label: nothing follows it inside the macro.
+        .endm
+        // Entry point. FFTSTAGE writes v8-v11 and v13, which are callee-saved under AAPCS64; the d15 argument presumably makes M_START/M_END preserve them - TODO confirm in arm64COMM_s.h.
+        M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
+            FFTSTAGE "FALSE","TRUE",Inv
+        M_END
+
+        .end