summaryrefslogtreecommitdiffstats
path: root/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S')
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S371
1 files changed, 371 insertions, 0 deletions
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
new file mode 100644
index 00000000000..2fc2e604f78
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
@@ -0,0 +1,371 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+// Guarding implementation by the processor name
+
+
+// Import symbols required from other files
+// (For example tables)
+ //IMPORT armAAC_constTable
+
+//Input Registers
+// The two counters (x3, x4) are addresses; FFTSTAGE reads them with ldr.
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+
+//Output Registers
+// (none are declared in this file)
+
+//Local Scratch Registers
+// General-purpose temporaries used only inside FFTSTAGE.
+#define subFFTNum x5
+#define subFFTSize x6
+#define outPointStep x8
+#define grpCount x9
+#define dstStep x10
+#define grpTwStep x13
+#define stepTwiddle x14
+#define twStep x15
+#define step16 x11
+#define step24 x12
+
+
+// Neon Registers
+// Names ending in 8b are byte-lane views of the same register, used as mov operands.
+#define dButterfly1Real02 v0.2s
+#define dButterfly1Real028b v0.8b
+#define dButterfly1Imag02 v1.2s
+#define dButterfly1Imag028b v1.8b
+#define dButterfly1Real13 v2.2s
+#define dButterfly1Real138b v2.8b
+#define dButterfly1Imag13 v3.2s
+#define dButterfly1Imag138b v3.8b
+#define dButterfly2Real02 v4.2s
+#define dButterfly2Imag02 v5.2s
+#define dButterfly2Real13 v6.2s
+#define dButterfly2Imag13 v7.2s
+#define dXr0 v0.2s
+#define dXi0 v1.2s
+#define dXr08b v0.8b
+#define dXi08b v1.8b
+#define dXr1 v2.2s
+#define dXi1 v3.2s
+#define dXr2 v4.2s
+#define dXi2 v5.2s
+#define dXr3 v6.2s
+#define dXi3 v7.2s
+// The dXr0..dXi3 names above alias the same v0-v7 registers as the dButterfly names.
+#define dYr0 v16.2s
+#define dYi0 v17.2s
+#define dYr1 v18.2s
+#define dYi1 v19.2s
+#define dYr2 v20.2s
+#define dYi2 v21.2s
+#define dYr3 v22.2s
+#define dYi3 v23.2s
+// v8-v15 below (the dW names plus dZr0/dZi0) are written by FFTSTAGE.
+#define dW1r v8.2s
+#define dW1i v9.2s
+#define dW2r v10.2s
+#define dW2r8b v10.8b
+#define dW2i v11.2s
+#define dW3r v12.2s
+#define dW3r8b v12.8b
+#define dW3i v13.2s
+
+#define dZr0 v14.2s
+#define dZi0 v15.2s
+#define dZr08b v14.8b
+#define dZi08b v15.8b
+#define dZr1 v26.2s
+#define dZi1 v27.2s
+#define dZr2 v28.2s
+#define dZi2 v29.2s
+#define dZr3 v30.2s
+#define dZi3 v31.2s
+// dZip is the temporary used by the zip/uzp sequences in FFTSTAGE.
+#define dZip v24.2s
+#define dZip8b v24.8b
+
+ .MACRO FFTSTAGE scaled, inverse , name
+ // Radix-4 stage body: reads pSrc in 64-byte chunks (two ld4 of 32 bytes) and scatters results through pDst.
+ // scaled is not referenced in this body; inverse selects the .ifeqs arms; name is appended to every label.
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum] // value is replaced by the MOV #1 below before any use
+ ldr subFFTSize, [pSubFFTSize]
+
+ // pOut0+1 increments pOut0 by 8 bytes
+ // pOut0+outPointStep == increment of 8*outPointStep bytes
+ lsl outPointStep,subFFTSize, #3 // byte stride between the four stores: 8*subFFTSize
+
+ // Update grpCount and grpSize rightaway
+
+ ld2 {dW1r,dW1i},[pTwiddle] // [wi|wr]
+ MOV step16,#16
+ LSL grpCount,subFFTSize,#2 // loop counter = 4*subFFTSize, decremented by 8 per pass
+
+ ld1 {dW2r},[pTwiddle] // [wi|wr]
+ MOV subFFTNum,#1 //after the last stage
+
+ ld1 {dW3r},[pTwiddle],step16 // [wi|wr]
+ MOV stepTwiddle,#0
+
+ ld1 {dW2i},[pTwiddle],#8 // [wi|wr]
+ SUB grpTwStep,stepTwiddle,#8 // grpTwStep = -8 to start with
+
+ // update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+ ld1 {dW3i},[pTwiddle],grpTwStep // [wi|wr]
+ lsl dstStep,outPointStep, #1 // dstStep = 2*outPointStep
+
+ // AC.r AC.i BD.r BD.i
+ ld4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
+ ADD dstStep,dstStep,outPointStep // dstStep = 3*outPointStep
+ // NOTE(review): rsb is not a base A64 mnemonic; presumably a macro from arm64COMM_s.h - confirm
+ rsb dstStep,dstStep,#16 // dstStep = - 3*outPointStep+16
+ MOV step24,#24
+
+ // AC.r AC.i BD.r BD.i
+ ld4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
+
+
+ // Process two groups at a time
+
+radix4lsGrpLoop\name :
+ // Each zip1/zip2/mov or uzp1/uzp2/mov triple below stands in for the VZIP/VUZP named in its comment, via dZip.
+ // VZIP dW2r,dW2i
+ zip1 dZip, dW2r, dW2i
+ zip2 dW2i, dW2r, dW2i
+ mov dW2r8b, dZip8b
+
+ ADD stepTwiddle,stepTwiddle,#16
+
+ // VZIP dW3r,dW3i
+ zip1 dZip, dW3r,dW3i
+ zip2 dW3i, dW3r, dW3i
+ mov dW3r8b, dZip8b
+ ADD grpTwStep,stepTwiddle,#4 // doubled and negated by the lsl/rsb further down
+
+ // VUZP dButterfly1Real13, dButterfly2Real13 // B.r D.r
+ uzp1 dZip, dButterfly1Real13, dButterfly2Real13 // B.r D.r
+ uzp2 dButterfly2Real13, dButterfly1Real13, dButterfly2Real13 // B.r D.r
+ mov dButterfly1Real138b, dZip8b
+
+ SUB twStep,stepTwiddle,#16 // -16+stepTwiddle
+
+ // VUZP dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
+ uzp1 dZip, dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
+ uzp2 dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
+ mov dButterfly1Imag138b, dZip8b
+ lsl grpTwStep,grpTwStep,#1
+
+ // VUZP dButterfly1Real02, dButterfly2Real02 // A.r C.r
+ uzp1 dZip, dButterfly1Real02, dButterfly2Real02 // A.r C.r
+ uzp2 dButterfly2Real02, dButterfly1Real02, dButterfly2Real02 // A.r C.r
+ mov dButterfly1Real028b, dZip8b
+ rsb grpTwStep,grpTwStep,#0 // -8-2*stepTwiddle
+
+ // VUZP dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
+ uzp1 dZip, dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
+ uzp2 dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
+ mov dButterfly1Imag028b, dZip8b
+
+
+ // grpCount is multiplied by 4
+ SUBS grpCount,grpCount,#8 // flags set here drive the bne/beq pair and the closing BGT
+
+ .ifeqs "\inverse", "TRUE"
+ fmul dZr1,dW1r,dXr1
+ fmla dZr1,dW1i,dXi1 // real part
+ fmul dZi1,dW1r,dXi1
+ fmls dZi1,dW1i,dXr1 // imag part
+ // (inverse arm above: Z1 = conj(W1)*X1; forward arm below: Z1 = W1*X1)
+ .else
+
+ fmul dZr1,dW1r,dXr1
+ fmls dZr1,dW1i,dXi1 // real part
+ fmul dZi1,dW1r,dXi1
+ fmla dZi1,dW1i,dXr1 // imag part
+
+ .endif
+
+ ld2 {dW1r,dW1i},[pTwiddle],stepTwiddle // [wi|wr]
+
+ .ifeqs "\inverse", "TRUE"
+ fmul dZr2,dW2r,dXr2
+ fmla dZr2,dW2i,dXi2 // real part
+ fmul dZi2,dW2r,dXi2
+ ld1 {dW2r},[pTwiddle],step16 // [wi|wr]
+ fmls dZi2,dW2i,dXr2 // imag part
+ // (same pairing for Z2: conj(W2)*X2 above, W2*X2 below; the dW2r reload sits between the multiplies)
+ .else
+
+ fmul dZr2,dW2r,dXr2
+ fmls dZr2,dW2i,dXi2 // real part
+ fmul dZi2,dW2r,dXi2
+ ld1 {dW2r},[pTwiddle],step16 // [wi|wr]
+ fmla dZi2,dW2i,dXr2 // imag part
+
+ .endif
+
+
+ ld1 {dW2i},[pTwiddle],twStep // [wi|wr]
+
+ // move qX0 so as to load for the next iteration
+ // MOV qZ0,qX0
+ mov dZr08b, dXr08b
+ mov dZi08b, dXi08b
+
+ .ifeqs "\inverse", "TRUE"
+ fmul dZr3,dW3r,dXr3
+ fmla dZr3,dW3i,dXi3 // real part
+ fmul dZi3,dW3r,dXi3
+ ld1 {dW3r},[pTwiddle],step24
+ fmls dZi3,dW3i,dXr3 // imag part
+ // (same pairing for Z3: conj(W3)*X3 above, W3*X3 below)
+ .else
+
+ fmul dZr3,dW3r,dXr3
+ fmls dZr3,dW3i,dXi3 // real part
+ fmul dZi3,dW3r,dXi3
+ ld1 {dW3r},[pTwiddle],step24
+ fmla dZi3,dW3i,dXr3 // imag part
+
+ .endif
+
+ ld1 {dW3i},[pTwiddle],grpTwStep // [wi|wr]
+
+ // Don't do the load on the last iteration so we don't read past the end
+ // of pSrc.
+ bne skipIncrement\name
+ add pSrc, pSrc, #64 // final pass only (Z set by the SUBS): advance pSrc without reading
+skipIncrement\name:
+ beq radix4lsSkipRead\name
+ // AC.r AC.i BD.r BD.i
+ ld4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
+
+ // AC.r AC.i BD.r BD.i
+ ld4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
+radix4lsSkipRead\name:
+
+ // finish first stage of 4 point FFT
+
+ // fadd qY0,qZ0,qZ2
+ fadd dYr0,dZr0,dZr2
+ fadd dYi0,dZi0,dZi2
+ // fsub qY2,qZ0,qZ2
+ fsub dYr2,dZr0,dZr2
+ fsub dYi2,dZi0,dZi2
+ // fadd qY1,qZ1,qZ3
+ fadd dYr1,dZr1,dZr3
+ fadd dYi1,dZi1,dZi3
+ // fsub qY3,qZ1,qZ3
+ fsub dYr3,dZr1,dZr3
+ fsub dYi3,dZi1,dZi3
+
+ // First store below is Z0 - Z2 - (Z1 + Z3). NOTE(review): presumably pairs with the twiddle table sign convention - confirm
+ // finish second stage of 4 point FFT
+
+ .ifeqs "\inverse", "TRUE"
+
+ // fsub qZ0,qY2,qY1
+ fsub dZr0,dYr2,dYr1
+ fsub dZi0,dYi2,dYi1
+ fadd dZr3,dYr0,dYi3
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ fsub dZi3,dYi0,dYr3
+
+ // fadd qZ2,qY2,qY1
+ fadd dZr2,dYr2,dYr1
+ fadd dZi2,dYi2,dYi1
+
+ st2 {dZr3,dZi3},[pDst],outPointStep
+
+ fsub dZr1,dYr0,dYi3
+ st2 {dZr2,dZi2},[pDst],outPointStep
+ fadd dZi1,dYi0,dYr3
+
+ // dstStep = -3*outPointStep + 16 (rewinds the three strides, then steps 16 bytes)
+ st2 {dZr1,dZi1},[pDst],dstStep
+
+
+ .else
+
+ // fsub qZ0,qY2,qY1
+ fsub dZr0,dYr2,dYr1
+ fsub dZi0,dYi2,dYi1
+
+ fsub dZr1,dYr0,dYi3
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ fadd dZi1,dYi0,dYr3
+
+ // fadd qZ2,qY2,qY1
+ fadd dZr2,dYr2,dYr1
+ fadd dZi2,dYi2,dYi1
+
+ st2 {dZr1,dZi1},[pDst],outPointStep
+
+ fadd dZr3,dYr0,dYi3
+ st2 {dZr2,dZi2},[pDst],outPointStep
+ fsub dZi3,dYi0,dYr3
+
+ // dstStep = -3*outPointStep + 16 (rewinds the three strides, then steps 16 bytes)
+ st2 {dZr3,dZi3},[pDst],dstStep
+
+
+ .endif
+
+ BGT radix4lsGrpLoop\name
+
+ .endm
+// Forward entry point: expands FFTSTAGE with inverse=FALSE and label suffix fwd.
+// NOTE(review): d15 presumably tells M_START/M_END (arm64COMM_s.h, not shown) to preserve FP registers up to d15; the body writes v8-v15 - confirm.
+ M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
+ FFTSTAGE "FALSE","FALSE",fwd
+ M_END
+// Inverse entry point: expands FFTSTAGE with inverse=TRUE and label suffix inv.
+// NOTE(review): d15 presumably tells M_START/M_END (arm64COMM_s.h, not shown) to preserve FP registers up to d15; the body writes v8-v15 - confirm.
+ M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
+ FFTSTAGE "FALSE","TRUE",inv
+ M_END
+
+
+ .end