summaryrefslogtreecommitdiffstats
path: root/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S')
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S473
1 files changed, 473 insertions, 0 deletions
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
new file mode 100644
index 00000000000..f348e6a975c
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
@@ -0,0 +1,473 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute the first-stage radix-8 FFT pass for an N-point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+// Input registers: the five arguments, held in x0-x4
+// pTwiddle is defined here but is not referenced by the code in this file.
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+// Output registers (no aliases are defined for this category)
+
+
+// Local scratch registers
+// setCount shares x7 with grpSize, outPointStep shares x8 with pointStep, and t0 is the 32-bit register w12.
+#define subFFTNum x5
+#define subFFTSize x6
+#define grpSize x7
+// Reuse grpSize as setCount
+#define setCount x7
+#define pointStep x8
+#define outPointStep x8
+#define setStep x9
+#define step1 x10
+#define step2 x11
+#define t0 w12
+
+
+// Neon registers
+// dXrN/dXiN: real and imaginary lanes of input point data[N] as loaded by ld2 (v0-v15). The q* aliases in this table appear only inside commented-out instructions in this file.
+#define dXr0 v0.2s
+#define dXi0 v1.2s
+#define dXr1 v2.2s
+#define dXi1 v3.2s
+#define dXr2 v4.2s
+#define dXi2 v5.2s
+#define dXr3 v6.2s
+#define dXi3 v7.2s
+#define dXr4 v8.2s
+#define dXi4 v9.2s
+#define dXr5 v10.2s
+#define dXi5 v11.2s
+#define dXr6 v12.2s
+#define dXi6 v13.2s
+#define dXr7 v14.2s
+#define dXi7 v15.2s
+#define qX0 v0.4s
+#define qX1 v1.4s
+#define qX2 v2.4s
+#define qX3 v3.4s
+#define qX4 v4.4s
+#define qX5 v5.4s
+#define qX6 v6.4s
+#define qX7 v7.4s
+// dU*: results of the first butterfly stage. Even indices occupy v16-v23 and odd indices occupy v24-v31.
+#define dUr0 v16.2s
+#define dUi0 v17.2s
+#define dUr2 v18.2s
+#define dUi2 v19.2s
+#define dUr4 v20.2s
+#define dUi4 v21.2s
+#define dUr6 v22.2s
+#define dUi6 v23.2s
+#define dUr1 v24.2s
+#define dUi1 v25.2s
+#define dUr3 v26.2s
+#define dUi3 v27.2s
+#define dUr5 v28.2s
+#define dUi5 v29.2s
+// NOTE(review): this line used to say that dUr7/dUi7 reuse dXr7 and dXi7, but here they map to v30/v31 while dXr7/dXi7 are v14/v15, so nothing is shared.
+#define dUr7 v30.2s
+#define dUi7 v31.2s
+#define qU0 v8.4s
+#define qU1 v12.4s
+#define qU2 v9.4s
+#define qU3 v13.4s
+#define qU4 v10.4s
+#define qU5 v14.4s
+#define qU6 v11.4s
+#define qU7 v15.4s
+
+// dV*: results of the second butterfly stage. Even indices overlay the odd dU registers (v24-v31) and odd indices overlay the even dU registers (v16-v23), so a dV value may only be written once the dU value in the same register is dead.
+#define dVr0 v24.2s
+#define dVi0 v25.2s
+#define dVr2 v26.2s
+#define dVi2 v27.2s
+#define dVr4 v28.2s
+#define dVi4 v29.2s
+#define dVr6 v30.2s
+#define dVi6 v31.2s
+#define dVr1 v16.2s
+#define dVi1 v17.2s
+#define dVr3 v18.2s
+#define dVi3 v19.2s
+#define dVr5 v20.2s
+#define dVi5 v21.2s
+#define dVr7 v22.2s
+#define dVi7 v23.2s
+#define qV0 v12.4s
+#define qV1 v8.4s
+#define qV2 v13.4s
+#define qV3 v9.4s
+#define qV4 v14.4s
+#define qV5 v10.4s
+#define qV6 v15.4s
+#define qV7 v11.4s
+// dY*: third-stage results that get stored. Each dY name uses the same register as the dU name with the same index.
+#define dYr0 v16.2s
+#define dYi0 v17.2s
+#define dYr2 v18.2s
+#define dYi2 v19.2s
+#define dYr4 v20.2s
+#define dYi4 v21.2s
+#define dYr6 v22.2s
+#define dYi6 v23.2s
+#define dYr1 v24.2s
+#define dYi1 v25.2s
+#define dYr3 v26.2s
+#define dYi3 v27.2s
+#define dYr5 v28.2s
+#define dYi5 v29.2s
+#define dYr7 v30.2s
+#define dYi7 v31.2s
+#define qY0 v8.4s
+#define qY1 v12.4s
+#define qY2 v9.4s
+#define qY3 v13.4s
+#define qY4 v10.4s
+#define qY5 v14.4s
+#define qY6 v11.4s
+#define qY7 v15.4s
+// dT0/dT1: temporaries that overlay dXr7/dXi7 (v14/v15). dT0s is the element-indexed spelling of v14 used when inserting a scalar into lane 0.
+#define dT0 v14.2s
+#define dT0s v14.s
+#define dT1 v15.2s
+
+ .MACRO FFTSTAGE scaled, inverse, name
+ // Body of one first-stage radix-8 pass. The inverse argument ("TRUE" or anything else) selects the inverse or forward butterflies at assembly time, the name argument is pasted onto labels so every expansion gets unique ones, and the scaled argument is not referenced in this body.
+ // Define stack arguments (none are defined in this body)
+ // Every ld2/st2 below moves two complex floats (real lanes in one register, imaginary lanes in the other), so one loop pass handles two sets.
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize] // NOTE(review): this value is overwritten by MOV #8 below before it is read
+
+ // Update pSubFFTSize and pSubFFTNum regs
+ // subFFTSize is presumably 1 on entry to the first stage (TODO confirm against the caller); it is set to 8 here
+
+ movz t0, 0x3f35, lsl #16 // High half word of sqrt(1/2).
+ movk t0, 0x04f3 // Low half word of sqrt(1/2).
+ MOV subFFTSize,#8 // written back through pSubFFTSize at the end
+
+ // Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+ LSR grpSize,subFFTNum,#3
+ MOV subFFTNum,grpSize // written back through pSubFFTNum at the end
+
+
+ // pointStep is the byte distance between consecutive input points data[k] and data[k+1]:
+ // 8*grpSize bytes, i.e. grpSize complex values of two 32-bit floats each
+ // Note: outPointStep = pointStep for firststage
+
+ lsl pointStep,grpSize, #3
+
+
+ // Calculate the step of input data for the next set
+ //MOV step1,pointStep,LSL #1 // step1 = 2*pointStep
+ ld2 {dXr0,dXi0},[pSrc],pointStep // data[0]
+ lsl step1,grpSize, #4 // step1 = 16*grpSize = 2*pointStep
+ lsl step2,pointStep, #3 // 8*pointStep, reduced to 7*pointStep just below
+
+ ld2 {dXr1,dXi1},[pSrc],pointStep // data[1]
+ SUB step2,step2,pointStep // step2 = 7*pointStep
+ // setStep = - 7*pointStep+16. NOTE(review): rsb is not a base A64 mnemonic; presumably it is a macro from arm64COMM_s.h - confirm
+ rsb setStep,step2,#16
+
+ ld2 {dXr2,dXi2},[pSrc],pointStep // data[2]
+ ld2 {dXr3,dXi3},[pSrc],pointStep // data[3]
+ ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
+ ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
+ ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
+ // data[7] & update pSrc for the next set
+ // setStep = -7*pointStep + 16, so pSrc ends up 16 bytes past where data[0] was read
+ ld2 {dXr7,dXi7},[pSrc],setStep
+ // grp = 0 a special case since all the twiddle factors are 1
+ // Loop on the sets
+ // Loop invariant: dX0..dX7 already hold the eight input points of the two sets about to be processed.
+radix8fsGrpZeroSetLoop\name :
+
+ // Decrement setcount by the two sets handled per pass. The flags set here are consumed by the BEQ and BGT further down; no instruction in between writes the flags.
+ SUBS setCount,setCount,#2
+ // NOTE(review): the loop shape assumes setCount is even and at least 2 on entry - TODO confirm against the caller
+
+ // finish first stage of 8 point FFT
+
+ // fadd qU0,qX0,qX4
+ // fadd qU2,qX1,qX5
+ // fadd qU4,qX2,qX6
+ // fadd qU6,qX3,qX7
+ fadd dUr0,dXr0,dXr4
+ fadd dUr2,dXr1,dXr5
+ fadd dUr4,dXr2,dXr6
+ fadd dUr6,dXr3,dXr7
+ fadd dUi0,dXi0,dXi4
+ fadd dUi2,dXi1,dXi5
+ fadd dUi4,dXi2,dXi6
+ fadd dUi6,dXi3,dXi7
+
+ // finish second stage of 8 point FFT
+
+ // fadd qV0,qU0,qU4
+ // fsub qV2,qU0,qU4
+ // fadd qV4,qU2,qU6
+ // fsub qV6,qU2,qU6
+ fadd dVr0,dUr0,dUr4
+ fsub dVr2,dUr0,dUr4
+ fadd dVr4,dUr2,dUr6
+ fsub dVr6,dUr2,dUr6
+ fadd dVi0,dUi0,dUi4
+ fsub dVi2,dUi0,dUi4
+ fadd dVi4,dUi2,dUi6
+ fsub dVi6,dUi2,dUi6
+
+ // finish third stage of 8 point FFT
+
+ // fadd qY0,qV0,qV4
+ // fsub qY4,qV0,qV4
+ fadd dYr0,dVr0,dVr4
+ fsub dYr4,dVr0,dVr4
+ fadd dYi0,dVi0,dVi4
+ fsub dYi4,dVi0,dVi4
+
+ st2 {dYr0,dYi0},[pDst],step1 // store y0
+ // The even outputs y0,y2,y4,y6 are stored 2*pointStep apart; the odd first-stage differences are computed in between the stores.
+ .ifeqs "\inverse", "TRUE"
+ // Inverse variant: Y2 = (Vr2 - Vi6) + i(Vi2 + Vr6) and Y6 = (Vr2 + Vi6) + i(Vi2 - Vr6)
+ fsub dYr2,dVr2,dVi6
+ fadd dYi2,dVi2,dVr6
+
+ fadd dYr6,dVr2,dVi6
+ st2 {dYr2,dYi2},[pDst],step1 // store y2
+ fsub dYi6,dVi2,dVr6
+
+ // fsub qU1,qX0,qX4
+ fsub dUr1,dXr0,dXr4
+ fsub dUi1,dXi0,dXi4
+
+ st2 {dYr4,dYi4},[pDst],step1 // store y4
+
+ // fsub qU3,qX1,qX5
+ // fsub qU5,qX2,qX6
+ fsub dUr3,dXr1,dXr5
+ fsub dUr5,dXr2,dXr6
+ fsub dUi3,dXi1,dXi5
+ fsub dUi5,dXi2,dXi6
+
+ st2 {dYr6,dYi6},[pDst],step1 // store y6
+
+ .ELSE
+ // Forward variant: the same two values are formed as in the inverse variant, but the registers named dY6 go to output slot y2 and those named dY2 go to slot y6.
+ fadd dYr6,dVr2,dVi6
+ fsub dYi6,dVi2,dVr6
+
+ fsub dYr2,dVr2,dVi6
+ st2 {dYr6,dYi6},[pDst],step1 // store y2 (held in the dY6 registers in this variant)
+ fadd dYi2,dVi2,dVr6
+
+
+ // fsub qU1,qX0,qX4
+ fsub dUr1,dXr0,dXr4
+ fsub dUi1,dXi0,dXi4
+
+ st2 {dYr4,dYi4},[pDst],step1 // store y4
+
+ // fsub qU3,qX1,qX5
+ // fsub qU5,qX2,qX6
+ fsub dUr3,dXr1,dXr5
+ fsub dUr5,dXr2,dXr6
+ fsub dUi3,dXi1,dXi5
+ fsub dUi5,dXi2,dXi6
+
+ st2 {dYr2,dYi2},[pDst],step1 // store y6 (held in the dY2 registers in this variant)
+
+
+ .ENDIF
+
+ // finish first stage of 8 point FFT
+
+ // fsub qU7,qX3,qX7
+ fsub dUr7,dXr3,dXr7
+ fsub dUi7,dXi3,dXi7
+ // dXr7/dXi7 are dead from here until they are reloaded, which frees v14/v15 for dT0/dT1.
+ mov dT0s[0], t0 // v14 lane 0 = sqrt(1/2); redone every pass because v14 is also dXr7 and gets reloaded below
+
+ // finish second stage of 8 point FFT
+ // The loads of the next two sets are interleaved with the arithmetic; each dX register is reloaded only after its last use above.
+ fsub dVr1,dUr1,dUi5
+ // data[0] for next iteration
+ ld2 {dXr0,dXi0},[pSrc],pointStep
+ fadd dVi1,dUi1,dUr5
+ fadd dVr3,dUr1,dUi5
+ ld2 {dXr1,dXi1},[pSrc],pointStep // data[1]
+ fsub dVi3,dUi1,dUr5
+
+ fsub dVr5,dUr3,dUi7
+ ld2 {dXr2,dXi2},[pSrc],pointStep // data[2]
+ fadd dVi5,dUi3,dUr7
+ fadd dVr7,dUr3,dUi7
+ ld2 {dXr3,dXi3},[pSrc],pointStep // data[3]
+ fsub dVi7,dUi3,dUr7
+
+ // finish third stage of 8 point FFT
+
+ .ifeqs "\inverse", "TRUE"
+ // Inverse variant: Y1 = V1 + a*V5, Y5 = V1 - a*V5, Y3 = V3 - b*V7, Y7 = V3 + b*V7
+ // calculate a*v5, which the code below forms as sqrt(1/2) * ((Vr5 - Vi5) + i(Vr5 + Vi5))
+ fmul dT1,dVr5,dT0[0] // dT1 = sqrt(1/2)*Vr5. NOTE(review): the inherited remark about using dVi0 for dT1 does not match this file, where dT1 is v15
+
+ ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
+ fmul dVi5,dVi5,dT0[0]
+
+ ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
+ fsub dVr5,dT1,dVi5 // a * V5
+ fadd dVi5,dT1,dVi5
+
+ ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
+
+ // calculate b*v7, which the code below forms as sqrt(1/2) * ((Vr7 + Vi7) + i(Vi7 - Vr7))
+ fmul dT1,dVr7,dT0[0]
+ fmul dVi7,dVi7,dT0[0]
+
+ // fadd qY1,qV1,qV5
+ // fsub qY5,qV1,qV5
+ fadd dYr1,dVr1,dVr5
+ fsub dYr5,dVr1,dVr5
+ fadd dYi1,dVi1,dVi5
+ fsub dYi5,dVi1,dVi5
+
+ fadd dVr7,dT1,dVi7 // b * V7
+ fsub dVi7,dVi7,dT1
+ SUB pDst, pDst, step2 // set pDst to y1
+
+ // On the last iteration, this will read past the end of pSrc,
+ // so skip this read. (Z is still the result of the SUBS at the loop top.)
+ BEQ radix8SkipLastUpdateInv\name
+ ld2 {dXr7,dXi7},[pSrc],setStep // data[7]
+radix8SkipLastUpdateInv\name:
+ // dT0 and dT1 are not read after this point, so the reload of v14/v15 above is safe.
+ fsub dYr3,dVr3,dVr7
+ fsub dYi3,dVi3,dVi7
+ st2 {dYr1,dYi1},[pDst],step1 // store y1
+ fadd dYr7,dVr3,dVr7
+ fadd dYi7,dVi3,dVi7
+
+
+ st2 {dYr3,dYi3},[pDst],step1 // store y3
+ st2 {dYr5,dYi5},[pDst],step1 // store y5
+ st2 {dYr7,dYi7},[pDst] // store y7
+ ADD pDst, pDst, #16 // same net effect as the 16-byte post-index used by the forward variant
+
+ .ELSE
+ // Forward variant: the same four values are formed as in the inverse variant, but they are stored to the slots in reverse order (dY7 to y1, dY5 to y3, dY3 to y5, dY1 to y7).
+ // calculate b*v7, which the code below forms as sqrt(1/2) * ((Vr7 + Vi7) + i(Vi7 - Vr7))
+ fmul dT1,dVr7,dT0[0]
+ ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
+ fmul dVi7,dVi7,dT0[0]
+
+ ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
+ fadd dVr7,dT1,dVi7 // b * V7
+ fsub dVi7,dVi7,dT1
+
+ ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
+
+ // calculate a*v5, which the code below forms as sqrt(1/2) * ((Vr5 - Vi5) + i(Vr5 + Vi5))
+ fmul dT1,dVr5,dT0[0] // dT1 = sqrt(1/2)*Vr5. NOTE(review): the inherited remark about using dVi0 for dT1 does not match this file, where dT1 is v15
+ fmul dVi5,dVi5,dT0[0]
+
+ fadd dYr7,dVr3,dVr7
+ fadd dYi7,dVi3,dVi7
+ SUB pDst, pDst, step2 // set pDst to y1
+
+ fsub dVr5,dT1,dVi5 // a * V5
+ fadd dVi5,dT1,dVi5
+
+ // On the last iteration, this will read past the end of pSrc,
+ // so skip this read. (Z is still the result of the SUBS at the loop top.)
+ BEQ radix8SkipLastUpdateFwd\name
+ ld2 {dXr7,dXi7},[pSrc],setStep // data[7]
+radix8SkipLastUpdateFwd\name:
+
+ // fsub qY5,qV1,qV5
+ fsub dYr5,dVr1,dVr5
+ fsub dYi5,dVi1,dVi5
+
+ fsub dYr3,dVr3,dVr7
+ st2 {dYr7,dYi7},[pDst],step1 // store y1 (held in the dY7 registers in this variant)
+ fsub dYi3,dVi3,dVi7
+
+ // fadd qY1,qV1,qV5
+ fadd dYr1,dVr1,dVr5
+ fadd dYi1,dVi1,dVi5
+
+ st2 {dYr5,dYi5},[pDst],step1 // store y3 (held in the dY5 registers in this variant)
+ st2 {dYr3,dYi3},[pDst],step1 // store y5 (held in the dY3 registers in this variant)
+ st2 {dYr1,dYi1},[pDst],#16 // store y7 (held in the dY1 registers in this variant)
+
+ .ENDIF
+
+
+ // update pDst for the next set: it is 16 bytes past the y7 slot, so backing up 7*pointStep lands 16 bytes past the y0 slot just written
+ SUB pDst, pDst, step2
+ BGT radix8fsGrpZeroSetLoop\name
+
+ // Save subFFTNum and subFFTSize for next stage
+ str subFFTNum, [pSubFFTNum]
+ str subFFTSize, [pSubFFTSize]
+
+ .endm
+
+
+ // Allocate stack memory required by the function
+ // Forward transform entry point: expands FFTSTAGE with scaled = "FALSE", inverse = "FALSE" and label suffix FWD.
+ // NOTE(review): M_START/M_END come from arm64COMM_s.h; the d15 argument presumably requests save/restore of the callee-saved FP registers up to d15, which the body needs because it writes v8-v15 - confirm in that header.
+ M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace,,d15
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+ // Inverse transform entry point: expands FFTSTAGE with scaled = "FALSE", inverse = "TRUE" and label suffix INV, which selects the inverse butterfly branches of the macro.
+ // NOTE(review): M_START/M_END come from arm64COMM_s.h; the d15 argument presumably requests save/restore of the callee-saved FP registers up to d15, which the body needs because it writes v8-v15 - confirm in that header.
+ M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace,,d15
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+
+ .end