summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/pffft/test_pffft.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/pffft/test_pffft.c')
-rw-r--r--src/3rdparty/pffft/test_pffft.c463
1 files changed, 463 insertions, 0 deletions
diff --git a/src/3rdparty/pffft/test_pffft.c b/src/3rdparty/pffft/test_pffft.c
new file mode 100644
index 000000000..b217b6bec
--- /dev/null
+++ b/src/3rdparty/pffft/test_pffft.c
@@ -0,0 +1,463 @@
+/*
+ Copyright (c) 2013 Julien Pommier.
+
+ Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, Intel MKL, and Apple vDSP
+
+ How to build:
+
+ on linux, with fftw3:
+ gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+ on macos, without fftw3:
+ clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
+
+ on macos, with fftw3:
+ clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
+
+ on macos, with fftw3 and Intel MKL:
+ clang -o test_pffft -I /opt/intel/mkl/include -DHAVE_FFTW -DHAVE_VECLIB -DHAVE_MKL -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate /opt/intel/mkl/lib/libmkl_{intel_lp64,sequential,core}.a
+
+ on windows, with visual c++:
+ cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
+
+ build without SIMD instructions:
+ gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
+
+ */
+
+#include "pffft.h"
+#include "fftpack.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+
+#ifdef HAVE_SYS_TIMES
+# include <sys/times.h>
+# include <unistd.h>
+#endif
+
+#ifdef HAVE_VECLIB
+# include <Accelerate/Accelerate.h>
+#endif
+
+#ifdef HAVE_FFTW
+# include <fftw3.h>
+#endif
+
+#ifdef HAVE_MKL
+# include <mkl_dfti.h>
+#endif
+
+#define MAX_OF(x,y) ((x)>(y)?(x):(y))
+
+double frand() {
+ return rand()/(double)RAND_MAX;
+}
+
+#if defined(HAVE_SYS_TIMES)
+ inline double uclock_sec(void) {
+ static double ttclk = 0.;
+ if (ttclk == 0.) ttclk = sysconf(_SC_CLK_TCK);
+ struct tms t; return ((double)times(&t)) / ttclk;
+ }
+# else
+ double uclock_sec(void)
+{ return (double)clock()/(double)CLOCKS_PER_SEC; }
+#endif
+
+
+/* compare results with the regular fftpack */
+void pffft_validate_N(int N, int cplx) {
+ int Nfloat = N*(cplx?2:1);
+ int Nbytes = Nfloat * sizeof(float);
+ float *ref, *in, *out, *tmp, *tmp2;
+ PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+ int pass;
+
+ if (!s) { printf("Skipping N=%d, not supported\n", N); return; }
+ ref = pffft_aligned_malloc(Nbytes);
+ in = pffft_aligned_malloc(Nbytes);
+ out = pffft_aligned_malloc(Nbytes);
+ tmp = pffft_aligned_malloc(Nbytes);
+ tmp2 = pffft_aligned_malloc(Nbytes);
+
+ for (pass=0; pass < 2; ++pass) {
+ float ref_max = 0;
+ int k;
+ //printf("N=%d pass=%d cplx=%d\n", N, pass, cplx);
+ // compute reference solution with FFTPACK
+ if (pass == 0) {
+ float *wrk = malloc(2*Nbytes+15*sizeof(float));
+ for (k=0; k < Nfloat; ++k) {
+ ref[k] = in[k] = frand()*2-1;
+ out[k] = 1e30;
+ }
+ if (!cplx) {
+ rffti(N, wrk);
+ rfftf(N, ref, wrk);
+ // use our ordering for real ffts instead of the one of fftpack
+ {
+ float refN=ref[N-1];
+ for (k=N-2; k >= 1; --k) ref[k+1] = ref[k];
+ ref[1] = refN;
+ }
+ } else {
+ cffti(N, wrk);
+ cfftf(N, ref, wrk);
+ }
+ free(wrk);
+ }
+
+ for (k = 0; k < Nfloat; ++k) ref_max = MAX_OF(ref_max, fabs(ref[k]));
+
+
+ // pass 0 : non canonical ordering of transform coefficients
+ if (pass == 0) {
+ // test forward transform, with different input / output
+ pffft_transform(s, in, tmp, 0, PFFFT_FORWARD);
+ memcpy(tmp2, tmp, Nbytes);
+ memcpy(tmp, in, Nbytes);
+ pffft_transform(s, tmp, tmp, 0, PFFFT_FORWARD);
+ for (k = 0; k < Nfloat; ++k) {
+ assert(tmp2[k] == tmp[k]);
+ }
+
+ // test reordering
+ pffft_zreorder(s, tmp, out, PFFFT_FORWARD);
+ pffft_zreorder(s, out, tmp, PFFFT_BACKWARD);
+ for (k = 0; k < Nfloat; ++k) {
+ assert(tmp2[k] == tmp[k]);
+ }
+ pffft_zreorder(s, tmp, out, PFFFT_FORWARD);
+ } else {
+ // pass 1 : canonical ordering of transform coeffs.
+ pffft_transform_ordered(s, in, tmp, 0, PFFFT_FORWARD);
+ memcpy(tmp2, tmp, Nbytes);
+ memcpy(tmp, in, Nbytes);
+ pffft_transform_ordered(s, tmp, tmp, 0, PFFFT_FORWARD);
+ for (k = 0; k < Nfloat; ++k) {
+ assert(tmp2[k] == tmp[k]);
+ }
+ memcpy(out, tmp, Nbytes);
+ }
+
+ {
+ for (k=0; k < Nfloat; ++k) {
+ if (!(fabs(ref[k] - out[k]) < 1e-3*ref_max)) {
+ printf("%s forward PFFFT mismatch found for N=%d\n", (cplx?"CPLX":"REAL"), N);
+ exit(1);
+ }
+ }
+
+ if (pass == 0) pffft_transform(s, tmp, out, 0, PFFFT_BACKWARD);
+ else pffft_transform_ordered(s, tmp, out, 0, PFFFT_BACKWARD);
+ memcpy(tmp2, out, Nbytes);
+ memcpy(out, tmp, Nbytes);
+ if (pass == 0) pffft_transform(s, out, out, 0, PFFFT_BACKWARD);
+ else pffft_transform_ordered(s, out, out, 0, PFFFT_BACKWARD);
+ for (k = 0; k < Nfloat; ++k) {
+ assert(tmp2[k] == out[k]);
+ out[k] *= 1.f/N;
+ }
+ for (k = 0; k < Nfloat; ++k) {
+ if (fabs(in[k] - out[k]) > 1e-3 * ref_max) {
+ printf("pass=%d, %s IFFFT does not match for N=%d\n", pass, (cplx?"CPLX":"REAL"), N); break;
+ exit(1);
+ }
+ }
+ }
+
+ // quick test of the circular convolution in fft domain
+ {
+ float conv_err = 0, conv_max = 0;
+
+ pffft_zreorder(s, ref, tmp, PFFFT_FORWARD);
+ memset(out, 0, Nbytes);
+ pffft_zconvolve_accumulate(s, ref, ref, out, 1.0);
+ pffft_zreorder(s, out, tmp2, PFFFT_FORWARD);
+
+ for (k=0; k < Nfloat; k += 2) {
+ float ar = tmp[k], ai=tmp[k+1];
+ if (cplx || k > 0) {
+ tmp[k] = ar*ar - ai*ai;
+ tmp[k+1] = 2*ar*ai;
+ } else {
+ tmp[0] = ar*ar;
+ tmp[1] = ai*ai;
+ }
+ }
+
+ for (k=0; k < Nfloat; ++k) {
+ float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]);
+ if (d > conv_err) conv_err = d;
+ if (e > conv_max) conv_max = e;
+ }
+ if (conv_err > 1e-5*conv_max) {
+ printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1);
+ }
+ }
+
+ }
+
+ printf("%s PFFFT is OK for N=%d\n", (cplx?"CPLX":"REAL"), N); fflush(stdout);
+
+ pffft_destroy_setup(s);
+ pffft_aligned_free(ref);
+ pffft_aligned_free(in);
+ pffft_aligned_free(out);
+ pffft_aligned_free(tmp);
+ pffft_aligned_free(tmp2);
+}
+
+void pffft_validate(int cplx) {
+ static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0};
+ int k;
+ for (k = 0; Ntest[k]; ++k) {
+ int N = Ntest[k];
+ if (N == 16 && !cplx) continue;
+ pffft_validate_N(N, cplx);
+ }
+}
+
+int array_output_format = 1;
+
+void show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter) {
+ float mflops = flops/1e6/(t1 - t0 + 1e-16);
+ if (array_output_format) {
+ if (flops != -1) {
+ printf("|%9.0f ", mflops);
+ } else printf("| n/a ");
+ } else {
+ if (flops != -1) {
+ printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter);
+ }
+ }
+ fflush(stdout);
+}
+
+void benchmark_ffts(int N, int cplx) {
+ int Nfloat = (cplx ? N*2 : N);
+ int Nbytes = Nfloat * sizeof(float);
+ float *X = pffft_aligned_malloc(Nbytes), *Y = pffft_aligned_malloc(Nbytes), *Z = pffft_aligned_malloc(Nbytes);
+
+ double t0, t1, flops;
+
+ int k;
+ int max_iter = 5120000/N*4;
+#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
+ max_iter /= 8;
+#endif
+ int iter;
+
+ for (k = 0; k < Nfloat; ++k) {
+ X[k] = 0; //sqrtf(k+1);
+ }
+
+ // FFTPack benchmark
+ {
+ float *wrk = malloc(2*Nbytes + 15*sizeof(float));
+ int max_iter_ = max_iter/pffft_simd_size(); if (max_iter_ == 0) max_iter_ = 1;
+ if (cplx) cffti(N, wrk);
+ else rffti(N, wrk);
+ t0 = uclock_sec();
+
+ for (iter = 0; iter < max_iter_; ++iter) {
+ if (cplx) {
+ cfftf(N, X, wrk);
+ cfftb(N, X, wrk);
+ } else {
+ rfftf(N, X, wrk);
+ rfftb(N, X, wrk);
+ }
+ }
+ t1 = uclock_sec();
+ free(wrk);
+
+ flops = (max_iter_*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+ show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
+ }
+
+#ifdef HAVE_VECLIB
+ int log2N = (int)(log(N)/log(2) + 0.5f);
+ if (N == (1<<log2N)) {
+ FFTSetup setup;
+
+ setup = vDSP_create_fftsetup(log2N, FFT_RADIX2);
+ DSPSplitComplex zsamples;
+ zsamples.realp = &X[0];
+ zsamples.imagp = &X[Nfloat/2];
+ t0 = uclock_sec();
+ for (iter = 0; iter < max_iter; ++iter) {
+ if (cplx) {
+ vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Forward);
+ vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
+ } else {
+ vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Forward);
+ vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
+ }
+ }
+ t1 = uclock_sec();
+ vDSP_destroy_fftsetup(setup);
+
+ flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+ show_output("vDSP", N, cplx, flops, t0, t1, max_iter);
+ } else {
+ show_output("vDSP", N, cplx, -1, -1, -1, -1);
+ }
+#endif
+
+#ifdef HAVE_MKL
+ {
+ DFTI_DESCRIPTOR_HANDLE fft_handle;
+ if (DftiCreateDescriptor(&fft_handle, DFTI_SINGLE, (cplx ? DFTI_COMPLEX : DFTI_REAL), 1, N) == 0) {
+ assert(DftiSetValue(fft_handle, DFTI_PLACEMENT, DFTI_INPLACE) == 0);
+ assert(DftiCommitDescriptor(fft_handle) == 0);
+
+ t0 = uclock_sec();
+ for (iter = 0; iter < max_iter; ++iter) {
+ DftiComputeForward(fft_handle, &X[0]);
+ DftiComputeBackward(fft_handle, &X[0]);
+ }
+ t1 = uclock_sec();
+
+
+ DftiFreeDescriptor(&fft_handle);
+ flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+ show_output("MKL ", N, cplx, flops, t0, t1, max_iter);
+ } else {
+ show_output(" MKL", N, cplx, -1, -1, -1, -1);
+ }
+ }
+#endif
+
+#ifdef HAVE_FFTW
+ {
+ fftwf_plan planf, planb;
+ fftw_complex *in = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
+ fftw_complex *out = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
+ memset(in, 0, sizeof(fftw_complex) * N);
+ int flags = (N < 40000 ? FFTW_MEASURE : FFTW_ESTIMATE); // measure takes a lot of time on largest ffts
+ //int flags = FFTW_ESTIMATE;
+ if (cplx) {
+ planf = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_FORWARD, flags);
+ planb = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_BACKWARD, flags);
+ } else {
+ planf = fftwf_plan_dft_r2c_1d(N, (float*)in, (fftwf_complex*)out, flags);
+ planb = fftwf_plan_dft_c2r_1d(N, (fftwf_complex*)in, (float*)out, flags);
+ }
+
+ t0 = uclock_sec();
+ for (iter = 0; iter < max_iter; ++iter) {
+ fftwf_execute(planf);
+ fftwf_execute(planb);
+ }
+ t1 = uclock_sec();
+
+ fftwf_destroy_plan(planf);
+ fftwf_destroy_plan(planb);
+ fftwf_free(in); fftwf_free(out);
+
+ flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+ show_output((flags == FFTW_MEASURE ? "FFTW (meas.)" : " FFTW (estim)"), N, cplx, flops, t0, t1, max_iter);
+ }
+#endif
+
+ // PFFFT benchmark
+ {
+ PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+ if (s) {
+ t0 = uclock_sec();
+ for (iter = 0; iter < max_iter; ++iter) {
+ pffft_transform(s, X, Z, Y, PFFFT_FORWARD);
+ pffft_transform(s, X, Z, Y, PFFFT_BACKWARD);
+ }
+ t1 = uclock_sec();
+ pffft_destroy_setup(s);
+
+ flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+ show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
+ }
+ }
+
+ if (!array_output_format) {
+ printf("--\n");
+ }
+
+ pffft_aligned_free(X);
+ pffft_aligned_free(Y);
+ pffft_aligned_free(Z);
+}
+
+#ifndef PFFFT_SIMD_DISABLE
+void validate_pffft_simd(); // a small function inside pffft.c that will detect compiler bugs with respect to simd instruction
+#endif
+
+int main(int argc, char **argv) {
+ int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 };
+ int i;
+
+ if (argc > 1 && strcmp(argv[1], "--no-array-format") == 0) {
+ array_output_format = 0;
+ }
+
+#ifndef PFFFT_SIMD_DISABLE
+ validate_pffft_simd();
+#endif
+ pffft_validate(1);
+ pffft_validate(0);
+ if (!array_output_format) {
+ // display a nice markdown array
+ for (i=0; Nvalues[i] > 0; ++i) {
+ benchmark_ffts(Nvalues[i], 0 /* real fft */);
+ }
+ for (i=0; Nvalues[i] > 0; ++i) {
+ benchmark_ffts(Nvalues[i], 1 /* cplx fft */);
+ }
+ } else {
+ int columns = 0;
+ printf("| input len "); ++columns;
+ printf("|real FFTPack"); ++columns;
+#ifdef HAVE_VECLIB
+ printf("| real vDSP "); ++columns;
+#endif
+#ifdef HAVE_MKL
+ printf("| real MKL "); ++columns;
+#endif
+#ifdef HAVE_FFTW
+ printf("| real FFTW "); ++columns;
+#endif
+ printf("| real PFFFT "); ++columns;
+
+ printf("|cplx FFTPack"); ++columns;
+#ifdef HAVE_VECLIB
+ printf("| cplx vDSP "); ++columns;
+#endif
+#ifdef HAVE_MKL
+ printf("| cplx MKL "); ++columns;
+#endif
+#ifdef HAVE_FFTW
+ printf("| cplx FFTW "); ++columns;
+#endif
+ printf("| cplx PFFFT |\n"); ++columns;
+ for (i=0; i < columns; ++i) {
+ printf("|-----------:");
+ }
+ printf("|\n");
+
+ for (i=0; Nvalues[i] > 0; ++i) {
+ printf("|%9d ", Nvalues[i]);
+ benchmark_ffts(Nvalues[i], 0);
+ //printf("| ");
+ benchmark_ffts(Nvalues[i], 1);
+ printf("|\n");
+ }
+ printf(" (numbers are given in MFlops)\n");
+ }
+
+
+ return 0;
+}