diff options
Diffstat (limited to 'src/3rdparty/pffft/test_pffft.c')
-rw-r--r-- | src/3rdparty/pffft/test_pffft.c | 463 |
1 files changed, 463 insertions, 0 deletions
diff --git a/src/3rdparty/pffft/test_pffft.c b/src/3rdparty/pffft/test_pffft.c new file mode 100644 index 000000000..b217b6bec --- /dev/null +++ b/src/3rdparty/pffft/test_pffft.c @@ -0,0 +1,463 @@ +/* + Copyright (c) 2013 Julien Pommier. + + Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, Intel MKL, and Apple vDSP + + How to build: + + on linux, with fftw3: + gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm + + on macos, without fftw3: + clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate + + on macos, with fftw3: + clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate + + on macos, with fftw3 and Intel MKL: + clang -o test_pffft -I /opt/intel/mkl/include -DHAVE_FFTW -DHAVE_VECLIB -DHAVE_MKL -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate /opt/intel/mkl/lib/libmkl_{intel_lp64,sequential,core}.a + + on windows, with visual c++: + cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c + + build without SIMD instructions: + gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm + + */ + +#include "pffft.h" +#include "fftpack.h" + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <assert.h> +#include <string.h> + +#ifdef HAVE_SYS_TIMES +# include <sys/times.h> +# include <unistd.h> +#endif + +#ifdef HAVE_VECLIB +# include <Accelerate/Accelerate.h> +#endif + +#ifdef HAVE_FFTW +# include <fftw3.h> +#endif + +#ifdef HAVE_MKL +# include <mkl_dfti.h> +#endif + +#define MAX_OF(x,y) ((x)>(y)?(x):(y)) + +double frand() { + return rand()/(double)RAND_MAX; +} + +#if defined(HAVE_SYS_TIMES) + inline double uclock_sec(void) { + static double ttclk = 0.; + if (ttclk == 0.) ttclk = sysconf(_SC_CLK_TCK); + struct tms t; return ((double)times(&t)) / ttclk; + } +# else + double uclock_sec(void) +{ return (double)clock()/(double)CLOCKS_PER_SEC; } +#endif + + +/* compare results with the regular fftpack */ +void pffft_validate_N(int N, int cplx) { + int Nfloat = N*(cplx?2:1); + int Nbytes = Nfloat * sizeof(float); + float *ref, *in, *out, *tmp, *tmp2; + PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL); + int pass; + + if (!s) { printf("Skipping N=%d, not supported\n", N); return; } + ref = pffft_aligned_malloc(Nbytes); + in = pffft_aligned_malloc(Nbytes); + out = pffft_aligned_malloc(Nbytes); + tmp = pffft_aligned_malloc(Nbytes); + tmp2 = pffft_aligned_malloc(Nbytes); + + for (pass=0; pass < 2; ++pass) { + float ref_max = 0; + int k; + //printf("N=%d pass=%d cplx=%d\n", N, pass, cplx); + // compute reference solution with FFTPACK + if (pass == 0) { + float *wrk = malloc(2*Nbytes+15*sizeof(float)); + for (k=0; k < Nfloat; ++k) { + ref[k] = in[k] = frand()*2-1; + out[k] = 1e30; + } + if (!cplx) { + rffti(N, wrk); + rfftf(N, ref, wrk); + // use our ordering for real ffts instead of the one of fftpack + { + float refN=ref[N-1]; + for (k=N-2; k >= 1; --k) ref[k+1] = ref[k]; + ref[1] = refN; + } + } else { + cffti(N, wrk); + cfftf(N, ref, wrk); + } + free(wrk); + } + + for (k = 0; k < Nfloat; ++k) ref_max = MAX_OF(ref_max, fabs(ref[k])); + + + // pass 0 : non canonical ordering of transform coefficients + if (pass == 0) { + // test forward transform, with different input / output + pffft_transform(s, in, tmp, 0, PFFFT_FORWARD); + memcpy(tmp2, tmp, Nbytes); + memcpy(tmp, in, Nbytes); + pffft_transform(s, tmp, tmp, 0, PFFFT_FORWARD); + for (k = 0; k < Nfloat; ++k) { + assert(tmp2[k] == tmp[k]); + } + + // test reordering + pffft_zreorder(s, tmp, out, PFFFT_FORWARD); + pffft_zreorder(s, out, tmp, PFFFT_BACKWARD); + for (k = 0; k < Nfloat; ++k) { + assert(tmp2[k] == tmp[k]); + } + pffft_zreorder(s, tmp, out, PFFFT_FORWARD); + } else { + // pass 1 : canonical ordering of transform coeffs. + pffft_transform_ordered(s, in, tmp, 0, PFFFT_FORWARD); + memcpy(tmp2, tmp, Nbytes); + memcpy(tmp, in, Nbytes); + pffft_transform_ordered(s, tmp, tmp, 0, PFFFT_FORWARD); + for (k = 0; k < Nfloat; ++k) { + assert(tmp2[k] == tmp[k]); + } + memcpy(out, tmp, Nbytes); + } + + { + for (k=0; k < Nfloat; ++k) { + if (!(fabs(ref[k] - out[k]) < 1e-3*ref_max)) { + printf("%s forward PFFFT mismatch found for N=%d\n", (cplx?"CPLX":"REAL"), N); + exit(1); + } + } + + if (pass == 0) pffft_transform(s, tmp, out, 0, PFFFT_BACKWARD); + else pffft_transform_ordered(s, tmp, out, 0, PFFFT_BACKWARD); + memcpy(tmp2, out, Nbytes); + memcpy(out, tmp, Nbytes); + if (pass == 0) pffft_transform(s, out, out, 0, PFFFT_BACKWARD); + else pffft_transform_ordered(s, out, out, 0, PFFFT_BACKWARD); + for (k = 0; k < Nfloat; ++k) { + assert(tmp2[k] == out[k]); + out[k] *= 1.f/N; + } + for (k = 0; k < Nfloat; ++k) { + if (fabs(in[k] - out[k]) > 1e-3 * ref_max) { + printf("pass=%d, %s IFFFT does not match for N=%d\n", pass, (cplx?"CPLX":"REAL"), N); break; + exit(1); + } + } + } + + // quick test of the circular convolution in fft domain + { + float conv_err = 0, conv_max = 0; + + pffft_zreorder(s, ref, tmp, PFFFT_FORWARD); + memset(out, 0, Nbytes); + pffft_zconvolve_accumulate(s, ref, ref, out, 1.0); + pffft_zreorder(s, out, tmp2, PFFFT_FORWARD); + + for (k=0; k < Nfloat; k += 2) { + float ar = tmp[k], ai=tmp[k+1]; + if (cplx || k > 0) { + tmp[k] = ar*ar - ai*ai; + tmp[k+1] = 2*ar*ai; + } else { + tmp[0] = ar*ar; + tmp[1] = ai*ai; + } + } + + for (k=0; k < Nfloat; ++k) { + float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]); + if (d > conv_err) conv_err = d; + if (e > conv_max) conv_max = e; + } + if (conv_err > 1e-5*conv_max) { + printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1); + } + } + + } + + printf("%s PFFFT is OK for N=%d\n", (cplx?"CPLX":"REAL"), N); fflush(stdout); + + pffft_destroy_setup(s); + pffft_aligned_free(ref); + pffft_aligned_free(in); + pffft_aligned_free(out); + pffft_aligned_free(tmp); + pffft_aligned_free(tmp2); +} + +void pffft_validate(int cplx) { + static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0}; + int k; + for (k = 0; Ntest[k]; ++k) { + int N = Ntest[k]; + if (N == 16 && !cplx) continue; + pffft_validate_N(N, cplx); + } +} + +int array_output_format = 1; + +void show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter) { + float mflops = flops/1e6/(t1 - t0 + 1e-16); + if (array_output_format) { + if (flops != -1) { + printf("|%9.0f ", mflops); + } else printf("| n/a "); + } else { + if (flops != -1) { + printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter); + } + } + fflush(stdout); +} + +void benchmark_ffts(int N, int cplx) { + int Nfloat = (cplx ? N*2 : N); + int Nbytes = Nfloat * sizeof(float); + float *X = pffft_aligned_malloc(Nbytes), *Y = pffft_aligned_malloc(Nbytes), *Z = pffft_aligned_malloc(Nbytes); + + double t0, t1, flops; + + int k; + int max_iter = 5120000/N*4; +#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__) + max_iter /= 8; +#endif + int iter; + + for (k = 0; k < Nfloat; ++k) { + X[k] = 0; //sqrtf(k+1); + } + + // FFTPack benchmark + { + float *wrk = malloc(2*Nbytes + 15*sizeof(float)); + int max_iter_ = max_iter/pffft_simd_size(); if (max_iter_ == 0) max_iter_ = 1; + if (cplx) cffti(N, wrk); + else rffti(N, wrk); + t0 = uclock_sec(); + + for (iter = 0; iter < max_iter_; ++iter) { + if (cplx) { + cfftf(N, X, wrk); + cfftb(N, X, wrk); + } else { + rfftf(N, X, wrk); + rfftb(N, X, wrk); + } + } + t1 = uclock_sec(); + free(wrk); + + flops = (max_iter_*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html + show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_); + } + +#ifdef HAVE_VECLIB + int log2N = (int)(log(N)/log(2) + 0.5f); + if (N == (1<<log2N)) { + FFTSetup setup; + + setup = vDSP_create_fftsetup(log2N, FFT_RADIX2); + DSPSplitComplex zsamples; + zsamples.realp = &X[0]; + zsamples.imagp = &X[Nfloat/2]; + t0 = uclock_sec(); + for (iter = 0; iter < max_iter; ++iter) { + if (cplx) { + vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Forward); + vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse); + } else { + vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Forward); + vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse); + } + } + t1 = uclock_sec(); + vDSP_destroy_fftsetup(setup); + + flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html + show_output("vDSP", N, cplx, flops, t0, t1, max_iter); + } else { + show_output("vDSP", N, cplx, -1, -1, -1, -1); + } +#endif + +#ifdef HAVE_MKL + { + DFTI_DESCRIPTOR_HANDLE fft_handle; + if (DftiCreateDescriptor(&fft_handle, DFTI_SINGLE, (cplx ? DFTI_COMPLEX : DFTI_REAL), 1, N) == 0) { + assert(DftiSetValue(fft_handle, DFTI_PLACEMENT, DFTI_INPLACE) == 0); + assert(DftiCommitDescriptor(fft_handle) == 0); + + t0 = uclock_sec(); + for (iter = 0; iter < max_iter; ++iter) { + DftiComputeForward(fft_handle, &X[0]); + DftiComputeBackward(fft_handle, &X[0]); + } + t1 = uclock_sec(); + + + DftiFreeDescriptor(&fft_handle); + flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html + show_output("MKL ", N, cplx, flops, t0, t1, max_iter); + } else { + show_output(" MKL", N, cplx, -1, -1, -1, -1); + } + } +#endif + +#ifdef HAVE_FFTW + { + fftwf_plan planf, planb; + fftw_complex *in = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N); + fftw_complex *out = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N); + memset(in, 0, sizeof(fftw_complex) * N); + int flags = (N < 40000 ? FFTW_MEASURE : FFTW_ESTIMATE); // measure takes a lot of time on largest ffts + //int flags = FFTW_ESTIMATE; + if (cplx) { + planf = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_FORWARD, flags); + planb = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_BACKWARD, flags); + } else { + planf = fftwf_plan_dft_r2c_1d(N, (float*)in, (fftwf_complex*)out, flags); + planb = fftwf_plan_dft_c2r_1d(N, (fftwf_complex*)in, (float*)out, flags); + } + + t0 = uclock_sec(); + for (iter = 0; iter < max_iter; ++iter) { + fftwf_execute(planf); + fftwf_execute(planb); + } + t1 = uclock_sec(); + + fftwf_destroy_plan(planf); + fftwf_destroy_plan(planb); + fftwf_free(in); fftwf_free(out); + + flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html + show_output((flags == FFTW_MEASURE ? "FFTW (meas.)" : " FFTW (estim)"), N, cplx, flops, t0, t1, max_iter); + } +#endif + + // PFFFT benchmark + { + PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL); + if (s) { + t0 = uclock_sec(); + for (iter = 0; iter < max_iter; ++iter) { + pffft_transform(s, X, Z, Y, PFFFT_FORWARD); + pffft_transform(s, X, Z, Y, PFFFT_BACKWARD); + } + t1 = uclock_sec(); + pffft_destroy_setup(s); + + flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html + show_output("PFFFT", N, cplx, flops, t0, t1, max_iter); + } + } + + if (!array_output_format) { + printf("--\n"); + } + + pffft_aligned_free(X); + pffft_aligned_free(Y); + pffft_aligned_free(Z); +} + +#ifndef PFFFT_SIMD_DISABLE +void validate_pffft_simd(); // a small function inside pffft.c that will detect compiler bugs with respect to simd instruction +#endif + +int main(int argc, char **argv) { + int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 }; + int i; + + if (argc > 1 && strcmp(argv[1], "--no-array-format") == 0) { + array_output_format = 0; + } + +#ifndef PFFFT_SIMD_DISABLE + validate_pffft_simd(); +#endif + pffft_validate(1); + pffft_validate(0); + if (!array_output_format) { + // display a nice markdown array + for (i=0; Nvalues[i] > 0; ++i) { + benchmark_ffts(Nvalues[i], 0 /* real fft */); + } + for (i=0; Nvalues[i] > 0; ++i) { + benchmark_ffts(Nvalues[i], 1 /* cplx fft */); + } + } else { + int columns = 0; + printf("| input len "); ++columns; + printf("|real FFTPack"); ++columns; +#ifdef HAVE_VECLIB + printf("| real vDSP "); ++columns; +#endif +#ifdef HAVE_MKL + printf("| real MKL "); ++columns; +#endif +#ifdef HAVE_FFTW + printf("| real FFTW "); ++columns; +#endif + printf("| real PFFFT "); ++columns; + + printf("|cplx FFTPack"); ++columns; +#ifdef HAVE_VECLIB + printf("| cplx vDSP "); ++columns; +#endif +#ifdef HAVE_MKL + printf("| cplx MKL "); ++columns; +#endif +#ifdef HAVE_FFTW + printf("| cplx FFTW "); ++columns; +#endif + printf("| cplx PFFFT |\n"); ++columns; + for (i=0; i < columns; ++i) { + printf("|-----------:"); + } + printf("|\n"); + + for (i=0; Nvalues[i] > 0; ++i) { + printf("|%9d ", Nvalues[i]); + benchmark_ffts(Nvalues[i], 0); + //printf("| "); + benchmark_ffts(Nvalues[i], 1); + printf("|\n"); + } + printf(" (numbers are given in MFlops)\n"); + } + + + return 0; +} |