diff options
Diffstat (limited to 'lib/ffts/src/ffts_real.c')
-rw-r--r-- | lib/ffts/src/ffts_real.c | 218 |
1 files changed, 183 insertions, 35 deletions
diff --git a/lib/ffts/src/ffts_real.c b/lib/ffts/src/ffts_real.c index 0f87a12..e0f0e1f 100644 --- a/lib/ffts/src/ffts_real.c +++ b/lib/ffts/src/ffts_real.c @@ -4,7 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> Copyright (c) 2012, The University of Waikato -Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi> +Copyright (c) 2015 - 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi> All rights reserved. @@ -33,6 +33,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "ffts_real.h" +#include "ffts_cpu.h" #include "ffts_internal.h" #include "ffts_trig.h" @@ -46,7 +47,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include <pmmintrin.h> #elif HAVE_INTRIN_H #include <intrin.h> -#else +#endif + /* avoid using negative zero as some configurations have problems with those */ static const FFTS_ALIGN(16) unsigned int sign_mask_even[4] = { 0x80000000, 0, 0x80000000, 0 @@ -55,7 +57,6 @@ static const FFTS_ALIGN(16) unsigned int sign_mask_odd[4] = { 0, 0x80000000, 0, 0x80000000 }; #endif -#endif static void ffts_free_1d_real(ffts_plan_t *p) @@ -79,8 +80,9 @@ ffts_free_1d_real(ffts_plan_t *p) free(p); } +#ifdef __ARM_NEON__ static void -ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) +ffts_execute_1d_real_neon(ffts_plan_t *p, const void *input, void *output) { float *const FFTS_RESTRICT out = (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output); @@ -91,25 +93,19 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) const float *const FFTS_RESTRICT B = (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); const int N = (const int) p->N; - int i; - -#ifdef __ARM_NEON__ float *p_buf0 = buf; float *p_buf1 = buf + N - 2; float *p_out = out; -#endif + int i; /* we know this */ FFTS_ASSUME(N/2 > 0); p->plans[0]->transform(p->plans[0], input, buf); -#ifndef HAVE_SSE buf[N + 0] = buf[0]; buf[N + 1] = buf[1]; -#endif -#ifdef __ARM_NEON__ for (i = 0; i < N; i += 4) { __asm__ __volatile__ ( "vld1.32 {q8}, [%[pa]]!\n\t" @@ -151,7 +147,35 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -#elif HAVE_SSE3 + + out[N + 0] = buf[0] - buf[1]; + out[N + 1] = 0.0f; +} +#endif + +#if HAVE_SSE3 +static void +ffts_execute_1d_real_sse3(ffts_plan_t *p, const void *input, void *output) +{ + float *const FFTS_RESTRICT out = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; + + /* we know this */ + FFTS_ASSUME(N/2 > 0); + + p->plans[0]->transform(p->plans[0], input, buf); + + buf[N + 0] = buf[0]; + buf[N + 1] = buf[1]; + if (FFTS_UNLIKELY(N <= 8)) { __m128 t0 = _mm_load_ps(buf); __m128 t1 = _mm_load_ps(buf + N - 4); @@ -235,7 +259,32 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)))); } } -#elif HAVE_SSE + + out[N + 0] = buf[0] - buf[1]; + out[N + 1] = 0.0f; +} +#endif + +#ifdef HAVE_SSE +static void +ffts_execute_1d_real_sse(ffts_plan_t *p, const void *input, void *output) +{ + float *const FFTS_RESTRICT out = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; + + /* we know this */ + FFTS_ASSUME(N/2 > 0); + + p->plans[0]->transform(p->plans[0], input, buf); + if (FFTS_UNLIKELY(N <= 8)) { __m128 c0 = _mm_load_ps((const float*) sign_mask_even); __m128 t0 = _mm_load_ps(buf); @@ -327,7 +376,34 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) _MM_SHUFFLE(2,3,0,1))))); } } -#else + + out[N + 0] = buf[0] - buf[1]; + out[N + 1] = 0.0f; +} +#endif + +static void +ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) +{ + float *const FFTS_RESTRICT out = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; + + /* we know this */ + FFTS_ASSUME(N/2 > 0); + + p->plans[0]->transform(p->plans[0], input, buf); + + buf[N + 0] = buf[0]; + buf[N + 1] = buf[1]; + for (i = 0; i < N/2; i++) { out[2*i + 0] = buf[ 2*i + 0] * A[2*i + 0] - buf[ 2*i + 1] * A[2*i + 1] + @@ -336,14 +412,14 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) buf[ 2*i + 1] * A[2*i + 0] + buf[ 2*i + 0] * A[2*i + 1] + buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0]; } -#endif out[N + 0] = buf[0] - buf[1]; out[N + 1] = 0.0f; } +#ifdef __ARM_NEON__ static void -ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) +ffts_execute_1d_real_inv_neon(ffts_plan_t *p, const void *input, void *output) { float *const FFTS_RESTRICT in = (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input); @@ -354,18 +430,14 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) const float *const FFTS_RESTRICT B = (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); const int N = (const int) p->N; - int i; - -#ifdef __ARM_NEON__ float *p_buf0 = in; float *p_buf1 = in + N - 2; float *p_out = buf; -#endif + int i; /* we know this */ FFTS_ASSUME(N/2 > 0); -#ifdef __ARM_NEON__ for (i = 0; i < N/2; i += 2) { __asm__ __volatile__ ( "vld1.32 {q8}, [%[pa]]!\n\t" @@ -407,7 +479,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -#elif HAVE_SSE3 + + p->plans[0]->transform(p->plans[0], buf, output); +} +#endif + +#if HAVE_SSE3 +static void +ffts_execute_1d_real_inv_sse3(ffts_plan_t *p, const void *input, void *output) +{ + float *const FFTS_RESTRICT in = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; + + /* we know this */ + FFTS_ASSUME(N/2 > 0); + if (FFTS_UNLIKELY(N <= 8)) { __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); __m128 t1 = _mm_load_ps(in); @@ -492,7 +586,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)))); } } -#elif HAVE_SSE + + p->plans[0]->transform(p->plans[0], buf, output); +} +#endif + +#if HAVE_SSE +static void +ffts_execute_1d_real_inv_sse(ffts_plan_t *p, const void *input, void *output) +{ + float *const FFTS_RESTRICT in = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; + + /* we know this */ + FFTS_ASSUME(N/2 > 0); + if (FFTS_UNLIKELY(N <= 8)) { __m128 c0 = _mm_load_ps((const float*) sign_mask_odd); __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); @@ -585,7 +701,28 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) _mm_xor_ps(t4, c0)))); } } -#else + + p->plans[0]->transform(p->plans[0], buf, output); +} +#endif + +static void +ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) +{ + float *const FFTS_RESTRICT in = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; + + /* we know this */ + FFTS_ASSUME(N/2 > 0); + for (i = 0; i < N/2; i++) { buf[2*i + 0] = in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] + @@ -594,7 +731,6 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] - in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0]; } -#endif p->plans[0]->transform(p->plans[0], buf, output); } @@ -602,18 +738,35 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) FFTS_API ffts_plan_t* ffts_init_1d_real(size_t N, int sign) { +#ifndef __ARM_NEON__ + int cpu_flags = ffts_cpu_detect(NULL); +#endif ffts_plan_t *p; + int invert = 0; p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans)); if (!p) { return NULL; } - if (sign < 0) { - p->transform = &ffts_execute_1d_real; - } else { - p->transform = &ffts_execute_1d_real_inv; +#ifdef __ARM_NEON__ + p->transform = (sign < 0) ? &ffts_execute_1d_real_neon : &ffts_execute_1d_real_inv; +#else +#ifdef HAVE_SSE3 + if (cpu_flags & FFTS_CPU_X86_SSE3) { + p->transform = (sign < 0) ? &ffts_execute_1d_real_sse3 : &ffts_execute_1d_real_inv_sse3; + invert = 1; + } else +#endif +#ifdef HAVE_SSE + if (cpu_flags & FFTS_CPU_X86_SSE) { + p->transform = (sign < 0) ? &ffts_execute_1d_real_sse : &ffts_execute_1d_real_inv_sse; + } else +#endif + { + p->transform = (sign < 0) ? &ffts_execute_1d_real : &ffts_execute_1d_real_inv; } +#endif p->destroy = &ffts_free_1d_real; p->N = N; @@ -640,12 +793,7 @@ ffts_init_1d_real(size_t N, int sign) goto cleanup; } -#ifdef HAVE_SSE3 - ffts_generate_table_1d_real_32f(p, sign, 1); -#else - ffts_generate_table_1d_real_32f(p, sign, 0); -#endif - + ffts_generate_table_1d_real_32f(p, sign, invert); return p; cleanup: |