summaryrefslogtreecommitdiffstats
path: root/lib/ffts/src/ffts_real.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/ffts/src/ffts_real.c')
-rw-r--r--lib/ffts/src/ffts_real.c218
1 files changed, 183 insertions, 35 deletions
diff --git a/lib/ffts/src/ffts_real.c b/lib/ffts/src/ffts_real.c
index 0f87a12..e0f0e1f 100644
--- a/lib/ffts/src/ffts_real.c
+++ b/lib/ffts/src/ffts_real.c
@@ -4,7 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
-Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+Copyright (c) 2015 - 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
@@ -33,6 +33,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_real.h"
+#include "ffts_cpu.h"
#include "ffts_internal.h"
#include "ffts_trig.h"
@@ -46,7 +47,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <pmmintrin.h>
#elif HAVE_INTRIN_H
#include <intrin.h>
-#else
+#endif
+
/* avoid using negative zero as some configurations have problems with those */
static const FFTS_ALIGN(16) unsigned int sign_mask_even[4] = {
0x80000000, 0, 0x80000000, 0
@@ -55,7 +57,6 @@ static const FFTS_ALIGN(16) unsigned int sign_mask_odd[4] = {
0, 0x80000000, 0, 0x80000000
};
#endif
-#endif
static void
ffts_free_1d_real(ffts_plan_t *p)
@@ -79,8 +80,9 @@ ffts_free_1d_real(ffts_plan_t *p)
free(p);
}
+#ifdef __ARM_NEON__
static void
-ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
+ffts_execute_1d_real_neon(ffts_plan_t *p, const void *input, void *output)
{
float *const FFTS_RESTRICT out =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
@@ -91,25 +93,19 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
const float *const FFTS_RESTRICT B =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
const int N = (const int) p->N;
- int i;
-
-#ifdef __ARM_NEON__
float *p_buf0 = buf;
float *p_buf1 = buf + N - 2;
float *p_out = out;
-#endif
+ int i;
/* we know this */
FFTS_ASSUME(N/2 > 0);
p->plans[0]->transform(p->plans[0], input, buf);
-#ifndef HAVE_SSE
buf[N + 0] = buf[0];
buf[N + 1] = buf[1];
-#endif
-#ifdef __ARM_NEON__
for (i = 0; i < N; i += 4) {
__asm__ __volatile__ (
"vld1.32 {q8}, [%[pa]]!\n\t"
@@ -151,7 +147,35 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#elif HAVE_SSE3
+
+ out[N + 0] = buf[0] - buf[1];
+ out[N + 1] = 0.0f;
+}
+#endif
+
+#if HAVE_SSE3
+static void
+ffts_execute_1d_real_sse3(ffts_plan_t *p, const void *input, void *output)
+{
+ float *const FFTS_RESTRICT out =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+ float *const FFTS_RESTRICT buf =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+ const float *const FFTS_RESTRICT A =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+ const float *const FFTS_RESTRICT B =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+ const int N = (const int) p->N;
+ int i;
+
+ /* we know this */
+ FFTS_ASSUME(N/2 > 0);
+
+ p->plans[0]->transform(p->plans[0], input, buf);
+
+ buf[N + 0] = buf[0];
+ buf[N + 1] = buf[1];
+
if (FFTS_UNLIKELY(N <= 8)) {
__m128 t0 = _mm_load_ps(buf);
__m128 t1 = _mm_load_ps(buf + N - 4);
@@ -235,7 +259,32 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
}
}
-#elif HAVE_SSE
+
+ out[N + 0] = buf[0] - buf[1];
+ out[N + 1] = 0.0f;
+}
+#endif
+
+#ifdef HAVE_SSE
+static void
+ffts_execute_1d_real_sse(ffts_plan_t *p, const void *input, void *output)
+{
+ float *const FFTS_RESTRICT out =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+ float *const FFTS_RESTRICT buf =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+ const float *const FFTS_RESTRICT A =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+ const float *const FFTS_RESTRICT B =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+ const int N = (const int) p->N;
+ int i;
+
+ /* we know this */
+ FFTS_ASSUME(N/2 > 0);
+
+ p->plans[0]->transform(p->plans[0], input, buf);
+
if (FFTS_UNLIKELY(N <= 8)) {
__m128 c0 = _mm_load_ps((const float*) sign_mask_even);
__m128 t0 = _mm_load_ps(buf);
@@ -327,7 +376,34 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
_MM_SHUFFLE(2,3,0,1)))));
}
}
-#else
+
+ out[N + 0] = buf[0] - buf[1];
+ out[N + 1] = 0.0f;
+}
+#endif
+
+static void
+ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
+{
+ float *const FFTS_RESTRICT out =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+ float *const FFTS_RESTRICT buf =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+ const float *const FFTS_RESTRICT A =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+ const float *const FFTS_RESTRICT B =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+ const int N = (const int) p->N;
+ int i;
+
+ /* we know this */
+ FFTS_ASSUME(N/2 > 0);
+
+ p->plans[0]->transform(p->plans[0], input, buf);
+
+ buf[N + 0] = buf[0];
+ buf[N + 1] = buf[1];
+
for (i = 0; i < N/2; i++) {
out[2*i + 0] =
buf[ 2*i + 0] * A[2*i + 0] - buf[ 2*i + 1] * A[2*i + 1] +
@@ -336,14 +412,14 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
buf[ 2*i + 1] * A[2*i + 0] + buf[ 2*i + 0] * A[2*i + 1] +
buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0];
}
-#endif
out[N + 0] = buf[0] - buf[1];
out[N + 1] = 0.0f;
}
+#ifdef __ARM_NEON__
static void
-ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
+ffts_execute_1d_real_inv_neon(ffts_plan_t *p, const void *input, void *output)
{
float *const FFTS_RESTRICT in =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
@@ -354,18 +430,14 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
const float *const FFTS_RESTRICT B =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
const int N = (const int) p->N;
- int i;
-
-#ifdef __ARM_NEON__
float *p_buf0 = in;
float *p_buf1 = in + N - 2;
float *p_out = buf;
-#endif
+ int i;
/* we know this */
FFTS_ASSUME(N/2 > 0);
-#ifdef __ARM_NEON__
for (i = 0; i < N/2; i += 2) {
__asm__ __volatile__ (
"vld1.32 {q8}, [%[pa]]!\n\t"
@@ -407,7 +479,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#elif HAVE_SSE3
+
+ p->plans[0]->transform(p->plans[0], buf, output);
+}
+#endif
+
+#if HAVE_SSE3
+static void
+ffts_execute_1d_real_inv_sse3(ffts_plan_t *p, const void *input, void *output)
+{
+ float *const FFTS_RESTRICT in =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+ float *const FFTS_RESTRICT buf =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+ const float *const FFTS_RESTRICT A =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+ const float *const FFTS_RESTRICT B =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+ const int N = (const int) p->N;
+ int i;
+
+ /* we know this */
+ FFTS_ASSUME(N/2 > 0);
+
if (FFTS_UNLIKELY(N <= 8)) {
__m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
__m128 t1 = _mm_load_ps(in);
@@ -492,7 +586,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
}
}
-#elif HAVE_SSE
+
+ p->plans[0]->transform(p->plans[0], buf, output);
+}
+#endif
+
+#if HAVE_SSE
+static void
+ffts_execute_1d_real_inv_sse(ffts_plan_t *p, const void *input, void *output)
+{
+ float *const FFTS_RESTRICT in =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+ float *const FFTS_RESTRICT buf =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+ const float *const FFTS_RESTRICT A =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+ const float *const FFTS_RESTRICT B =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+ const int N = (const int) p->N;
+ int i;
+
+ /* we know this */
+ FFTS_ASSUME(N/2 > 0);
+
if (FFTS_UNLIKELY(N <= 8)) {
__m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
__m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
@@ -585,7 +701,28 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
_mm_xor_ps(t4, c0))));
}
}
-#else
+
+ p->plans[0]->transform(p->plans[0], buf, output);
+}
+#endif
+
+static void
+ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
+{
+ float *const FFTS_RESTRICT in =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+ float *const FFTS_RESTRICT buf =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+ const float *const FFTS_RESTRICT A =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+ const float *const FFTS_RESTRICT B =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+ const int N = (const int) p->N;
+ int i;
+
+ /* we know this */
+ FFTS_ASSUME(N/2 > 0);
+
for (i = 0; i < N/2; i++) {
buf[2*i + 0] =
in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] +
@@ -594,7 +731,6 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] -
in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0];
}
-#endif
p->plans[0]->transform(p->plans[0], buf, output);
}
@@ -602,18 +738,35 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
FFTS_API ffts_plan_t*
ffts_init_1d_real(size_t N, int sign)
{
+#ifndef __ARM_NEON__
+ int cpu_flags = ffts_cpu_detect(NULL);
+#endif
ffts_plan_t *p;
+ int invert = 0;
p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans));
if (!p) {
return NULL;
}
- if (sign < 0) {
- p->transform = &ffts_execute_1d_real;
- } else {
- p->transform = &ffts_execute_1d_real_inv;
+#ifdef __ARM_NEON__
+ p->transform = (sign < 0) ? &ffts_execute_1d_real_neon : &ffts_execute_1d_real_inv;
+#else
+#ifdef HAVE_SSE3
+ if (cpu_flags & FFTS_CPU_X86_SSE3) {
+ p->transform = (sign < 0) ? &ffts_execute_1d_real_sse3 : &ffts_execute_1d_real_inv_sse3;
+ invert = 1;
+ } else
+#endif
+#ifdef HAVE_SSE
+ if (cpu_flags & FFTS_CPU_X86_SSE) {
+ p->transform = (sign < 0) ? &ffts_execute_1d_real_sse : &ffts_execute_1d_real_inv_sse;
+ } else
+#endif
+ {
+ p->transform = (sign < 0) ? &ffts_execute_1d_real : &ffts_execute_1d_real_inv;
}
+#endif
p->destroy = &ffts_free_1d_real;
p->N = N;
@@ -640,12 +793,7 @@ ffts_init_1d_real(size_t N, int sign)
goto cleanup;
}
-#ifdef HAVE_SSE3
- ffts_generate_table_1d_real_32f(p, sign, 1);
-#else
- ffts_generate_table_1d_real_32f(p, sign, 0);
-#endif
-
+ ffts_generate_table_1d_real_32f(p, sign, invert);
return p;
cleanup: