From 2ef6dba8728db2437def9a4fc1d3e20e0aa44c31 Mon Sep 17 00:00:00 2001
From: Timothy Pearson <tpearson@raptorengineering.com>
Date: Sat, 24 Aug 2024 13:04:45 -0500
Subject: Revup FFTS to latest upstream version

Taken from https://github.com/linkotec/ffts

Fixes ppc64el support and a handful of other bugs
---
 lib/ffts/CMakeLists.txt       |  158 +++++-
 lib/ffts/config.guess         |  463 ++++++++++--------
 lib/ffts/config.sub           |  139 +++---
 lib/ffts/ffts.pc.cmake.in     |    6 +-
 lib/ffts/include/ffts.h       |    4 +
 lib/ffts/src/Makefile.am      |    4 +-
 lib/ffts/src/codegen.c        |    4 +-
 lib/ffts/src/codegen_sse.h    |   64 +--
 lib/ffts/src/ffts.c           |  292 +++++++++++-
 lib/ffts/src/ffts_chirp_z.c   |  225 +++++++++
 lib/ffts/src/ffts_chirp_z.h   |   45 ++
 lib/ffts/src/ffts_cpu.c       |  371 +++++++++++++++
 lib/ffts/src/ffts_cpu.h       |   54 +++
 lib/ffts/src/ffts_internal.h  |  123 ++++-
 lib/ffts/src/ffts_real.c      |  218 +++++++--
 lib/ffts/src/ffts_static.c    |  586 +++++++++++++++++++++--
 lib/ffts/src/ffts_static.h    |   24 +
 lib/ffts/src/ffts_trig.c      | 1057 +++++++++++++++++++++++++++++++++--------
 lib/ffts/src/ffts_trig.h      |   12 +-
 lib/ffts/src/macros-alpha.h   |    3 -
 lib/ffts/src/macros-altivec.h |   77 ++-
 lib/ffts/src/macros-neon.h    |    3 -
 lib/ffts/src/macros-sse.h     |  223 ++++++++-
 lib/ffts/src/macros.h         |  172 ++++++-
 24 files changed, 3620 insertions(+), 707 deletions(-)
 create mode 100644 lib/ffts/src/ffts_chirp_z.c
 create mode 100644 lib/ffts/src/ffts_chirp_z.h
 create mode 100644 lib/ffts/src/ffts_cpu.c
 create mode 100644 lib/ffts/src/ffts_cpu.h

diff --git a/lib/ffts/CMakeLists.txt b/lib/ffts/CMakeLists.txt
index 459655e..748f412 100644
--- a/lib/ffts/CMakeLists.txt
+++ b/lib/ffts/CMakeLists.txt
@@ -7,7 +7,7 @@ set(FFTS_MAJOR 0)
 set(FFTS_MINOR 9)
 set(FFTS_MICRO 0)
 
-set(FFTS_VERSION "ffts-${FFTS_MAJOR}.${FFTS_MINOR}.${FFTS_MICRO}")
+set(FFTS_VERSION "${FFTS_MAJOR}.${FFTS_MINOR}.${FFTS_MICRO}")
 
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
@@ -22,6 +22,16 @@ set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/include/ffts)
 set(LIB_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib)
 
 # common options
+
+# !!!! FOR TESTING ONLY !!!!
+option(ENABLE_AVX
+  "Enables AVX instructions." OFF
+)
+# !!!! FOR TESTING ONLY !!!!
+option(ENABLE_DOUBLE
+  "Enables double precision" OFF
+)
+
 option(ENABLE_NEON
   "Enables the use of NEON instructions." OFF
 )
@@ -48,24 +58,36 @@ option(ENABLE_STATIC
 
 include(CheckCSourceCompiles)
 include(CheckCSourceRuns)
+include(CheckFunctionExists)
 include(CheckIncludeFile)
+include(CheckSymbolExists)
 
 # Ensure defined when building FFTS (as opposed to using it from
 # another project). Used to export functions from Windows DLL.
 add_definitions(-DFFTS_BUILD)
 
 # check existence of various headers
-check_include_file(malloc.h   HAVE_MALLOC_H)
-check_include_file(stdint.h   HAVE_STDINT_H)
-check_include_file(stdlib.h   HAVE_STDLIB_H)
-check_include_file(string.h   HAVE_STRING_H)
-check_include_file(sys/mman.h HAVE_SYS_MMAN_H)
-check_include_file(unistd.h   HAVE_UNISTD_H)
+check_include_file(inttypes.h  HAVE_INTTYPES_H)
+check_include_file(malloc.h    HAVE_MALLOC_H)
+check_include_file(mm_malloc.h HAVE_MM_MALLOC_H)
+check_include_file(stdint.h    HAVE_STDINT_H)
+check_include_file(stdlib.h    HAVE_STDLIB_H)
+check_include_file(string.h    HAVE_STRING_H)
+check_include_file(sys/mman.h  HAVE_SYS_MMAN_H)
+check_include_file(unistd.h    HAVE_UNISTD_H)
+
+if(HAVE_INTTYPES_H)
+  add_definitions(-DHAVE_INTTYPES_H)
+endif(HAVE_INTTYPES_H)
 
 if(HAVE_MALLOC_H)
   add_definitions(-DHAVE_MALLOC_H)
 endif(HAVE_MALLOC_H)
 
+if(HAVE_MM_MALLOC_H)
+  add_definitions(-DHAVE_MM_MALLOC_H)
+endif(HAVE_MM_MALLOC_H)
+
 if(HAVE_STDINT_H)
   add_definitions(-DHAVE_STDINT_H)
 endif(HAVE_STDINT_H)
@@ -86,6 +108,50 @@ if(HAVE_UNISTD_H)
   add_definitions(-DHAVE_UNISTD_H)
 endif(HAVE_UNISTD_H)
 
+# check existence of various declarations
+check_symbol_exists(memalign       malloc.h HAVE_DECL_MEMALIGN)
+check_symbol_exists(posix_memalign stdlib.h HAVE_DECL_POSIX_MEMALIGN)
+check_symbol_exists(valloc         stdlib.h HAVE_DECL_VALLOC)
+check_symbol_exists(_mm_malloc     malloc.h HAVE_DECL__MM_MALLOC)
+
+if(HAVE_DECL_MEMALIGN)
+  add_definitions(-DHAVE_DECL_MEMALIGN)
+endif(HAVE_DECL_MEMALIGN)
+
+if(HAVE_DECL_POSIX_MEMALIGN)
+  add_definitions(-DHAVE_DECL_POSIX_MEMALIGN)
+endif(HAVE_DECL_POSIX_MEMALIGN)
+
+if(HAVE_DECL_VALLOC)
+  add_definitions(-DHAVE_DECL_VALLOC)
+endif(HAVE_DECL_VALLOC)
+
+if(HAVE_DECL__MM_MALLOC)
+  add_definitions(-DHAVE_DECL__MM_MALLOC)
+endif(HAVE_DECL__MM_MALLOC)
+
+# check existence of various functions
+check_function_exists(memalign       HAVE_MEMALIGN)
+check_function_exists(posix_memalign HAVE_POSIX_MEMALIGN)
+check_function_exists(valloc         HAVE_VALLOC)
+check_function_exists(_mm_malloc     HAVE__MM_MALLOC)
+
+if(HAVE_MEMALIGN)
+  add_definitions(-DHAVE_MEMALIGN)
+endif(HAVE_MEMALIGN)
+
+if(HAVE_POSIX_MEMALIGN)
+  add_definitions(-DHAVE_POSIX_MEMALIGN)
+endif(HAVE_POSIX_MEMALIGN)
+
+if(HAVE_VALLOC)
+  add_definitions(-DHAVE_VALLOC)
+endif(HAVE_VALLOC)
+
+if(HAVE__MM_MALLOC)
+  add_definitions(-DHAVE__MM_MALLOC)
+endif(HAVE__MM_MALLOC)
+
 # backup flags
 set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
 
@@ -246,6 +312,14 @@ if(NOT CMAKE_CROSSCOMPILING)
     if(HAVE_XMMINTRIN_H)
       add_definitions(-DHAVE_SSE)
       set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+
+      # TODO: not the right place; ENABLE_AVX/ENABLE_DOUBLE only take effect inside this HAVE_XMMINTRIN_H branch
+      if(ENABLE_AVX)
+        add_definitions(-DHAVE_AVX)
+      endif(ENABLE_AVX)
+      if(ENABLE_DOUBLE)
+        add_definitions(-DFFTS_DOUBLE)
+      endif(ENABLE_DOUBLE)
     endif(HAVE_XMMINTRIN_H)
 
     # enable SSE2 code generation
@@ -351,6 +425,10 @@ set(FFTS_HEADERS
 set(FFTS_SOURCES
   src/ffts_attributes.h
   src/ffts.c
+  src/ffts_chirp_z.c
+  src/ffts_chirp_z.h
+  src/ffts_cpu.c
+  src/ffts_cpu.h
   src/ffts_internal.h
   src/ffts_nd.c
   src/ffts_nd.h
@@ -369,6 +447,17 @@ set(FFTS_SOURCES
   src/types.h
 )
 
+if(NOT DISABLE_DYNAMIC_CODE)  # the runtime code generator only supports x64 targets
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" AND CMAKE_SIZEOF_VOID_P EQUAL 8)  # name match alone misses 32-bit targets built on a 64-bit host
+    list(APPEND FFTS_SOURCES
+      src/codegen_sse.h
+    )
+  else()
+    message(WARNING "Dynamic code is only supported with x64, disabling dynamic code.")
+    set(DISABLE_DYNAMIC_CODE ON)
+  endif()
+endif(NOT DISABLE_DYNAMIC_CODE)
+
 if(ENABLE_NEON)
   list(APPEND FFTS_SOURCES
     src/neon.s
@@ -393,19 +482,9 @@ elseif(HAVE_XMMINTRIN_H)
   add_definitions(-DHAVE_SSE)
 
   list(APPEND FFTS_SOURCES
+    src/macros-avx.h
     src/macros-sse.h
   )
-
-  if(NOT DISABLE_DYNAMIC_CODE)
-    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-      list(APPEND FFTS_SOURCES
-        src/codegen_sse.h
-      )
-    else()
-      message(WARNING "Dynamic code is only supported with x64, disabling dynamic code.")
-      set(DISABLE_DYNAMIC_CODE ON)
-    endif(CMAKE_SIZEOF_VOID_P EQUAL 8)
-  endif(NOT DISABLE_DYNAMIC_CODE)
 endif(ENABLE_NEON)
 
 if(DISABLE_DYNAMIC_CODE)
@@ -452,6 +531,41 @@ if(ENABLE_STATIC)
 endif(ENABLE_STATIC)
 
 if(ENABLE_STATIC OR ENABLE_SHARED)
+  find_path(MPFR_INCLUDES
+    NAMES mpfr.h
+    PATHS ${INCLUDE_INSTALL_DIR}
+  )
+  find_library(MPFR_LIBRARIES mpfr PATHS ${LIB_INSTALL_DIR})
+  find_package(OpenMP)
+
+  if(MPFR_INCLUDES)
+    add_definitions(-DHAVE_MPFR_H)
+    include_directories(${MPFR_INCLUDES})
+  endif(MPFR_INCLUDES)
+
+  add_executable(ffts_trig_test
+    tests/trig_test.c
+  )
+
+  target_link_libraries(ffts_trig_test ffts)
+  if(MPFR_LIBRARIES)
+    target_link_libraries(ffts_trig_test ${MPFR_LIBRARIES})
+  endif(MPFR_LIBRARIES)
+
+  if(OPENMP_FOUND)
+    if(MSVC)
+      set_target_properties(ffts_trig_test PROPERTIES
+        COMPILE_FLAGS "${OpenMP_C_FLAGS}"
+        LINK_FLAGS "${OpenMP_EXE_LINKER_FLAGS}"
+      )
+    else()
+      set_target_properties(ffts_trig_test PROPERTIES
+        COMPILE_FLAGS "${OpenMP_C_FLAGS}"
+        LINK_FLAGS "${OpenMP_C_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}"
+      )
+    endif(MSVC)
+  endif(OPENMP_FOUND)
+
   add_executable(ffts_test
     tests/test.c
   )
@@ -467,6 +581,14 @@ if(ENABLE_STATIC OR ENABLE_SHARED)
     ffts
     ${FFTS_EXTRA_LIBRARIES}
   )
+
+  add_executable(ffts_cpu_test
+    src/ffts_cpu.c
+    src/ffts_cpu.h
+    tests/cpu_test.c
+  )
+
+  set_target_properties(ffts_cpu_test PROPERTIES COMPILE_DEFINITIONS FFTS_BUILDING_CPU_TEST)
 endif(ENABLE_STATIC OR ENABLE_SHARED)
 
 # generate packageconfig file
diff --git a/lib/ffts/config.guess b/lib/ffts/config.guess
index 0967f2a..137bedf 100755
--- a/lib/ffts/config.guess
+++ b/lib/ffts/config.guess
@@ -1,12 +1,14 @@
 #! /bin/sh
 # Attempt to guess a canonical system name.
-#   Copyright 1992-2016 Free Software Foundation, Inc.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+#   2011, 2012 Free Software Foundation, Inc.
 
-timestamp='2016-04-02'
+timestamp='2012-08-14'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
+# the Free Software Foundation; either version 2 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful, but
@@ -20,17 +22,19 @@ timestamp='2016-04-02'
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
 # configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that
-# program.  This Exception is an additional permission under section 7
-# of the GNU General Public License, version 3 ("GPLv3").
+# the same distribution terms that you use for the rest of that program.
+
+
+# Originally written by Per Bothner.  Please send patches (context
+# diff format) to <config-patches@gnu.org> and include a ChangeLog
+# entry.
 #
-# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
+# This script attempts to guess a canonical system name similar to
+# config.sub.  If it succeeds, it prints the system name on stdout, and
+# exits with 0.  Otherwise, it exits with 1.
 #
 # You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
-#
-# Please send patches to <config-patches@gnu.org>.
-
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
 
 me=`echo "$0" | sed -e 's,.*/,,'`
 
@@ -50,7 +54,9 @@ version="\
 GNU config.guess ($timestamp)
 
 Originally written by Per Bothner.
-Copyright 1992-2016 Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
+Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -132,27 +138,6 @@ UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
 UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
 UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
 
-case "${UNAME_SYSTEM}" in
-Linux|GNU|GNU/*)
-	# If the system lacks a compiler, then just pick glibc.
-	# We could probably try harder.
-	LIBC=gnu
-
-	eval $set_cc_for_build
-	cat <<-EOF > $dummy.c
-	#include <features.h>
-	#if defined(__UCLIBC__)
-	LIBC=uclibc
-	#elif defined(__dietlibc__)
-	LIBC=dietlibc
-	#else
-	LIBC=gnu
-	#endif
-	EOF
-	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`
-	;;
-esac
-
 # Note: order is significant - the case branches are not exclusive.
 
 case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
@@ -168,27 +153,20 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	# Note: NetBSD doesn't particularly care about the vendor
 	# portion of the name.  We always set it to "unknown".
 	sysctl="sysctl -n hw.machine_arch"
-	UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \
-	    /sbin/$sysctl 2>/dev/null || \
-	    /usr/sbin/$sysctl 2>/dev/null || \
-	    echo unknown)`
+	UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
+	    /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
 	case "${UNAME_MACHINE_ARCH}" in
 	    armeb) machine=armeb-unknown ;;
 	    arm*) machine=arm-unknown ;;
 	    sh3el) machine=shl-unknown ;;
 	    sh3eb) machine=sh-unknown ;;
 	    sh5el) machine=sh5le-unknown ;;
-	    earmv*)
-		arch=`echo ${UNAME_MACHINE_ARCH} | sed -e 's,^e\(armv[0-9]\).*$,\1,'`
-		endian=`echo ${UNAME_MACHINE_ARCH} | sed -ne 's,^.*\(eb\)$,\1,p'`
-		machine=${arch}${endian}-unknown
-		;;
 	    *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
 	esac
 	# The Operating System including object format, if it has switched
 	# to ELF recently, or will in the future.
 	case "${UNAME_MACHINE_ARCH}" in
-	    arm*|earm*|i386|m68k|ns32k|sh3*|sparc|vax)
+	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
 		eval $set_cc_for_build
 		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
 			| grep -q __ELF__
@@ -204,13 +182,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 		os=netbsd
 		;;
 	esac
-	# Determine ABI tags.
-	case "${UNAME_MACHINE_ARCH}" in
-	    earm*)
-		expr='s/^earmv[0-9]/-eabi/;s/eb$//'
-		abi=`echo ${UNAME_MACHINE_ARCH} | sed -e "$expr"`
-		;;
-	esac
 	# The OS release
 	# Debian GNU/NetBSD machines have a different userland, and
 	# thus, need a distinct triplet. However, they do not need
@@ -221,13 +192,13 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 		release='-gnu'
 		;;
 	    *)
-		release=`echo ${UNAME_RELEASE} | sed -e 's/[-_].*//' | cut -d. -f1,2`
+		release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
 		;;
 	esac
 	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
 	# contains redundant information, the shorter form:
 	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
-	echo "${machine}-${os}${release}${abi}"
+	echo "${machine}-${os}${release}"
 	exit ;;
     *:Bitrig:*:*)
 	UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
@@ -237,10 +208,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
 	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
 	exit ;;
-    *:LibertyBSD:*:*)
-	UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'`
-	echo ${UNAME_MACHINE_ARCH}-unknown-libertybsd${UNAME_RELEASE}
-	exit ;;
     *:ekkoBSD:*:*)
 	echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
 	exit ;;
@@ -253,9 +220,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
     *:MirBSD:*:*)
 	echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
 	exit ;;
-    *:Sortix:*:*)
-	echo ${UNAME_MACHINE}-unknown-sortix
-	exit ;;
     alpha:OSF1:*:*)
 	case $UNAME_RELEASE in
 	*4.0)
@@ -272,42 +236,42 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
 	case "$ALPHA_CPU_TYPE" in
 	    "EV4 (21064)")
-		UNAME_MACHINE=alpha ;;
+		UNAME_MACHINE="alpha" ;;
 	    "EV4.5 (21064)")
-		UNAME_MACHINE=alpha ;;
+		UNAME_MACHINE="alpha" ;;
 	    "LCA4 (21066/21068)")
-		UNAME_MACHINE=alpha ;;
+		UNAME_MACHINE="alpha" ;;
 	    "EV5 (21164)")
-		UNAME_MACHINE=alphaev5 ;;
+		UNAME_MACHINE="alphaev5" ;;
 	    "EV5.6 (21164A)")
-		UNAME_MACHINE=alphaev56 ;;
+		UNAME_MACHINE="alphaev56" ;;
 	    "EV5.6 (21164PC)")
-		UNAME_MACHINE=alphapca56 ;;
+		UNAME_MACHINE="alphapca56" ;;
 	    "EV5.7 (21164PC)")
-		UNAME_MACHINE=alphapca57 ;;
+		UNAME_MACHINE="alphapca57" ;;
 	    "EV6 (21264)")
-		UNAME_MACHINE=alphaev6 ;;
+		UNAME_MACHINE="alphaev6" ;;
 	    "EV6.7 (21264A)")
-		UNAME_MACHINE=alphaev67 ;;
+		UNAME_MACHINE="alphaev67" ;;
 	    "EV6.8CB (21264C)")
-		UNAME_MACHINE=alphaev68 ;;
+		UNAME_MACHINE="alphaev68" ;;
 	    "EV6.8AL (21264B)")
-		UNAME_MACHINE=alphaev68 ;;
+		UNAME_MACHINE="alphaev68" ;;
 	    "EV6.8CX (21264D)")
-		UNAME_MACHINE=alphaev68 ;;
+		UNAME_MACHINE="alphaev68" ;;
 	    "EV6.9A (21264/EV69A)")
-		UNAME_MACHINE=alphaev69 ;;
+		UNAME_MACHINE="alphaev69" ;;
 	    "EV7 (21364)")
-		UNAME_MACHINE=alphaev7 ;;
+		UNAME_MACHINE="alphaev7" ;;
 	    "EV7.9 (21364A)")
-		UNAME_MACHINE=alphaev79 ;;
+		UNAME_MACHINE="alphaev79" ;;
 	esac
 	# A Pn.n version is a patched version.
 	# A Vn.n version is a released version.
 	# A Tn.n version is a released field test version.
 	# A Xn.n version is an unreleased experimental baselevel.
 	# 1.2 uses "1.2" for uname -r.
-	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
+	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
 	# Reset EXIT trap before exiting to avoid spurious non-zero exit code.
 	exitcode=$?
 	trap '' 0
@@ -342,7 +306,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
     arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
 	echo arm-acorn-riscix${UNAME_RELEASE}
 	exit ;;
-    arm*:riscos:*:*|arm*:RISCOS:*:*)
+    arm:riscos:*:*|arm:RISCOS:*:*)
 	echo arm-unknown-riscos
 	exit ;;
     SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
@@ -380,16 +344,16 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	exit ;;
     i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
 	eval $set_cc_for_build
-	SUN_ARCH=i386
+	SUN_ARCH="i386"
 	# If there is a compiler, see if it is configured for 64-bit objects.
 	# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
 	# This test works for both compilers.
-	if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
+	if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
 	    if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
-		(CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+		(CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
 		grep IS_64BIT_ARCH >/dev/null
 	    then
-		SUN_ARCH=x86_64
+		SUN_ARCH="x86_64"
 	    fi
 	fi
 	echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
@@ -414,7 +378,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	exit ;;
     sun*:*:4.2BSD:*)
 	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
-	test "x${UNAME_RELEASE}" = x && UNAME_RELEASE=3
+	test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
 	case "`/bin/arch`" in
 	    sun3)
 		echo m68k-sun-sunos${UNAME_RELEASE}
@@ -600,9 +564,8 @@ EOF
 	else
 		IBM_ARCH=powerpc
 	fi
-	if [ -x /usr/bin/lslpp ] ; then
-		IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc |
-			   awk -F: '{ print $3 }' | sed s/[0-9]*$/0/`
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
 	else
 		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
 	fi
@@ -639,13 +602,13 @@ EOF
 		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
 		    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
 		    case "${sc_cpu_version}" in
-		      523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0
-		      528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1
+		      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
+		      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
 		      532)                      # CPU_PA_RISC2_0
 			case "${sc_kernel_bits}" in
-			  32) HP_ARCH=hppa2.0n ;;
-			  64) HP_ARCH=hppa2.0w ;;
-			  '') HP_ARCH=hppa2.0 ;;   # HP-UX 10.20
+			  32) HP_ARCH="hppa2.0n" ;;
+			  64) HP_ARCH="hppa2.0w" ;;
+			  '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
 			esac ;;
 		    esac
 		fi
@@ -684,11 +647,11 @@ EOF
 		    exit (0);
 		}
 EOF
-		    (CCOPTS="" $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
+		    (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
 		    test -z "$HP_ARCH" && HP_ARCH=hppa
 		fi ;;
 	esac
-	if [ ${HP_ARCH} = hppa2.0w ]
+	if [ ${HP_ARCH} = "hppa2.0w" ]
 	then
 	    eval $set_cc_for_build
 
@@ -701,12 +664,12 @@ EOF
 	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
 	    # => hppa64-hp-hpux11.23
 
-	    if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) |
+	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
 		grep -q __LP64__
 	    then
-		HP_ARCH=hppa2.0w
+		HP_ARCH="hppa2.0w"
 	    else
-		HP_ARCH=hppa64
+		HP_ARCH="hppa64"
 	    fi
 	fi
 	echo ${HP_ARCH}-hp-hpux${HPUX_REV}
@@ -811,14 +774,14 @@ EOF
 	echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
 	exit ;;
     F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
-	FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
-	FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
+	FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+	FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
 	FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
 	echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
 	exit ;;
     5000:UNIX_System_V:4.*:*)
-	FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
-	FUJITSU_REL=`echo ${UNAME_RELEASE} | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'`
+	FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+	FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
 	echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
 	exit ;;
     i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
@@ -848,7 +811,7 @@ EOF
     *:MINGW*:*)
 	echo ${UNAME_MACHINE}-pc-mingw32
 	exit ;;
-    *:MSYS*:*)
+    i*:MSYS*:*)
 	echo ${UNAME_MACHINE}-pc-msys
 	exit ;;
     i*:windows32*:*)
@@ -896,21 +859,21 @@ EOF
 	exit ;;
     *:GNU:*:*)
 	# the GNU system
-	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
 	exit ;;
     *:GNU/*:*:*)
 	# other systems with GNU libc and userland
-	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
 	exit ;;
     i*86:Minix:*:*)
 	echo ${UNAME_MACHINE}-pc-minix
 	exit ;;
     aarch64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     aarch64_be:Linux:*:*)
 	UNAME_MACHINE=aarch64_be
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     alpha:Linux:*:*)
 	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
@@ -923,60 +886,59 @@ EOF
 	  EV68*) UNAME_MACHINE=alphaev68 ;;
 	esac
 	objdump --private-headers /bin/sh | grep -q ld.so.1
-	if test "$?" = 0 ; then LIBC=gnulibc1 ; fi
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-	exit ;;
-    arc:Linux:*:* | arceb:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
 	exit ;;
     arm*:Linux:*:*)
 	eval $set_cc_for_build
 	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
 	    | grep -q __ARM_EABI__
 	then
-	    echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	    echo ${UNAME_MACHINE}-unknown-linux-gnu
 	else
 	    if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
 		| grep -q __ARM_PCS_VFP
 	    then
-		echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi
+		echo ${UNAME_MACHINE}-unknown-linux-gnueabi
 	    else
-		echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf
+		echo ${UNAME_MACHINE}-unknown-linux-gnueabihf
 	    fi
 	fi
 	exit ;;
     avr32*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     cris:Linux:*:*)
-	echo ${UNAME_MACHINE}-axis-linux-${LIBC}
+	echo ${UNAME_MACHINE}-axis-linux-gnu
 	exit ;;
     crisv32:Linux:*:*)
-	echo ${UNAME_MACHINE}-axis-linux-${LIBC}
-	exit ;;
-    e2k:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-axis-linux-gnu
 	exit ;;
     frv:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     hexagon:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     i*86:Linux:*:*)
-	echo ${UNAME_MACHINE}-pc-linux-${LIBC}
+	LIBC=gnu
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#ifdef __dietlibc__
+	LIBC=dietlibc
+	#endif
+EOF
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
+	echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
 	exit ;;
     ia64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-	exit ;;
-    k1om:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     m32r*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     m68*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     mips:Linux:*:* | mips64:Linux:*:*)
 	eval $set_cc_for_build
@@ -995,63 +957,60 @@ EOF
 	#endif
 EOF
 	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
-	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
 	;;
-    openrisc*:Linux:*:*)
-	echo or1k-unknown-linux-${LIBC}
-	exit ;;
-    or32:Linux:*:* | or1k*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+    or32:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     padre:Linux:*:*)
-	echo sparc-unknown-linux-${LIBC}
+	echo sparc-unknown-linux-gnu
 	exit ;;
     parisc64:Linux:*:* | hppa64:Linux:*:*)
-	echo hppa64-unknown-linux-${LIBC}
+	echo hppa64-unknown-linux-gnu
 	exit ;;
     parisc:Linux:*:* | hppa:Linux:*:*)
 	# Look for CPU level
 	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
-	  PA7*) echo hppa1.1-unknown-linux-${LIBC} ;;
-	  PA8*) echo hppa2.0-unknown-linux-${LIBC} ;;
-	  *)    echo hppa-unknown-linux-${LIBC} ;;
+	  PA7*) echo hppa1.1-unknown-linux-gnu ;;
+	  PA8*) echo hppa2.0-unknown-linux-gnu ;;
+	  *)    echo hppa-unknown-linux-gnu ;;
 	esac
 	exit ;;
     ppc64:Linux:*:*)
-	echo powerpc64-unknown-linux-${LIBC}
+	echo powerpc64-unknown-linux-gnu
 	exit ;;
     ppc:Linux:*:*)
-	echo powerpc-unknown-linux-${LIBC}
+	echo powerpc-unknown-linux-gnu
 	exit ;;
     ppc64le:Linux:*:*)
-	echo powerpc64le-unknown-linux-${LIBC}
+	echo powerpc64le-unknown-linux-gnu
 	exit ;;
     ppcle:Linux:*:*)
-	echo powerpcle-unknown-linux-${LIBC}
+	echo powerpcle-unknown-linux-gnu
 	exit ;;
     s390:Linux:*:* | s390x:Linux:*:*)
-	echo ${UNAME_MACHINE}-ibm-linux-${LIBC}
+	echo ${UNAME_MACHINE}-ibm-linux
 	exit ;;
     sh64*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     sh*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     sparc:Linux:*:* | sparc64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     tile*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     vax:Linux:*:*)
-	echo ${UNAME_MACHINE}-dec-linux-${LIBC}
+	echo ${UNAME_MACHINE}-dec-linux-gnu
 	exit ;;
     x86_64:Linux:*:*)
-	echo ${UNAME_MACHINE}-pc-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     xtensa*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     i*86:DYNIX/ptx:4*:*)
 	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
@@ -1127,7 +1080,7 @@ EOF
 	# uname -m prints for DJGPP always 'pc', but it prints nothing about
 	# the processor, so we play safe by assuming i586.
 	# Note: whatever this is, it MUST be the same as what config.sub
-	# prints for the "djgpp" host, or else GDB configure will decide that
+	# prints for the "djgpp" host, or else GDB configury will decide that
 	# this is a cross-build.
 	echo i586-pc-msdosdjgpp
 	exit ;;
@@ -1276,9 +1229,6 @@ EOF
     SX-8R:SUPER-UX:*:*)
 	echo sx8r-nec-superux${UNAME_RELEASE}
 	exit ;;
-    SX-ACE:SUPER-UX:*:*)
-	echo sxace-nec-superux${UNAME_RELEASE}
-	exit ;;
     Power*:Rhapsody:*:*)
 	echo powerpc-apple-rhapsody${UNAME_RELEASE}
 	exit ;;
@@ -1287,36 +1237,24 @@ EOF
 	exit ;;
     *:Darwin:*:*)
 	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
-	eval $set_cc_for_build
-	if test "$UNAME_PROCESSOR" = unknown ; then
-	    UNAME_PROCESSOR=powerpc
-	fi
-	if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then
-	    if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
-		if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
-		    (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
-		    grep IS_64BIT_ARCH >/dev/null
-		then
-		    case $UNAME_PROCESSOR in
-			i386) UNAME_PROCESSOR=x86_64 ;;
-			powerpc) UNAME_PROCESSOR=powerpc64 ;;
-		    esac
-		fi
-	    fi
-	elif test "$UNAME_PROCESSOR" = i386 ; then
-	    # Avoid executing cc on OS X 10.9, as it ships with a stub
-	    # that puts up a graphical alert prompting to install
-	    # developer tools.  Any system running Mac OS X 10.7 or
-	    # later (Darwin 11 and later) is required to have a 64-bit
-	    # processor. This is not true of the ARM version of Darwin
-	    # that Apple uses in portable devices.
-	    UNAME_PROCESSOR=x86_64
-	fi
+	case $UNAME_PROCESSOR in
+	    i386)
+		eval $set_cc_for_build
+		if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+		  if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+		      (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+		      grep IS_64BIT_ARCH >/dev/null
+		  then
+		      UNAME_PROCESSOR="x86_64"
+		  fi
+		fi ;;
+	    unknown) UNAME_PROCESSOR=powerpc ;;
+	esac
 	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
 	exit ;;
     *:procnto*:*:* | *:QNX:[0123456789]*:*)
 	UNAME_PROCESSOR=`uname -p`
-	if test "$UNAME_PROCESSOR" = x86; then
+	if test "$UNAME_PROCESSOR" = "x86"; then
 		UNAME_PROCESSOR=i386
 		UNAME_MACHINE=pc
 	fi
@@ -1347,7 +1285,7 @@ EOF
 	# "uname -m" is not consistent, so use $cputype instead. 386
 	# is converted to i386 for consistency with other x86
 	# operating systems.
-	if test "$cputype" = 386; then
+	if test "$cputype" = "386"; then
 	    UNAME_MACHINE=i386
 	else
 	    UNAME_MACHINE="$cputype"
@@ -1389,7 +1327,7 @@ EOF
 	echo i386-pc-xenix
 	exit ;;
     i*86:skyos:*:*)
-	echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE} | sed -e 's/ .*$//'`
+	echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
 	exit ;;
     i*86:rdos:*:*)
 	echo ${UNAME_MACHINE}-pc-rdos
@@ -1400,11 +1338,156 @@ EOF
     x86_64:VMkernel:*:*)
 	echo ${UNAME_MACHINE}-unknown-esx
 	exit ;;
-    amd64:Isilon\ OneFS:*:*)
-	echo x86_64-unknown-onefs
-	exit ;;
 esac
 
+eval $set_cc_for_build
+cat >$dummy.c <<EOF
+#ifdef _SEQUENT_
+# include <sys/types.h>
+# include <sys/utsname.h>
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
+     I don't know....  */
+  printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+  printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+	"4"
+#else
+	""
+#endif
+	); exit (0);
+#endif
+#endif
+
+#if defined (__arm) && defined (__acorn) && defined (__unix)
+  printf ("arm-acorn-riscix\n"); exit (0);
+#endif
+
+#if defined (hp300) && !defined (hpux)
+  printf ("m68k-hp-bsd\n"); exit (0);
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+  int version;
+  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+  if (version < 4)
+    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  else
+    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+  exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+  printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+  printf ("ns32k-encore-mach\n"); exit (0);
+#else
+  printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+  printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+  printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+  printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+    struct utsname un;
+
+    uname(&un);
+
+    if (strncmp(un.version, "V2", 2) == 0) {
+	printf ("i386-sequent-ptx2\n"); exit (0);
+    }
+    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+	printf ("i386-sequent-ptx1\n"); exit (0);
+    }
+    printf ("i386-sequent-ptx\n"); exit (0);
+
+#endif
+
+#if defined (vax)
+# if !defined (ultrix)
+#  include <sys/param.h>
+#  if defined (BSD)
+#   if BSD == 43
+      printf ("vax-dec-bsd4.3\n"); exit (0);
+#   else
+#    if BSD == 199006
+      printf ("vax-dec-bsd4.3reno\n"); exit (0);
+#    else
+      printf ("vax-dec-bsd\n"); exit (0);
+#    endif
+#   endif
+#  else
+    printf ("vax-dec-bsd\n"); exit (0);
+#  endif
+# else
+    printf ("vax-dec-ultrix\n"); exit (0);
+# endif
+#endif
+
+#if defined (alliant) && defined (i860)
+  printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+  exit (1);
+}
+EOF
+
+$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
+	{ echo "$SYSTEM_NAME"; exit; }
+
+# Apollos put the system type in the environment.
+
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
+
+# Convex versions that predate uname can use getsysinfo(1)
+
+if [ -x /usr/convex/getsysinfo ]
+then
+    case `getsysinfo -f cpu_type` in
+    c1*)
+	echo c1-convex-bsd
+	exit ;;
+    c2*)
+	if getsysinfo -f scalar_acc
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+	exit ;;
+    c34*)
+	echo c34-convex-bsd
+	exit ;;
+    c38*)
+	echo c38-convex-bsd
+	exit ;;
+    c4*)
+	echo c4-convex-bsd
+	exit ;;
+    esac
+fi
+
 cat >&2 <<EOF
 $0: unable to guess system type
 
@@ -1412,9 +1495,9 @@ This script, last modified $timestamp, has failed to recognize
 the operating system you are using. It is advised that you
 download the most up to date version of the config scripts from
 
-  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
 and
-  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
 
 If the version you run ($0) is already up to date, please
 send the following data and any information you think might be
diff --git a/lib/ffts/config.sub b/lib/ffts/config.sub
index 8d39c4b..bdda9e4 100755
--- a/lib/ffts/config.sub
+++ b/lib/ffts/config.sub
@@ -1,18 +1,24 @@
 #! /bin/sh
 # Configuration validation subroutine script.
-#   Copyright 1992-2016 Free Software Foundation, Inc.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+#   2011, 2012 Free Software Foundation, Inc.
 
-timestamp='2016-03-30'
+timestamp='2012-08-18'
 
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
+# This file is (in principle) common to ALL GNU software.
+# The presence of a machine in this file suggests that SOME GNU software
+# can handle that machine.  It does not imply ALL GNU software can.
+#
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
 # (at your option) any later version.
 #
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, see <http://www.gnu.org/licenses/>.
@@ -20,12 +26,11 @@ timestamp='2016-03-30'
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
 # configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that
-# program.  This Exception is an additional permission under section 7
-# of the GNU General Public License, version 3 ("GPLv3").
+# the same distribution terms that you use for the rest of that program.
 
 
-# Please send patches to <config-patches@gnu.org>.
+# Please send patches to <config-patches@gnu.org>.  Submit a context
+# diff and a properly formatted GNU ChangeLog entry.
 #
 # Configuration subroutine to validate and canonicalize a configuration type.
 # Supply the specified configuration type as an argument.
@@ -33,7 +38,7 @@ timestamp='2016-03-30'
 # Otherwise, we print the canonical config type on stdout and succeed.
 
 # You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
 
 # This file is supposed to be the same for all GNU packages
 # and recognize all the CPU types, system types and aliases
@@ -53,7 +58,8 @@ timestamp='2016-03-30'
 me=`echo "$0" | sed -e 's,.*/,,'`
 
 usage="\
-Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS
+Usage: $0 [OPTION] CPU-MFR-OPSYS
+       $0 [OPTION] ALIAS
 
 Canonicalize a configuration name.
 
@@ -67,7 +73,9 @@ Report bugs and patches to <config-patches@gnu.org>."
 version="\
 GNU config.sub ($timestamp)
 
-Copyright 1992-2016 Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
+Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -116,7 +124,7 @@ maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
 case $maybe_os in
   nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
   linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
-  knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \
+  knetbsd*-gnu* | netbsd*-gnu* | \
   kopensolaris*-gnu* | \
   storm-chaos* | os2-emx* | rtmk-nova*)
     os=-$maybe_os
@@ -148,7 +156,7 @@ case $os in
 	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
 	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
 	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
-	-apple | -axis | -knuth | -cray | -microblaze*)
+	-apple | -axis | -knuth | -cray | -microblaze)
 		os=
 		basic_machine=$1
 		;;
@@ -251,25 +259,21 @@ case $basic_machine in
 	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
 	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
 	| am33_2.0 \
-	| arc | arceb \
-	| arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
-	| avr | avr32 \
-	| ba \
-	| be32 | be64 \
+	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
+        | be32 | be64 \
 	| bfin \
-	| c4x | c8051 | clipper \
+	| c4x | clipper \
 	| d10v | d30v | dlx | dsp16xx \
-	| e2k | epiphany \
-	| fido | fr30 | frv | ft32 \
+	| epiphany \
+	| fido | fr30 | frv \
 	| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
 	| hexagon \
 	| i370 | i860 | i960 | ia64 \
 	| ip2k | iq2000 \
-	| k1om \
 	| le32 | le64 \
 	| lm32 \
 	| m32c | m32r | m32rle | m68000 | m68k | m88k \
-	| maxq | mb | microblaze | microblazeel | mcore | mep | metag \
+	| maxq | mb | microblaze | mcore | mep | metag \
 	| mips | mipsbe | mipseb | mipsel | mipsle \
 	| mips16 \
 	| mips64 | mips64el \
@@ -283,29 +287,26 @@ case $basic_machine in
 	| mips64vr5900 | mips64vr5900el \
 	| mipsisa32 | mipsisa32el \
 	| mipsisa32r2 | mipsisa32r2el \
-	| mipsisa32r6 | mipsisa32r6el \
 	| mipsisa64 | mipsisa64el \
 	| mipsisa64r2 | mipsisa64r2el \
-	| mipsisa64r6 | mipsisa64r6el \
 	| mipsisa64sb1 | mipsisa64sb1el \
 	| mipsisa64sr71k | mipsisa64sr71kel \
-	| mipsr5900 | mipsr5900el \
 	| mipstx39 | mipstx39el \
 	| mn10200 | mn10300 \
 	| moxie \
 	| mt \
 	| msp430 \
 	| nds32 | nds32le | nds32be \
-	| nios | nios2 | nios2eb | nios2el \
+	| nios | nios2 \
 	| ns16k | ns32k \
-	| open8 | or1k | or1knd | or32 \
+	| open8 \
+	| or32 \
 	| pdp10 | pdp11 | pj | pjl \
 	| powerpc | powerpc64 | powerpc64le | powerpcle \
 	| pyramid \
-	| riscv32 | riscv64 \
 	| rl78 | rx \
 	| score \
-	| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
+	| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
 	| sh64 | sh64le \
 	| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
 	| sparcv8 | sparcv9 | sparcv9b | sparcv9v \
@@ -313,7 +314,6 @@ case $basic_machine in
 	| tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
 	| ubicom32 \
 	| v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
-	| visium \
 	| we32k \
 	| x86 | xc16x | xstormy16 | xtensa \
 	| z8k | z80)
@@ -328,10 +328,7 @@ case $basic_machine in
 	c6x)
 		basic_machine=tic6x-unknown
 		;;
-	leon|leon[3-9])
-		basic_machine=sparc-$basic_machine
-		;;
-	m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip)
+	m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip)
 		basic_machine=$basic_machine-unknown
 		os=-none
 		;;
@@ -373,29 +370,26 @@ case $basic_machine in
 	| aarch64-* | aarch64_be-* \
 	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
 	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
-	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \
+	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
 	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
 	| avr-* | avr32-* \
-	| ba-* \
 	| be32-* | be64-* \
 	| bfin-* | bs2000-* \
 	| c[123]* | c30-* | [cjt]90-* | c4x-* \
-	| c8051-* | clipper-* | craynv-* | cydra-* \
+	| clipper-* | craynv-* | cydra-* \
 	| d10v-* | d30v-* | dlx-* \
-	| e2k-* | elxsi-* \
+	| elxsi-* \
 	| f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
 	| h8300-* | h8500-* \
 	| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
 	| hexagon-* \
 	| i*86-* | i860-* | i960-* | ia64-* \
 	| ip2k-* | iq2000-* \
-	| k1om-* \
 	| le32-* | le64-* \
 	| lm32-* \
 	| m32c-* | m32r-* | m32rle-* \
 	| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
-	| m88110-* | m88k-* | maxq-* | mcore-* | metag-* \
-	| microblaze-* | microblazeel-* \
+	| m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \
 	| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
 	| mips16-* \
 	| mips64-* | mips64el-* \
@@ -409,33 +403,28 @@ case $basic_machine in
 	| mips64vr5900-* | mips64vr5900el-* \
 	| mipsisa32-* | mipsisa32el-* \
 	| mipsisa32r2-* | mipsisa32r2el-* \
-	| mipsisa32r6-* | mipsisa32r6el-* \
 	| mipsisa64-* | mipsisa64el-* \
 	| mipsisa64r2-* | mipsisa64r2el-* \
-	| mipsisa64r6-* | mipsisa64r6el-* \
 	| mipsisa64sb1-* | mipsisa64sb1el-* \
 	| mipsisa64sr71k-* | mipsisa64sr71kel-* \
-	| mipsr5900-* | mipsr5900el-* \
 	| mipstx39-* | mipstx39el-* \
 	| mmix-* \
 	| mt-* \
 	| msp430-* \
 	| nds32-* | nds32le-* | nds32be-* \
-	| nios-* | nios2-* | nios2eb-* | nios2el-* \
+	| nios-* | nios2-* \
 	| none-* | np1-* | ns16k-* | ns32k-* \
 	| open8-* \
-	| or1k*-* \
 	| orion-* \
 	| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
 	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
 	| pyramid-* \
-	| riscv32-* | riscv64-* \
 	| rl78-* | romp-* | rs6000-* | rx-* \
 	| sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
 	| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
 	| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
 	| sparclite-* \
-	| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \
+	| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \
 	| tahoe-* \
 	| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
 	| tile*-* \
@@ -443,7 +432,6 @@ case $basic_machine in
 	| ubicom32-* \
 	| v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
 	| vax-* \
-	| visium-* \
 	| we32k-* \
 	| x86-* | x86_64-* | xc16x-* | xps100-* \
 	| xstormy16-* | xtensa*-* \
@@ -520,9 +508,6 @@ case $basic_machine in
 		basic_machine=i386-pc
 		os=-aros
 		;;
-	asmjs)
-		basic_machine=asmjs-unknown
-		;;
 	aux)
 		basic_machine=m68k-apple
 		os=-aux
@@ -784,9 +769,6 @@ case $basic_machine in
 		basic_machine=m68k-isi
 		os=-sysv
 		;;
-	leon-*|leon[3-9]-*)
-		basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'`
-		;;
 	m68knommu)
 		basic_machine=m68k-unknown
 		os=-linux
@@ -806,7 +788,7 @@ case $basic_machine in
 		basic_machine=ns32k-utek
 		os=-sysv
 		;;
-	microblaze*)
+	microblaze)
 		basic_machine=microblaze-xilinx
 		;;
 	mingw64)
@@ -814,7 +796,7 @@ case $basic_machine in
 		os=-mingw64
 		;;
 	mingw32)
-		basic_machine=i686-pc
+		basic_machine=i386-pc
 		os=-mingw32
 		;;
 	mingw32ce)
@@ -842,10 +824,6 @@ case $basic_machine in
 		basic_machine=powerpc-unknown
 		os=-morphos
 		;;
-	moxiebox)
-		basic_machine=moxie-unknown
-		os=-moxiebox
-		;;
 	msdos)
 		basic_machine=i386-pc
 		os=-msdos
@@ -854,7 +832,7 @@ case $basic_machine in
 		basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
 		;;
 	msys)
-		basic_machine=i686-pc
+		basic_machine=i386-pc
 		os=-msys
 		;;
 	mvs)
@@ -1045,11 +1023,7 @@ case $basic_machine in
 		basic_machine=i586-unknown
 		os=-pw32
 		;;
-	rdos | rdos64)
-		basic_machine=x86_64-pc
-		os=-rdos
-		;;
-	rdos32)
+	rdos)
 		basic_machine=i386-pc
 		os=-rdos
 		;;
@@ -1376,13 +1350,13 @@ case $os in
 	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
 	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
 	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
-	      | -sym* | -kopensolaris* | -plan9* \
+	      | -sym* | -kopensolaris* \
 	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
-	      | -aos* | -aros* | -cloudabi* | -sortix* \
+	      | -aos* | -aros* \
 	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
 	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
 	      | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
-	      | -bitrig* | -openbsd* | -solidbsd* | -libertybsd* \
+	      | -bitrig* | -openbsd* | -solidbsd* \
 	      | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
 	      | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
 	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
@@ -1391,15 +1365,14 @@ case $os in
 	      | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
 	      | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \
 	      | -linux-newlib* | -linux-musl* | -linux-uclibc* \
-	      | -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \
+	      | -uxpv* | -beos* | -mpeix* | -udk* \
 	      | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
 	      | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
 	      | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
 	      | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
 	      | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
 	      | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
-	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \
-	      | -onefs* | -tirtos*)
+	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
 	# Remember, each alternative MUST END IN *, to match a version number.
 		;;
 	-qnx*)
@@ -1523,6 +1496,9 @@ case $os in
 	-aros*)
 		os=-aros
 		;;
+	-kaos*)
+		os=-kaos
+		;;
 	-zvmoe)
 		os=-zvmoe
 		;;
@@ -1531,8 +1507,6 @@ case $os in
 		;;
 	-nacl*)
 		;;
-	-ios)
-		;;
 	-none)
 		;;
 	*)
@@ -1573,9 +1547,6 @@ case $basic_machine in
 	c4x-* | tic4x-*)
 		os=-coff
 		;;
-	c8051-*)
-		os=-elf
-		;;
 	hexagon-*)
 		os=-elf
 		;;
diff --git a/lib/ffts/ffts.pc.cmake.in b/lib/ffts/ffts.pc.cmake.in
index 43f38e9..63d4cc0 100644
--- a/lib/ffts/ffts.pc.cmake.in
+++ b/lib/ffts/ffts.pc.cmake.in
@@ -1,7 +1,7 @@
 prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=${exec_prefix}
-libdir=${libdir}
-includedir=${includedir}
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
 
 Name: @CMAKE_PROJECT_NAME@
 Description: fast Fourier transform library
diff --git a/lib/ffts/include/ffts.h b/lib/ffts/include/ffts.h
index cc85a88..b13316f 100644
--- a/lib/ffts/include/ffts.h
+++ b/lib/ffts/include/ffts.h
@@ -3,6 +3,7 @@
  This file is part of FFTS.
 
  Copyright (c) 2012, Anthony M. Blake
+ Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
  All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
@@ -75,6 +76,9 @@ typedef struct _ffts_plan_t ffts_plan_t;
 FFTS_API ffts_plan_t*
 ffts_init_1d(size_t N, int sign);
 
+FFTS_API ffts_plan_t*
+ffts_init_1d_64f(size_t N, int sign);
+
 FFTS_API ffts_plan_t*
 ffts_init_2d(size_t N1, size_t N2, int sign);
 
diff --git a/lib/ffts/src/Makefile.am b/lib/ffts/src/Makefile.am
index 28c7879..ff6b0cc 100644
--- a/lib/ffts/src/Makefile.am
+++ b/lib/ffts/src/Makefile.am
@@ -2,7 +2,7 @@
 
 lib_LTLIBRARIES = libffts.la
 
-libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c ffts_transpose.c ffts_trig.c ffts_static.c
+libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c ffts_transpose.c ffts_trig.c ffts_static.c ffts_chirp_z.c
 libffts_la_SOURCES += codegen.h codegen_arm.h codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h macros-neon.h macros-sse.h macros.h neon.h neon_float.h patterns.h types.h vfp.h
 
 if DYNAMIC_DISABLED
@@ -14,7 +14,7 @@ endif
 libffts_includedir=$(includedir)/ffts
 libffts_include_HEADERS = ../include/ffts.h
 
-AM_CFLAGS = -I$(top_srcdir)/include
+AM_CFLAGS = -I$(top_srcdir)/include -DAUTOTOOLS_BUILD=yes
 
 if HAVE_VFP
 libffts_la_SOURCES += vfp.s 
diff --git a/lib/ffts/src/codegen.c b/lib/ffts/src/codegen.c
index c4e19e6..0bce616 100644
--- a/lib/ffts/src/codegen.c
+++ b/lib/ffts/src/codegen.c
@@ -139,9 +139,9 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
 
 #ifdef HAVE_SSE
     if (sign < 0) {
-        p->constants = sse_constants;
+        p->constants = (const void*) sse_constants;
     } else {
-        p->constants = sse_constants_inv;
+        p->constants = (const void*) sse_constants_inv;
     }
 #endif
 
diff --git a/lib/ffts/src/codegen_sse.h b/lib/ffts/src/codegen_sse.h
index e9819f1..2ca540e 100644
--- a/lib/ffts/src/codegen_sse.h
+++ b/lib/ffts/src/codegen_sse.h
@@ -488,7 +488,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
     x64_sse_movaps_reg_memindex(ins, X64_XMM7,  X64_RDX, offsets[0], X64_RAX, 2);
     x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[2], X64_RAX, 2);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[3], X64_RAX, 2);
@@ -507,14 +507,14 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
     x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
     x64_sse_movaps_reg_memindex(ins, X64_XMM8,  X64_RDX, offsets[6], X64_RAX, 2);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[7], X64_RAX, 2);
     x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM8);
     x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
     extend--;
 
     x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
@@ -530,7 +530,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
     x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9);
     x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10);
@@ -538,10 +538,10 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
     x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
     x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM8);
 
-    x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+    x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
     extend--;
 
-    x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0);
+    x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
@@ -551,7 +551,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
 
     x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
@@ -580,7 +580,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
     x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
     x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12);
@@ -588,7 +588,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
     x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7);
     x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13);
@@ -620,7 +620,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
     x64_sse_movaps_reg_memindex(ins, X64_XMM7,  X64_RSI, offsets[0], X64_RAX, 2);
     x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RSI, offsets[2], X64_RAX, 2);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[3], X64_RAX, 2);
@@ -640,14 +640,14 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
     x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
     x64_sse_movaps_reg_memindex(ins, X64_XMM3,  X64_RSI, offsets[6], X64_RAX, 2);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RSI, offsets[7], X64_RAX, 2);
     x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM3);
     x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
     extend--;
 
     x64_movsxd_reg_memindex(ins, X64_R11, X64_R8, 0, X64_RAX, 2);
@@ -663,7 +663,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
     x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9);
     x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10);
@@ -671,10 +671,10 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
     x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
     x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM3);
 
-    x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+    x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
     extend--;
 
-    x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0);
+    x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
@@ -684,7 +684,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
 
     x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM3, 0xB1);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
@@ -713,7 +713,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
     x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
     x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12);
@@ -721,7 +721,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
     x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7);
     x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13);
@@ -1157,28 +1157,28 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[0], X64_RAX, 2);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2);
     x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[2], X64_RAX, 2);
 
-    x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+    x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
     extend--;
 
-    x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+    x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2);
     x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
     x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RDX, offsets[4], X64_RAX, 2);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RDX, offsets[5], X64_RAX, 2);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[6], X64_RAX, 2);
@@ -1206,7 +1206,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
     x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
     x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0 ? 8 : 0);
     extend--;
 
     x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
@@ -1218,7 +1218,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
     x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1);
     x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
 
-    x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0);
+    x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14);
@@ -1257,28 +1257,28 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RSI, offsets[0], X64_RAX, 2);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[1], X64_RAX, 2);
     x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[2], X64_RAX, 2);
 
-    x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+    x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
     extend--;
 
-    x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+    x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RSI, offsets[3], X64_RAX, 2);
     x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
     x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RSI, offsets[4], X64_RAX, 2);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM3, X64_XMM6, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM3, X64_XMM6, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RSI, offsets[5], X64_RAX, 2);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RSI, offsets[6], X64_RAX, 2);
@@ -1306,7 +1306,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
     x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
     x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
 
-    x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0 ? 8 : 0);
     extend--;
 
     x64_movsxd_reg_memindex(ins, X64_R12, X64_R8, 8, X64_RAX, 2);
@@ -1318,7 +1318,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
     x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1);
     x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
 
-    x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0);
+	x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0 ? 8 : 0);
     extend--;
 
     x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14);
diff --git a/lib/ffts/src/ffts.c b/lib/ffts/src/ffts.c
index 7fa675a..35c5cad 100644
--- a/lib/ffts/src/ffts.c
+++ b/lib/ffts/src/ffts.c
@@ -34,6 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "ffts.h"
 
 #include "ffts_internal.h"
+#include "ffts_chirp_z.h"
 #include "ffts_static.h"
 #include "ffts_trig.h"
 #include "macros.h"
@@ -76,7 +77,8 @@ static const FFTS_ALIGN(64) float w_data[16] = {
 };
 #endif
 
-static FFTS_INLINE int ffts_allow_execute(void *start, size_t len)
+static FFTS_INLINE int
+ffts_allow_execute(void *start, size_t len)
 {
     int result;
 
@@ -90,7 +92,8 @@ static FFTS_INLINE int ffts_allow_execute(void *start, size_t len)
     return result;
 }
 
-static FFTS_INLINE int ffts_deny_execute(void *start, size_t len)
+static FFTS_INLINE int
+ffts_deny_execute(void *start, size_t len)
 {
     int result;
 
@@ -104,7 +107,8 @@ static FFTS_INLINE int ffts_deny_execute(void *start, size_t len)
     return result;
 }
 
-static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length)
+static FFTS_INLINE int
+ffts_flush_instruction_cache(void *start, size_t length)
 {
 #ifdef _WIN32
     return !FlushInstructionCache(GetCurrentProcess(), start, length);
@@ -124,7 +128,8 @@ static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length)
 #endif
 }
 
-static FFTS_INLINE void *ffts_vmem_alloc(size_t length)
+static FFTS_INLINE void*
+ffts_vmem_alloc(size_t length)
 {
 #if __APPLE__
     return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
@@ -139,7 +144,8 @@ static FFTS_INLINE void *ffts_vmem_alloc(size_t length)
 #endif
 }
 
-static FFTS_INLINE void ffts_vmem_free(void *addr, size_t length)
+static FFTS_INLINE void
+ffts_vmem_free(void *addr, size_t length)
 {
 #ifdef _WIN32
     (void) length;
@@ -174,7 +180,8 @@ ffts_free(ffts_plan_t *p)
     }
 }
 
-void ffts_free_1d(ffts_plan_t *p)
+static void
+ffts_free_1d(ffts_plan_t *p)
 {
 #if !defined(DYNAMIC_DISABLED)
     if (p->transform_base) {
@@ -188,7 +195,7 @@ void ffts_free_1d(ffts_plan_t *p)
     }
 
     if (p->ws) {
-        FFTS_FREE(p->ws);
+        ffts_aligned_free(p->ws);
     }
 
     if (p->is) {
@@ -233,7 +240,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
         lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f);
 #endif
 
-        p->ws = FFTS_MALLOC(lut_size, 32);
+        p->ws = ffts_aligned_malloc(lut_size);
         if (!p->ws) {
             goto cleanup;
         }
@@ -253,7 +260,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
 
     /* calculate factors */
     m = leaf_N << (n_luts - 2);
-    tmp = FFTS_MALLOC(m * sizeof(ffts_cpx_32f), 32);
+    tmp = ffts_aligned_malloc(m * sizeof(ffts_cpx_32f));
 
     ffts_generate_cosine_sine_pow2_32f(tmp, m);
 
@@ -263,7 +270,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
         p->ws_is[i] = w - (ffts_cpx_32f*) p->ws;
 
         if (!i) {
-            ffts_cpx_32f *w0 = FFTS_MALLOC(n/4 * sizeof(ffts_cpx_32f), 32);
+            ffts_cpx_32f *w0 = ffts_aligned_malloc(n/4 * sizeof(ffts_cpx_32f));
             float *fw0 = (float*) w0;
             float *fw = (float*) w;
 
@@ -300,11 +307,11 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
             w += n/4 * 2;
 #endif
 
-            FFTS_FREE(w0);
+            ffts_aligned_free(w0);
         } else {
-            ffts_cpx_32f *w0 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
-            ffts_cpx_32f *w1 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
-            ffts_cpx_32f *w2 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
+            ffts_cpx_32f *w0 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f));
+            ffts_cpx_32f *w1 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f));
+            ffts_cpx_32f *w2 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f));
 
             float *fw0 = (float*) w0;
             float *fw1 = (float*) w1;
@@ -380,9 +387,9 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
             w += n/8 * 3 * 2;
 #endif
 
-            FFTS_FREE(w0);
-            FFTS_FREE(w1);
-            FFTS_FREE(w2);
+            ffts_aligned_free(w0);
+            ffts_aligned_free(w1);
+            ffts_aligned_free(w2);
         }
 
         n *= 2;
@@ -401,7 +408,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
     }
 #endif
 
-    FFTS_FREE(tmp);
+    ffts_aligned_free(tmp);
 
     p->lastlut = w;
     p->n_luts = n_luts;
@@ -411,18 +418,166 @@ cleanup:
     return -1;
 }
 
+#ifdef FFTS_DOUBLE
+static int
+ffts_generate_luts_64f(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
+{
+    V4DF MULI_SIGN;
+    size_t n_luts;
+    ffts_cpx_64f *w;
+    ffts_cpx_64f *tmp;
+    size_t i, j, m, n;
+    int stride;
+
+    if (sign < 0) {
+        MULI_SIGN = V4DF_LIT4(-0.0, 0.0, -0.0, 0.0);
+    } else {
+        MULI_SIGN = V4DF_LIT4(0.0, -0.0, 0.0, -0.0);
+    }
+
+    /* LUTS: one table per trailing zero bit of N / leaf_N; stored in p->ws */
+    n_luts = ffts_ctzl(N / leaf_N);
+    if (n_luts >= 32) {
+        n_luts = 0;
+    }
+
+    if (n_luts) {
+        size_t lut_size;
+
+        lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_64f);
+
+        p->ws = ffts_aligned_malloc(lut_size);
+        if (!p->ws) {
+            goto cleanup;
+        }
+
+        p->ws_is = (size_t*) malloc(n_luts * sizeof(*p->ws_is));
+        if (!p->ws_is) {
+            goto cleanup;
+        }
+    }
+
+    w = p->ws;
+    n = leaf_N * 2;
+    /* calculate factors; NOTE(review): assumes n_luts >= 2, confirm caller */
+    m = leaf_N << (n_luts - 2);
+    tmp = ffts_aligned_malloc(m * sizeof(ffts_cpx_64f));
+    if (!tmp) {
+        goto cleanup;
+    }
+    ffts_generate_cosine_sine_pow2_64f(tmp, m);
+    /* generate lookup tables (NOTE(review): w0/w1/w2 below are unchecked) */
+    stride = 1 << (n_luts - 1);
+    for (i = 0; i < n_luts; i++) {
+        p->ws_is[i] = w - (ffts_cpx_64f*) p->ws;
+
+        if (!i) {
+            ffts_cpx_64f *w0 = ffts_aligned_malloc(n/4 * sizeof(ffts_cpx_64f));
+            double *fw0 = (double*) w0;
+            double *fw = (double*) w;
+
+            for (j = 0; j < n/4; j++) {
+                w0[j][0] = tmp[j * stride][0];
+                w0[j][1] = tmp[j * stride][1];
+            }
+
+            for (j = 0; j < n/4; j += 2) {
+                V4DF re, im, temp0;
+                temp0 = V4DF_LD(fw0 + j*2);
+                re = V4DF_DUPLICATE_RE(temp0);
+                im = V4DF_DUPLICATE_IM(temp0);
+                im = V4DF_XOR(im, MULI_SIGN);
+                V4DF_ST(fw + j*4 + 0, re);
+                V4DF_ST(fw + j*4 + 4, im);
+            }
+
+            w += n/4 * 2;
+            ffts_aligned_free(w0);
+        } else {
+            ffts_cpx_64f *w0 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f));
+            ffts_cpx_64f *w1 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f));
+            ffts_cpx_64f *w2 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f));
+
+            double *fw0 = (double*) w0;
+            double *fw1 = (double*) w1;
+            double *fw2 = (double*) w2;
+
+            double *fw = (double*)w;
+
+            for (j = 0; j < n/8; j++) {
+                w0[j][0] = tmp[2 * j * stride][0];
+                w0[j][1] = tmp[2 * j * stride][1];
+
+                w1[j][0] = tmp[j * stride][0];
+                w1[j][1] = tmp[j * stride][1];
+
+                w2[j][0] = tmp[(j + (n/8)) * stride][0];
+                w2[j][1] = tmp[(j + (n/8)) * stride][1];
+            }
+
+            for (j = 0; j < n/8; j += 2) {
+                V4DF temp0, temp1, temp2, re, im;
+
+                temp0 = V4DF_LD(fw0 + j*2);
+                re = V4DF_DUPLICATE_RE(temp0);
+                im = V4DF_DUPLICATE_IM(temp0);
+                im = V4DF_XOR(im, MULI_SIGN);
+                V4DF_ST(fw + j*2*6+0, re);
+                V4DF_ST(fw + j*2*6+4, im);
+
+                temp1 = V4DF_LD(fw1 + j*2);
+                re = V4DF_DUPLICATE_RE(temp1);
+                im = V4DF_DUPLICATE_IM(temp1);
+                im = V4DF_XOR(im, MULI_SIGN);
+                V4DF_ST(fw + j*2*6+8 , re);
+                V4DF_ST(fw + j*2*6+12, im);
+
+                temp2 = V4DF_LD(fw2 + j*2);
+                re = V4DF_DUPLICATE_RE(temp2);
+                im = V4DF_DUPLICATE_IM(temp2);
+                im = V4DF_XOR(im, MULI_SIGN);
+                V4DF_ST(fw + j*2*6+16, re);
+                V4DF_ST(fw + j*2*6+20, im);
+            }
+
+            w += n/8 * 3 * 2;
+            ffts_aligned_free(w0);
+            ffts_aligned_free(w1);
+            ffts_aligned_free(w2);
+        }
+
+        n *= 2;
+        stride >>= 1;
+    }
+
+    ffts_aligned_free(tmp);
+
+    p->lastlut = w;
+    p->n_luts = n_luts;
+    return 0;
+
+cleanup:
+    return -1; /* p->ws / p->ws_is are not freed here; they stay in the plan */
+}
+#endif
+
 FFTS_API ffts_plan_t*
 ffts_init_1d(size_t N, int sign)
 {
     const size_t leaf_N = 8;
     ffts_plan_t *p;
 
-    if (N < 2 || (N & (N - 1)) != 0) {
-        LOG("FFT size must be a power of two\n");
+    if (N < 2) {
+        LOG("FFT size must be greater than 1");
         return NULL;
     }
 
-    p = calloc(1, sizeof(*p));
+    /* check if size is not a power of two */
+    if (N & (N - 1)) {
+        return ffts_chirp_z_init(N, sign);
+    }
+
+    p = (ffts_plan_t*) calloc(1, sizeof(*p));
     if (!p) {
         return NULL;
     }
@@ -537,3 +692,98 @@ cleanup:
     ffts_free_1d(p);
     return NULL;
 }
+
+#ifdef FFTS_DOUBLE
+FFTS_API ffts_plan_t*
+ffts_init_1d_64f(size_t N, int sign)
+{
+    const size_t leaf_N = 8;
+    ffts_plan_t *p;
+
+    if (N < 2 || (N & (N - 1)) != 0) {
+        LOG("FFT size must be a power of two greater than 1");
+        return NULL; /* the chirp-z fallback only provides 32f transforms */
+    }
+
+    p = (ffts_plan_t*) calloc(1, sizeof(*p));
+    if (!p) {
+        return NULL;
+    }
+
+    p->destroy = ffts_free_1d;
+    p->N = N;
+
+    if (N >= 32) {
+        /* generate lookup tables */
+        if (ffts_generate_luts_64f(p, N, leaf_N, sign)) {
+            goto cleanup;
+        }
+
+        p->offsets = ffts_init_offsets(N, leaf_N);
+        if (!p->offsets) {
+            goto cleanup;
+        }
+
+        p->is = ffts_init_is(N, leaf_N, 1);
+        if (!p->is) {
+            goto cleanup;
+        }
+
+        p->i0 = N/leaf_N/3 + 1;
+        p->i1 = p->i2 = N/leaf_N/3;
+        if ((N/leaf_N) % 3 > 1) {
+            p->i1++;
+        }
+
+        p->i0 /= 2;
+        p->i1 /= 2;
+
+        if (sign < 0) {
+            p->transform = ffts_static_transform_f_64f;
+        } else {
+            p->transform = ffts_static_transform_i_64f;
+        }
+    } else {
+        /* small sizes: any sign other than -1 selects the backward kernel */
+        switch (N) {
+        case 2:
+            p->transform = &ffts_small_2_64f;
+            break;
+        case 4:
+            if (sign == -1) {
+                p->transform = &ffts_small_forward4_64f;
+            } else {
+                p->transform = &ffts_small_backward4_64f;
+            }
+            break;
+        case 8:
+            if (sign == -1) {
+                p->transform = &ffts_small_forward8_64f;
+            } else {
+                p->transform = &ffts_small_backward8_64f;
+            }
+            break;
+        case 16:
+        default:
+            if (sign == -1) {
+                p->transform = &ffts_small_forward16_64f;
+            } else {
+                p->transform = &ffts_small_backward16_64f;
+            }
+            break;
+        }
+    }
+
+    return p;
+
+cleanup:
+    ffts_free_1d(p);
+    return NULL;
+}
+#else
+FFTS_API ffts_plan_t*
+ffts_init_1d_64f(size_t N, int sign)
+{
+    (void) N; (void) sign; /* disabled: built without FFTS_DOUBLE */
+    return NULL;
+}
+#endif
\ No newline at end of file
diff --git a/lib/ffts/src/ffts_chirp_z.c b/lib/ffts/src/ffts_chirp_z.c
new file mode 100644
index 0000000..e463a55
--- /dev/null
+++ b/lib/ffts/src/ffts_chirp_z.c
@@ -0,0 +1,225 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts_chirp_z.h"
+
+#include "ffts_internal.h"
+#include "ffts_trig.h"
+
+/*
+*  For more information on algorithms:
+*
+*  L. I. Bluestein, A linear filtering approach to the computation of
+*  the discrete Fourier transform, 1968 NEREM Rec., pp. 218-219
+*
+*  Lawrence R. Rabiner, Ronald W. Schafer, Charles M. Rader,
+*  The Chirp z-Transform Algorithm and Its Application
+*  Bell Sys. Tech. J., vol. 48, pp. 1249-1292, May 1969.
+*
+*  Rick Lyons, Four Ways to Compute an Inverse FFT Using the Forward FFT Algorithm
+*  https://www.dsprelated.com/showarticle/800.php, July 7, 2015
+*/
+
+/* forward declarations */
+static void
+ffts_chirp_z_transform_f_32f(struct _ffts_plan_t *p, const void *in, void *out);
+
+static void
+ffts_chirp_z_transform_i_32f(struct _ffts_plan_t *p, const void *in, void *out);
+
+static void
+ffts_chirp_z_free(ffts_plan_t *p)
+{
+    /* destroy callback of a chirp-z plan: inner plan first, then buffers */
+    if (p->plans[0] != NULL)
+        ffts_free(p->plans[0]);
+
+    if (p->buf != NULL)
+        ffts_aligned_free(p->buf);
+
+    if (p->A != NULL)
+        ffts_aligned_free(p->A);
+
+    if (p->B != NULL)
+        ffts_aligned_free(p->B);
+    free(p); /* p->plans points just past *p, inside this same allocation */
+}
+
+ffts_plan_t*
+ffts_chirp_z_init(size_t N, int sign)
+{
+    float *A, *B, reciprocal_M, *tmp;
+    ffts_plan_t *p;
+    size_t i, M;
+
+    FFTS_ASSUME(N > 2); /* ffts_init_1d only forwards non power-of-two sizes */
+
+    p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans));
+    if (!p)
+        return NULL;
+
+    p->destroy = ffts_chirp_z_free;
+    p->N = N;
+    p->rank = 1;
+    p->plans = (ffts_plan_t**) &p[1]; /* one plan pointer stored right after *p */
+
+    if (sign < 0)
+        p->transform = ffts_chirp_z_transform_f_32f;
+    else
+        p->transform = ffts_chirp_z_transform_i_32f;
+
+    /* determine the next power of two such that M >= 2*N-1 */
+    M = ffts_next_power_of_2(2*N-1);
+    p->plans[0] = ffts_init_1d(M, FFTS_FORWARD);
+    if (!p->plans[0])
+        goto cleanup;
+
+    p->A = A = (float*) ffts_aligned_malloc(2 * N * sizeof(float));
+    if (!p->A)
+        goto cleanup;
+
+    p->B = B = (float*) ffts_aligned_malloc(2 * M * sizeof(float));
+    if (!p->B)
+        goto cleanup;
+
+    p->buf = tmp = (float*) ffts_aligned_malloc(2 * 2 * M * sizeof(float));
+    if (!p->buf)
+        goto cleanup; /* tmp is written below, so it must not be NULL */
+
+    /* generate the chirp into A, then scale with reciprocal of length */
+    ffts_generate_chirp_32f((ffts_cpx_32f*) A, N);
+    reciprocal_M = 1.0f / M;
+    tmp[0] = A[0] * reciprocal_M;
+    tmp[1] = A[1] * reciprocal_M;
+    for (i = 1; i < N; ++i) {
+        tmp[2 * i + 0] = tmp[2 * (M - i) + 0] = A[2 * i + 0] * reciprocal_M;
+        tmp[2 * i + 1] = tmp[2 * (M - i) + 1] = A[2 * i + 1] * reciprocal_M;
+    }
+    /* zero pad the gap between the two mirrored copies */
+    for (; i <= M - N; ++i)
+        tmp[2 * i] = tmp[2 * i + 1] = 0.0f;
+
+    /* FFT: B holds the transform of the scaled, mirrored chirp */
+    p->plans[0]->transform(p->plans[0], tmp, B);
+    return p;
+
+cleanup:
+    ffts_chirp_z_free(p);
+    return NULL;
+}
+
+static void
+ffts_chirp_z_transform_f_32f(struct _ffts_plan_t *p, const void *in, void *out)
+{
+    const float *A = FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *B = FFTS_ASSUME_ALIGNED_32(p->B);
+    size_t i, M = p->plans[0]->N, N = p->N;
+    float *t1 = (float*) FFTS_ASSUME_ALIGNED_32(p->buf); /* scratch owned by the plan */
+    float *t2 = FFTS_ASSUME_ALIGNED_32(&t1[2 * M]); /* second M complex values of buf */
+    const float *din = (const float*) in;
+    float *dout = (float*) out;
+
+    /* we know this: M is a power of two >= 2*N-1 and N > 2 */
+    FFTS_ASSUME(M >= 8);
+
+    /* multiply input with conjugated sequence: t1 = din * conj(A) */
+    for (i = 0; i < N; ++i) {
+        t1[2 * i + 0] = din[2 * i + 0] * A[2 * i + 0] + din[2 * i + 1] * A[2 * i + 1];
+        t1[2 * i + 1] = din[2 * i + 1] * A[2 * i + 0] - din[2 * i + 0] * A[2 * i + 1];
+    }
+
+    /* zero pad up to the inner FFT length M */
+    for (; i < M; ++i)
+        t1[2 * i] = t1[2 * i + 1] = 0.0f;
+
+    /* convolution using FFT */
+    p->plans[0]->transform(p->plans[0], t1, t2);
+
+    /* complex multiply t2 * B, stored with real and imaginary parts swapped */
+    for (i = 0; i < M; ++i) {
+        t1[2 * i + 0] = t2[2 * i + 1] * B[2 * i + 0] + t2[2 * i + 0] * B[2 * i + 1];
+        t1[2 * i + 1] = t2[2 * i + 0] * B[2 * i + 0] - t2[2 * i + 1] * B[2 * i + 1];
+    }
+
+    /* IFFT using FFT with real and imaginary parts swapped */
+    p->plans[0]->transform(p->plans[0], t1, t2);
+
+    /* swap the parts back and multiply output with conjugated sequence */
+    for (i = 0; i < N; ++i) {
+        dout[2 * i + 0] = t2[2 * i + 1] * A[2 * i + 0] + t2[2 * i + 0] * A[2 * i + 1];
+        dout[2 * i + 1] = t2[2 * i + 0] * A[2 * i + 0] - t2[2 * i + 1] * A[2 * i + 1];
+    }
+}
+
+/* IFFT using FFT with real and imaginary parts swapped */
+static void
+ffts_chirp_z_transform_i_32f(struct _ffts_plan_t *p, const void *in, void *out)
+{
+    const float *A = FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *B = FFTS_ASSUME_ALIGNED_32(p->B);
+    size_t i, M = p->plans[0]->N, N = p->N;
+    float *t1 = (float*) FFTS_ASSUME_ALIGNED_32(p->buf); /* scratch owned by the plan */
+    float *t2 = FFTS_ASSUME_ALIGNED_32(&t1[2 * M]); /* second M complex values of buf */
+    const float *din = (const float*) in;
+    float *dout = (float*) out;
+
+    /* we know this: M is a power of two >= 2*N-1 and N > 2 */
+    FFTS_ASSUME(M >= 8);
+
+    /* store din * A with parts swapped, which equals swap(din) * conj(A) */
+    for (i = 0; i < N; ++i) {
+        t1[2 * i + 0] = din[2 * i + 1] * A[2 * i + 0] + din[2 * i + 0] * A[2 * i + 1];
+        t1[2 * i + 1] = din[2 * i + 0] * A[2 * i + 0] - din[2 * i + 1] * A[2 * i + 1];
+    }
+
+    /* zero pad up to the inner FFT length M */
+    for (; i < M; ++i)
+        t1[2 * i] = t1[2 * i + 1] = 0.0f;
+
+    /* convolution using FFT */
+    p->plans[0]->transform(p->plans[0], t1, t2);
+
+    /* complex multiply t2 * B, stored with real and imaginary parts swapped */
+    for (i = 0; i < M; ++i) {
+        t1[2 * i + 0] = t2[2 * i + 1] * B[2 * i + 0] + t2[2 * i + 0] * B[2 * i + 1];
+        t1[2 * i + 1] = t2[2 * i + 0] * B[2 * i + 0] - t2[2 * i + 1] * B[2 * i + 1];
+    }
+
+    /* IFFT using FFT with real and imaginary parts swapped */
+    p->plans[0]->transform(p->plans[0], t1, t2);
+
+    /* multiply the still-swapped result with the non-conjugated sequence */
+    for (i = 0; i < N; ++i) {
+        dout[2 * i + 0] = t2[2 * i + 0] * A[2 * i + 0] - t2[2 * i + 1] * A[2 * i + 1];
+        dout[2 * i + 1] = t2[2 * i + 1] * A[2 * i + 0] + t2[2 * i + 0] * A[2 * i + 1];
+    }
+}
diff --git a/lib/ffts/src/ffts_chirp_z.h b/lib/ffts/src/ffts_chirp_z.h
new file mode 100644
index 0000000..2a6aa7b
--- /dev/null
+++ b/lib/ffts/src/ffts_chirp_z.h
@@ -0,0 +1,45 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_CHIRP_Z_H
+#define FFTS_CHIRP_Z_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include "ffts.h"
+
+ffts_plan_t* /* NULL on failure */
+ffts_chirp_z_init(size_t N, int sign); /* sign < 0 selects the forward transform */
+
+#endif /* FFTS_CHIRP_Z_H */
diff --git a/lib/ffts/src/ffts_cpu.c b/lib/ffts/src/ffts_cpu.c
new file mode 100644
index 0000000..daf92c8
--- /dev/null
+++ b/lib/ffts/src/ffts_cpu.c
@@ -0,0 +1,371 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts_cpu.h"
+
+#if defined(FFTS_BUILDING_CPU_TEST)
+#include <stdio.h>
+#endif
+
+#if defined(_WIN32)
+#include <intrin.h>
+#include <windows.h>
+#endif
+
+/* TODO: add detection/declaration of these to CMake phase */
+#if !defined(FFTS_CPU_X64)
+#if defined(_M_AMD64) || defined(__amd64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64) || defined(__x86_64__)
+/* 64 bit x86 detected */
+#define FFTS_CPU_X64
+#endif
+#endif
+
+#if !defined(FFTS_CPU_X64) && !defined(FFTS_CPU_X86)
+#if defined(i386) || defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__X86__) || defined(_X86_)
+/* 32 bit x86 detected */
+#define FFTS_CPU_X86
+#endif
+#endif
+
+/* check if build is 32 bit or 64 bit x86 */
+#if defined(FFTS_CPU_X64) || defined(FFTS_CPU_X86)
+
+/* Built and tested on
+CentOS 6.8 2.6.32-642.11.1.el6.x86_64 - gcc version 4.4.7 20120313
+Mac OSX 10.9 - Apple Clang 6.0
+Ubuntu 14.04 LTS 4.2.0-42 x86_64 - gcc version 4.8.4
+Windows XP SP3 - Visual Studio 2005 SP1 x86/x64
+Windows Vista SP2 - Visual Studio 2010 SP1 x86/x64
+Windows 7 Ultimate SP1 - Visual Studio 2015 x86/x64
+Windows 7 Ultimate SP1 - gcc version 4.9.2 (i686-posix-dwarf-rev1)
+Windows 7 Ultimate SP1 - gcc version 4.9.2 (x86_64-posix-seh-rev3)
+Windows 10 Pro - Visual Studio 2017 x86/x64
+*/
+
+/* Visual Studio 2010 SP1 or newer have _xgetbv intrinsic */
+#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219)
+#define FFTS_HAVE_XGETBV
+#endif
+
+#ifndef BIT
+#define BIT(n) (1u << (n))
+#endif
+
+/* cpuid feature bit masks; the XCR0 mask is the SSE (1) and AVX (2) state bits */
+#define FFTS_CPU_X86_SSE_BITS    (BIT(0) | BIT(15) | BIT(23) | BIT(24) | BIT(25))
+#define FFTS_CPU_X86_SSE2_BITS   (BIT(26))
+#define FFTS_CPU_X86_SSE3_BITS   (BIT(0))
+#define FFTS_CPU_X86_SSSE3_BITS  (BIT(9))
+#define FFTS_CPU_X86_SSE4_1_BITS (BIT(19))
+#define FFTS_CPU_X86_SSE4_2_BITS (BIT(20) | BIT(23))
+#define FFTS_CPU_X86_AVX_BITS    (BIT(26) | BIT(27) | BIT(28))
+#define FFTS_CPU_X86_XCR0_BITS   (BIT(1) | BIT(2))
+#define FFTS_CPU_X86_AVX2_BITS   (BIT(5))
+#define FFTS_CPU_X86_AVX512_BITS (BIT(16))
+
+/* Visual Studio 2008 or older: used by ffts_cpuid in place of __cpuidex */
+#if defined(FFTS_CPU_X64) && defined(_MSC_VER) && _MSC_VER <= 1500
+#pragma optimize("", off)
+static void __fastcall ffts_cpuidex(int subleaf, int regs[4], int leaf)
+{
+    /* x64 uses a four register fast-call calling convention by default and
+       arguments are passed in registers RCX, RDX, R8, and R9. By disabling
+       optimization and passing subleaf as the first argument (RCX), the
+       intent is that __cpuid runs with the subleaf in ECX, like __cpuidex */
+    (void) subleaf;
+    __cpuid(regs, leaf);
+}
+#pragma optimize("", on)
+#endif
+
+static FFTS_INLINE void ffts_cpuid(int regs[4], int leaf, int subleaf)
+{ /* run cpuid with eax=leaf, ecx=subleaf; regs receives eax, ebx, ecx, edx */
+#if defined(_MSC_VER)
+#if defined(FFTS_CPU_X64)
+    /* Visual Studio 2010 or newer */
+#if _MSC_VER > 1500
+    __cpuidex(regs, leaf, subleaf);
+#else
+    ffts_cpuidex(subleaf, regs, leaf);
+#endif
+#else
+    __asm {
+        mov eax, leaf
+        mov ecx, subleaf
+        mov esi, regs
+        cpuid
+        mov [esi + 0x0], eax
+        mov [esi + 0x4], ebx
+        mov [esi + 0x8], ecx
+        mov [esi + 0xc], edx
+    }
+#endif
+#elif defined(__GNUC__) && __GNUC__
+#if defined(FFTS_CPU_X64)
+    __asm__ __volatile__(
+        "cpuid\n\t"
+        : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
+        : "a"(leaf), "c"(subleaf));
+#elif defined(__PIC__) /* 32-bit PIC: ebx is swapped out and restored around cpuid */
+    __asm__ __volatile__(
+        "xchgl %%ebx, %1\n\t"
+        "cpuid          \n\t"
+        "xchgl %%ebx, %1\n\t"
+        : "=a"(regs[0]), "=r"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
+        : "a"(leaf), "c"(subleaf));
+#else
+    __asm__ __volatile__(
+        "cpuid\n\t"
+        : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
+        : "a"(leaf), "c"(subleaf));
+#endif
+#else
+    /* unknown compiler for x86: report every register as zero */
+    regs[0] = regs[1] = regs[2] = regs[3] = 0;
+#endif
+}
+
+/* at least Visual Studio 2010 generates invalid optimized code for _xgetbv */
+#if defined(FFTS_HAVE_XGETBV)
+#pragma optimize("", off)
+#endif
+static FFTS_INLINE unsigned int ffts_get_xcr0(void)
+{ /* returns the low 32 bits of XCR0, or 0 when it cannot be read */
+#if defined(FFTS_HAVE_XGETBV)
+    return (unsigned int) _xgetbv(0);
+#elif defined(_MSC_VER)
+#if defined(FFTS_CPU_X64)
+    /* emulate xgetbv(0) on Windows 7 SP1 or newer */
+    typedef DWORD64 (WINAPI *PGETENABLEDXSTATEFEATURES)(VOID);
+    PGETENABLEDXSTATEFEATURES pfnGetEnabledXStateFeatures = 
+        (PGETENABLEDXSTATEFEATURES) GetProcAddress(
+        GetModuleHandle(TEXT("kernel32.dll")), "GetEnabledXStateFeatures");
+    return pfnGetEnabledXStateFeatures ? (unsigned int) pfnGetEnabledXStateFeatures() : 0;
+#else
+    /* note that we have to touch edx register to tell compiler it's used by emitted xgetbv */
+    unsigned __int32 hi, lo;
+    __asm {
+        xor ecx, ecx
+        _emit 0x0f
+        _emit 0x01
+        _emit 0xd0
+        mov lo, eax
+        mov hi, edx
+    }
+    return (unsigned int) lo;
+#endif
+#elif defined(__GNUC__) && __GNUC__
+    unsigned int lo;
+    __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0\n"
+        : "=a"(lo)
+        : "c"(0)
+        : "edx");
+    return lo;
+#else
+    /* unknown x86 compiler */
+    return 0;
+#endif
+}
+#if defined(FFTS_HAVE_XGETBV)
+#pragma optimize("", on)
+#endif
+
+int
+ffts_cpu_detect(int *extra_flags)
+{
+    static int cpu_flags = -1; /* negative until the first call has run detection */
+    static int cpu_extra_flags = -1;
+    int max_basic_func;
+    int regs[4];
+    unsigned int xcr0;
+
+    if (cpu_flags >= 0) { /* already detected: return the cached result */
+        goto exit;
+    }
+
+    /* initialize: start with no features; each failed test below jumps to exit */
+    cpu_flags = cpu_extra_flags = 0;
+
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("cpuid check: ");
+#endif
+#if defined(FFTS_CPU_X64)
+    /* cpuid is always supported on x64 */
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("skipped\n");
+#endif
+#else
+#if defined(_MSC_VER)
+    _asm {
+        pushfd
+        pop eax
+        mov ebx,eax
+        xor eax,200000h
+        push eax
+        popfd
+        pushfd
+        pop eax
+        push ebx
+        popfd
+        mov regs[0 * TYPE regs],eax
+        mov regs[1 * TYPE regs],ebx
+    }
+#else
+    __asm__ (
+        "pushfl\n\t"
+        "pop %0\n\t"
+        "movl %0,%1\n\t"
+        "xorl $0x200000,%0\n\t"
+        "pushl %0\n\t"
+        "popfl\n\t"
+        "pushfl\n\t"
+        "popl %0\n\t"
+        "pushl %1\n\t"
+        "popfl\n\t"
+        : "=r" (regs[0]), "=r" (regs[1])
+    );
+#endif
+    /* check CPUID bit (bit 21) in EFLAGS register can be toggled */
+    if (((regs[0] ^ regs[1]) & 0x200000) == 0) {
+#if defined(FFTS_BUILDING_CPU_TEST)
+        printf("not supported\n");
+#endif
+        goto exit;
+    }
+#if defined(FFTS_BUILDING_CPU_TEST)
+        printf("supported\n");
+#endif
+#endif
+
+    /* get the number of basic functions */
+    ffts_cpuid(regs, 0, 0);
+    max_basic_func = regs[0];
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("cpuid eax=0, ecx=0: %d\n", max_basic_func);
+#endif
+    if (max_basic_func == 0)
+        goto exit;
+
+    /* get feature flags */
+    ffts_cpuid(regs, 1, 0);
+
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("cpuid eax=1, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]);
+#endif
+
+#if defined(FFTS_CPU_X64)
+    /* minimum for any x64 */
+    cpu_flags = FFTS_CPU_X86_SSE | FFTS_CPU_X86_SSE2;
+#else
+    /* test if SSE is supported */
+    if ((regs[3] & FFTS_CPU_X86_SSE_BITS) != FFTS_CPU_X86_SSE_BITS)
+        goto exit;
+    cpu_flags = FFTS_CPU_X86_SSE;
+
+    /* test if SSE2 is supported */
+    if (!(regs[3] & FFTS_CPU_X86_SSE2_BITS))
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSE2;
+#endif
+
+    /* test if SSE3 is supported */
+    if (!(regs[2] & FFTS_CPU_X86_SSE3_BITS))
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSE3;
+
+    /* test if SSSE3 is supported */
+    if (!(regs[2] & FFTS_CPU_X86_SSSE3_BITS))
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSSE3;
+
+    /* test if SSE4.1 is supported */
+    if (!(regs[2] & FFTS_CPU_X86_SSE4_1_BITS))
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSE4_1;
+
+    /* test if SSE4.2 is supported */
+    if ((regs[2] & FFTS_CPU_X86_SSE4_2_BITS) != FFTS_CPU_X86_SSE4_2_BITS)
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSE4_2;
+
+    /* test if AVX is supported */
+    if ((regs[2] & FFTS_CPU_X86_AVX_BITS) != FFTS_CPU_X86_AVX_BITS)
+        goto exit;
+
+    /* test if 128-bit SSE and 256-bit AVX states (bits 1 and 2) are enabled in XCR0 */
+    xcr0 = ffts_get_xcr0();
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("xcr0: %u\n", xcr0);
+#endif
+    if ((xcr0 & 0x6) != 0x6)
+        goto exit;
+
+    cpu_flags |= FFTS_CPU_X86_AVX;
+
+    /* check that cpuid extended features exist */
+    if (max_basic_func < 7)
+        goto exit;
+
+    /* get extended features */
+    ffts_cpuid(regs, 7, 0);
+
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("cpuid eax=7, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]);
+#endif
+
+    /* test if AVX2 is supported */
+    if ((regs[1] & FFTS_CPU_X86_AVX2_BITS) != FFTS_CPU_X86_AVX2_BITS)
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_AVX2;
+
+    /* test if AVX512 is supported */
+    if ((regs[1] & FFTS_CPU_X86_AVX512_BITS) != FFTS_CPU_X86_AVX512_BITS)
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_AVX512;
+
+exit:
+    if (extra_flags) { /* extra_flags is optional and may be NULL */
+        *extra_flags = cpu_extra_flags;
+    }
+    return cpu_flags;
+}
+#else 
+int
+ffts_cpu_detect(int *extra_flags)
+{
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("CPU detection not implemented!!\n");
+#endif
+    if (extra_flags) *extra_flags = 0; /* not implemented: report no flags */
+    return 0;
+}
+#endif
\ No newline at end of file
diff --git a/lib/ffts/src/ffts_cpu.h b/lib/ffts/src/ffts_cpu.h
new file mode 100644
index 0000000..37d77e4
--- /dev/null
+++ b/lib/ffts/src/ffts_cpu.h
@@ -0,0 +1,54 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_CPU_H
+#define FFTS_CPU_H
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include "ffts_internal.h"
+
+#define FFTS_CPU_X86_SSE    0x001
+#define FFTS_CPU_X86_SSE2   0x002
+#define FFTS_CPU_X86_SSE3   0x004
+#define FFTS_CPU_X86_SSSE3  0x008
+#define FFTS_CPU_X86_SSE4_1 0x010
+#define FFTS_CPU_X86_SSE4_2 0x020
+#define FFTS_CPU_X86_AVX    0x040
+#define FFTS_CPU_X86_AVX2   0x080
+#define FFTS_CPU_X86_AVX512 0x100
+/* Returns an OR of the FFTS_CPU_X86_* bits above (0 if detection is not implemented); extra_flags may be NULL. */
+int
+ffts_cpu_detect(int *extra_flags);
+
+#endif /* FFTS_CPU_H */
diff --git a/lib/ffts/src/ffts_internal.h b/lib/ffts/src/ffts_internal.h
index 157c283..04ebb9c 100644
--- a/lib/ffts/src/ffts_internal.h
+++ b/lib/ffts/src/ffts_internal.h
@@ -2,6 +2,7 @@
 
 This file is part of FFTS -- The Fastest Fourier Transform in the South
 
+Copyright (c) 2015-2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
 Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
 Copyright (c) 2012, The University of Waikato
 
@@ -34,7 +35,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef FFTS_INTERNAL_H
 #define FFTS_INTERNAL_H
 
+#ifdef AUTOTOOLS_BUILD
 #include "config.h"
+#endif
+
 #include "ffts_attributes.h"
 #include "types.h"
 
@@ -42,18 +46,59 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <malloc.h>
 #endif
 
+#ifdef HAVE_MM_ALLOC_H
+#include <mm_malloc.h>
+#ifndef HAVE__MM_MALLOC
+#define HAVE__MM_MALLOC
+#endif
+#endif
+
 #include <stddef.h>
 
-#ifdef HAVE_STDINT_H
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>
+#elif HAVE_STDINT_H
 #include <stdint.h>
+#elif _MSC_VER
+typedef __int32 int32_t;
+typedef __int64 int64_t;
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
+#else
+typedef signed int int32_t;               /* int, not long: long is 64 bits wide on LP64 targets */
+typedef unsigned int uint32_t;            /* same reasoning as int32_t */
+typedef signed long long int int64_t;     /* last-resort fallback when no <inttypes.h>/<stdint.h> and not MSVC */
+typedef unsigned long long int uint64_t;
 #endif
 
 #ifdef HAVE_STDLIB_H
 #include <stdlib.h>
 #endif
 
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
 #include <stdio.h>
 
+#if defined(HAVE_DECL_MEMALIGN) && !HAVE_DECL_MEMALIGN
+extern void *memalign(size_t, size_t);
+#endif
+
+#if defined(HAVE_DECL_POSIX_MEMALIGN) && !HAVE_DECL_POSIX_MEMALIGN
+extern int posix_memalign(void **, size_t, size_t);
+#endif
+
+#if defined(HAVE_DECL_VALLOC) && !HAVE_DECL_VALLOC
+extern void *valloc(size_t);
+#endif
+
+#ifdef _mm_malloc
+#ifndef HAVE__MM_MALLOC
+#define HAVE__MM_MALLOC
+#endif
+#endif
+
 #ifdef ENABLE_LOG
 #ifdef __ANDROID__
 #include <android/log.h>
@@ -142,11 +187,9 @@ struct _ffts_plan_t {
      */
     size_t transform_size;
 
-    /**
-     * Points to the cosnant variables used by
-     * the Assembly Code
-     */
-    void *constants;
+    /* pointer to the constant variable used by SSE for sign change */
+    /* TODO: #ifdef HAVE_SSE */
+    const void *constants;
 
     // multi-dimensional stuff:
     struct _ffts_plan_t **plans;
@@ -171,44 +214,96 @@ struct _ffts_plan_t {
     size_t i2;
 };
 
-static FFTS_INLINE void *ffts_aligned_malloc(size_t size)
+static FFTS_INLINE void*
+ffts_aligned_malloc(size_t size)
 {
-#if defined(_WIN32)
-    return _aligned_malloc(size, 32);
+    void *p = NULL;
+    /* Allocates size bytes; the result must be released with ffts_aligned_free(), whose branch order mirrors this one. */
+    /* various ways to allocate aligned memory in order of preference */
+#if defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
+    p = (void*) _mm_malloc(size, 32); /* 32-byte alignment requested */
+#elif defined(HAVE_POSIX_MEMALIGN)
+    if (posix_memalign(&p, 32, size)) /* non-zero return signals failure */
+        p = NULL; /* do not rely on p being left untouched after a failed call */
+#elif defined(HAVE_MEMALIGN)
+    p = memalign(32, size); /* note the argument order: alignment first, then size */
+#elif defined(__ALTIVEC__)
+    p = vec_malloc(size); /* NOTE(review): alignment is whatever vec_malloc provides; presumably 16 bytes - confirm */
+#elif defined(_MSC_VER) || defined(WIN32)
+    p = _aligned_malloc(size, 32); /* 32-byte alignment requested */
+#elif defined(HAVE_VALLOC)
+    p = valloc(size); /* no explicit alignment argument; presumably page-aligned - confirm */
 #else
-    return valloc(size);
+    p = malloc(size); /* last resort: no alignment requested beyond what malloc guarantees */
 #endif
+
+    return p; /* NULL if the posix_memalign branch failed; other branches return the allocator's result unchanged */
 }
 
-static FFTS_INLINE void ffts_aligned_free(void *p)
+static FFTS_INLINE
+void ffts_aligned_free(void *p)
 {
-#if defined(_WIN32)
+    /* order must match with ffts_aligned_malloc */
+#if defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
+    _mm_free(p);
+#elif defined(HAVE_POSIX_MEMALIGN) || defined(HAVE_MEMALIGN)
+    free(p);
+#elif defined(__ALTIVEC__)
+    vec_free(p);
+#elif defined(_MSC_VER) || defined(WIN32)
     _aligned_free(p);
 #else
+    /* valloc or malloc */
     free(p);
 #endif
 }
 
 #if GCC_VERSION_AT_LEAST(3,3)
 #define ffts_ctzl __builtin_ctzl
+
+static FFTS_INLINE size_t
+ffts_next_power_of_2(size_t N)
+{   /* smallest power of two strictly greater than N; N must be non-zero because __builtin_clzl(0) is undefined */
+    return (size_t) 1 << (sizeof(unsigned long) * 8 - __builtin_clzl(N)); /* bit width of unsigned long, not a hard-coded 32 */
+}
 #elif defined(_MSC_VER)
 #include <intrin.h>
 #ifdef _M_X64
 #pragma intrinsic(_BitScanForward64)
-static __inline unsigned long ffts_ctzl(size_t N)
+static FFTS_INLINE unsigned long
+ffts_ctzl(size_t N)
 {
     unsigned long count;
     _BitScanForward64((unsigned long*) &count, N);
     return count;
 }
+
+#pragma intrinsic(_BitScanReverse64)
+static FFTS_INLINE size_t
+ffts_next_power_of_2(size_t N)
+{
+    unsigned long log_2;
+    _BitScanReverse64((unsigned long*)&log_2, N);
+    return 1ULL << (log_2 + 1);
+}
 #else
 #pragma intrinsic(_BitScanForward)
-static __inline unsigned long ffts_ctzl(size_t N)
+static FFTS_INLINE unsigned long
+ffts_ctzl(size_t N)
 {
     unsigned long count;
     _BitScanForward((unsigned long*) &count, N);
     return count;
 }
+
+#pragma intrinsic(_BitScanReverse)
+static FFTS_INLINE size_t
+ffts_next_power_of_2(size_t N)
+{
+    unsigned long log_2;
+    _BitScanReverse((unsigned long*)&log_2, N);
+    return 1 << (log_2 + 1);
+}
 #endif /* _WIN64 */
 #endif /* _MSC_VER */
 
diff --git a/lib/ffts/src/ffts_real.c b/lib/ffts/src/ffts_real.c
index 0f87a12..e0f0e1f 100644
--- a/lib/ffts/src/ffts_real.c
+++ b/lib/ffts/src/ffts_real.c
@@ -4,7 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
 
 Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
 Copyright (c) 2012, The University of Waikato
-Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+Copyright (c) 2015 - 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
 
 All rights reserved.
 
@@ -33,6 +33,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
 #include "ffts_real.h"
+#include "ffts_cpu.h"
 #include "ffts_internal.h"
 #include "ffts_trig.h"
 
@@ -46,7 +47,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <pmmintrin.h>
 #elif HAVE_INTRIN_H
 #include <intrin.h>
-#else
+#endif
+
 /* avoid using negative zero as some configurations have problems with those */
 static const FFTS_ALIGN(16) unsigned int sign_mask_even[4] = {
     0x80000000, 0, 0x80000000, 0
@@ -55,7 +57,6 @@ static const FFTS_ALIGN(16) unsigned int sign_mask_odd[4] = {
     0, 0x80000000, 0, 0x80000000
 };
 #endif
-#endif
 
 static void
 ffts_free_1d_real(ffts_plan_t *p)
@@ -79,8 +80,9 @@ ffts_free_1d_real(ffts_plan_t *p)
     free(p);
 }
 
+#ifdef __ARM_NEON__
 static void
-ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
+ffts_execute_1d_real_neon(ffts_plan_t *p, const void *input, void *output)
 {
     float *const FFTS_RESTRICT out =
         (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
@@ -91,25 +93,19 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
     const float *const FFTS_RESTRICT B =
         (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
     const int N = (const int) p->N;
-    int i;
-
-#ifdef __ARM_NEON__
     float *p_buf0 = buf;
     float *p_buf1 = buf + N - 2;
     float *p_out = out;
-#endif
+    int i;
 
     /* we know this */
     FFTS_ASSUME(N/2 > 0);
 
     p->plans[0]->transform(p->plans[0], input, buf);
 
-#ifndef HAVE_SSE
     buf[N + 0] = buf[0];
     buf[N + 1] = buf[1];
-#endif
 
-#ifdef __ARM_NEON__
     for (i = 0; i < N; i += 4) {
         __asm__ __volatile__ (
             "vld1.32 {q8},  [%[pa]]!\n\t"
@@ -151,7 +147,35 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
             : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
         );
     }
-#elif HAVE_SSE3
+
+    out[N + 0] = buf[0] - buf[1];
+    out[N + 1] = 0.0f;
+}
+#endif
+
+#if HAVE_SSE3
+static void
+ffts_execute_1d_real_sse3(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT out =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);
+
+    p->plans[0]->transform(p->plans[0], input, buf);
+
+    buf[N + 0] = buf[0];
+    buf[N + 1] = buf[1];
+
     if (FFTS_UNLIKELY(N <= 8)) {
         __m128 t0 = _mm_load_ps(buf);
         __m128 t1 = _mm_load_ps(buf + N - 4);
@@ -235,7 +259,32 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
                 _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
         }
     }
-#elif HAVE_SSE
+
+    out[N + 0] = buf[0] - buf[1];
+    out[N + 1] = 0.0f;
+}
+#endif
+
+#ifdef HAVE_SSE
+static void
+ffts_execute_1d_real_sse(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT out =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);
+
+    p->plans[0]->transform(p->plans[0], input, buf);
+
     if (FFTS_UNLIKELY(N <= 8)) {
         __m128 c0 = _mm_load_ps((const float*) sign_mask_even);
         __m128 t0 = _mm_load_ps(buf);
@@ -327,7 +376,34 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
                 _MM_SHUFFLE(2,3,0,1)))));
         }
     }
-#else
+
+    out[N + 0] = buf[0] - buf[1];
+    out[N + 1] = 0.0f;
+}
+#endif
+
+static void
+ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT out =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);
+
+    p->plans[0]->transform(p->plans[0], input, buf);
+
+    buf[N + 0] = buf[0];
+    buf[N + 1] = buf[1];
+
     for (i = 0; i < N/2; i++) {
         out[2*i + 0] =
             buf[    2*i + 0] * A[2*i + 0] - buf[    2*i + 1] * A[2*i + 1] +
@@ -336,14 +412,14 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
             buf[    2*i + 1] * A[2*i + 0] + buf[    2*i + 0] * A[2*i + 1] +
             buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0];
     }
-#endif
 
     out[N + 0] = buf[0] - buf[1];
     out[N + 1] = 0.0f;
 }
 
+#ifdef __ARM_NEON__
 static void
-ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
+ffts_execute_1d_real_inv_neon(ffts_plan_t *p, const void *input, void *output)
 {
     float *const FFTS_RESTRICT in =
         (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
@@ -354,18 +430,14 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
     const float *const FFTS_RESTRICT B =
         (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
     const int N = (const int) p->N;
-    int i;
-
-#ifdef __ARM_NEON__
     float *p_buf0 = in;
     float *p_buf1 = in + N - 2;
     float *p_out = buf;
-#endif
+    int i;
 
     /* we know this */
     FFTS_ASSUME(N/2 > 0);
 
-#ifdef __ARM_NEON__
     for (i = 0; i < N/2; i += 2) {
         __asm__ __volatile__ (
             "vld1.32 {q8},  [%[pa]]!\n\t"
@@ -407,7 +479,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
             : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
         );
     }
-#elif HAVE_SSE3
+
+    p->plans[0]->transform(p->plans[0], buf, output);
+}
+#endif
+
+#if HAVE_SSE3
+static void
+ffts_execute_1d_real_inv_sse3(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT in =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);
+
     if (FFTS_UNLIKELY(N <= 8)) {
         __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
         __m128 t1 = _mm_load_ps(in);
@@ -492,7 +586,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
                 _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
         }
     }
-#elif HAVE_SSE
+
+    p->plans[0]->transform(p->plans[0], buf, output);
+}
+#endif
+
+#if HAVE_SSE
+static void
+ffts_execute_1d_real_inv_sse(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT in =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);
+
     if (FFTS_UNLIKELY(N <= 8)) {
         __m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
         __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
@@ -585,7 +701,28 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
                 _mm_xor_ps(t4, c0))));
         }
     }
-#else
+
+    p->plans[0]->transform(p->plans[0], buf, output);
+}
+#endif
+
+static void
+ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT in =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);
+
     for (i = 0; i < N/2; i++) {
         buf[2*i + 0] =
             in[    2*i + 0] * A[2*i + 0] + in[    2*i + 1] * A[2*i + 1] +
@@ -594,7 +731,6 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
             in[    2*i + 1] * A[2*i + 0] - in[    2*i + 0] * A[2*i + 1] -
             in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0];
     }
-#endif
 
     p->plans[0]->transform(p->plans[0], buf, output);
 }
@@ -602,18 +738,35 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
 FFTS_API ffts_plan_t*
 ffts_init_1d_real(size_t N, int sign)
 {
+#ifndef __ARM_NEON__
+    int cpu_flags = ffts_cpu_detect(NULL);
+#endif
     ffts_plan_t *p;
+    int invert = 0;
 
     p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans));
     if (!p) {
         return NULL;
     }
 
-    if (sign < 0) {
-        p->transform = &ffts_execute_1d_real;
-    } else {
-        p->transform = &ffts_execute_1d_real_inv;
+#ifdef __ARM_NEON__
+    p->transform = (sign < 0) ? &ffts_execute_1d_real_neon : &ffts_execute_1d_real_inv_neon; /* NEON kernels for both directions */
+#else
+#ifdef HAVE_SSE3
+    if (cpu_flags & FFTS_CPU_X86_SSE3) {
+        p->transform = (sign < 0) ? &ffts_execute_1d_real_sse3 : &ffts_execute_1d_real_inv_sse3;
+        invert = 1;
+    } else
+#endif
+#ifdef HAVE_SSE
+    if (cpu_flags & FFTS_CPU_X86_SSE) {
+        p->transform = (sign < 0) ? &ffts_execute_1d_real_sse : &ffts_execute_1d_real_inv_sse;
+    } else
+#endif
+    {
+        p->transform = (sign < 0) ? &ffts_execute_1d_real : &ffts_execute_1d_real_inv;
     }
+#endif
 
     p->destroy = &ffts_free_1d_real;
     p->N       = N;
@@ -640,12 +793,7 @@ ffts_init_1d_real(size_t N, int sign)
         goto cleanup;
     }
 
-#ifdef HAVE_SSE3
-    ffts_generate_table_1d_real_32f(p, sign, 1);
-#else
-    ffts_generate_table_1d_real_32f(p, sign, 0);
-#endif
-
+    ffts_generate_table_1d_real_32f(p, sign, invert);
     return p;
 
 cleanup:
diff --git a/lib/ffts/src/ffts_static.c b/lib/ffts/src/ffts_static.c
index 09de6d7..87d8b23 100644
--- a/lib/ffts/src/ffts_static.c
+++ b/lib/ffts/src/ffts_static.c
@@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
 
 Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
 Copyright (c) 2012, The University of Waikato
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
 
 All rights reserved.
 
@@ -258,6 +259,29 @@ static const FFTS_ALIGN(16) double ffts_constants_inv_64f[16] = {
     -0.7071067811865475244008443621048490392848359376884740
 };
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_K_0(int inv,
+         V4DF *r0,
+         V4DF *r1,
+         V4DF *r2,
+         V4DF *r3)
+{
+    V4DF t0, t1, t2, t3;
+
+    t0 = *r0;
+    t1 = *r1;
+
+    t2 = V4DF_ADD(*r2, *r3);
+    t3 = V4DF_IMULI(inv, V4DF_SUB(*r2, *r3));
+
+    *r0 = V4DF_ADD(t0, t2);
+    *r2 = V4DF_SUB(t0, t2);
+    *r1 = V4DF_SUB(t1, t3);
+    *r3 = V4DF_ADD(t1, t3);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_K_0(int inv,
          V4SF *r0,
@@ -279,6 +303,31 @@ V4SF_K_0(int inv,
     *r3 = V4SF_ADD(t1, t3);
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_L_2(const double *FFTS_RESTRICT i0,
+         const double *FFTS_RESTRICT i1,
+         const double *FFTS_RESTRICT i2,
+         const double *FFTS_RESTRICT i3,
+         V4DF *r0,
+         V4DF *r1,
+         V4DF *r2,
+         V4DF *r3)
+{
+    V4DF t0, t1, t2, t3;
+
+    t0 = V4DF_LD(i0);
+    t1 = V4DF_LD(i1);
+    t2 = V4DF_LD(i2);
+    t3 = V4DF_LD(i3);
+
+    *r0 = V4DF_ADD(t0, t1);
+    *r1 = V4DF_SUB(t0, t1);
+    *r2 = V4DF_ADD(t2, t3);
+    *r3 = V4DF_SUB(t2, t3);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_L_2(const float *FFTS_RESTRICT i0,
          const float *FFTS_RESTRICT i1,
@@ -302,6 +351,37 @@ V4SF_L_2(const float *FFTS_RESTRICT i0,
     *r3 = V4SF_SUB(t2, t3);
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_L_4(int inv,
+         const double *FFTS_RESTRICT i0,
+         const double *FFTS_RESTRICT i1,
+         const double *FFTS_RESTRICT i2,
+         const double *FFTS_RESTRICT i3,
+         V4DF *r0,
+         V4DF *r1,
+         V4DF *r2,
+         V4DF *r3)
+{
+    V4DF t0, t1, t2, t3, t4, t5, t6, t7;
+
+    t0 = V4DF_LD(i0);
+    t1 = V4DF_LD(i1);
+    t2 = V4DF_LD(i2);
+    t3 = V4DF_LD(i3);
+
+    t4 = V4DF_ADD(t0, t1);
+    t5 = V4DF_SUB(t0, t1);
+    t6 = V4DF_ADD(t2, t3);
+    t7 = V4DF_IMULI(inv, V4DF_SUB(t2, t3));
+
+    *r0 = V4DF_ADD(t4, t6);
+    *r2 = V4DF_SUB(t4, t6);
+    *r1 = V4DF_SUB(t5, t7);
+    *r3 = V4DF_ADD(t5, t7);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_L_4(int inv,
          const float *FFTS_RESTRICT i0,
@@ -331,6 +411,36 @@ V4SF_L_4(int inv,
     *r3 = V4SF_ADD(t5, t7);
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_LEAF_EE(double *const FFTS_RESTRICT out,
+             const ptrdiff_t *FFTS_RESTRICT os,
+             const double    *FFTS_RESTRICT in,
+             const ptrdiff_t *FFTS_RESTRICT is,
+             int inv)
+{
+    const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f;
+
+    V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+
+    double *out0 = out + os[0];
+    double *out1 = out + os[1];
+
+    V4DF_L_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
+    V4DF_L_2(in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7);
+
+    V4DF_K_0(inv, &r0, &r2, &r4, &r6);
+    V4DF_K_N(inv, V4DF_LD(LUT + 0), V4DF_LD(LUT + 4), &r1, &r3, &r5, &r7);
+    V4DF_TX2(&r0, &r1);
+    V4DF_TX2(&r2, &r3);
+    V4DF_TX2(&r4, &r5);
+    V4DF_TX2(&r6, &r7);
+
+    V4DF_S_4(r0, r2, r4, r6, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
+    V4DF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_LEAF_EE(float *const FFTS_RESTRICT out,
              const ptrdiff_t *FFTS_RESTRICT os,
@@ -359,6 +469,36 @@ V4SF_LEAF_EE(float *const FFTS_RESTRICT out,
     V4SF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_LEAF_EE2(double *const FFTS_RESTRICT out,
+              const ptrdiff_t *FFTS_RESTRICT os,
+              const double *FFTS_RESTRICT in,
+              const ptrdiff_t *FFTS_RESTRICT is,
+              int inv)
+{
+    const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f;
+
+    V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+
+    double *out0 = out + os[0];
+    double *out1 = out + os[1];
+
+    V4DF_L_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r0, &r1, &r2, &r3);
+    V4DF_L_2(in + is[0], in + is[1], in + is[3], in + is[2], &r4, &r5, &r6, &r7);
+
+    V4DF_K_0(inv, &r0, &r2, &r4, &r6);
+    V4DF_K_N(inv, V4DF_LD(LUT + 0), V4DF_LD(LUT + 4), &r1, &r3, &r5, &r7);
+    V4DF_TX2(&r0, &r1);
+    V4DF_TX2(&r2, &r3);
+    V4DF_TX2(&r4, &r5);
+    V4DF_TX2(&r6, &r7);
+
+    V4DF_S_4(r0, r2, r4, r6, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
+    V4DF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_LEAF_EE2(float *const FFTS_RESTRICT out,
               const ptrdiff_t *FFTS_RESTRICT os,
@@ -387,6 +527,30 @@ V4SF_LEAF_EE2(float *const FFTS_RESTRICT out,
     V4SF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_LEAF_EO(double *const FFTS_RESTRICT out,
+             const ptrdiff_t *FFTS_RESTRICT os,
+             const double *FFTS_RESTRICT in,
+             const ptrdiff_t *FFTS_RESTRICT is,
+             int inv)
+{
+    const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f;
+
+    V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+
+    double *out0 = out + os[0];
+    double *out1 = out + os[1];
+
+    V4DF_L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
+    V4DF_L_2_4(inv, in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7);
+
+    V4DF_S_4(r2, r3, r7, r6, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
+    V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r0, &r1, &r4, &r5);
+    V4DF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_LEAF_EO(float *const FFTS_RESTRICT out,
              const ptrdiff_t *FFTS_RESTRICT os,
@@ -409,6 +573,30 @@ V4SF_LEAF_EO(float *const FFTS_RESTRICT out,
     V4SF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_LEAF_OE(double *const FFTS_RESTRICT out,
+             const ptrdiff_t *FFTS_RESTRICT os,
+             const double *FFTS_RESTRICT in,
+             const ptrdiff_t *FFTS_RESTRICT is,
+             int inv)
+{
+    const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f;
+
+    V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+
+    double *out0 = out + os[0];
+    double *out1 = out + os[1];
+
+    V4DF_L_4_2(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
+    V4DF_L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7);
+
+    V4DF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
+    V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r6, &r7, &r2, &r3);
+    V4DF_S_4(r6, r7, r2, r3, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_LEAF_OE(float *const FFTS_RESTRICT out,
              const ptrdiff_t *FFTS_RESTRICT os,
@@ -431,6 +619,27 @@ V4SF_LEAF_OE(float *const FFTS_RESTRICT out,
     V4SF_S_4(r6, r7, r2, r3, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_LEAF_OO(double *const FFTS_RESTRICT out,
+             const ptrdiff_t *FFTS_RESTRICT os,
+             const double *FFTS_RESTRICT in,
+             const ptrdiff_t *FFTS_RESTRICT is,
+             int inv)
+{
+    V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+
+    double *out0 = out + os[0];
+    double *out1 = out + os[1];
+
+    V4DF_L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
+    V4DF_L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7);
+
+    V4DF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
+    V4DF_S_4(r2, r3, r6, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_LEAF_OO(float *const FFTS_RESTRICT out,
              const ptrdiff_t *FFTS_RESTRICT os,
@@ -450,6 +659,34 @@ V4SF_LEAF_OO(float *const FFTS_RESTRICT out,
     V4SF_S_4(r2, r3, r6, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_X_4(int inv,
+         double *FFTS_RESTRICT data,
+         size_t N,
+         const double *FFTS_RESTRICT LUT)
+{
+    size_t i;
+
+    for (i = 0; i < N/8; i++) {
+        V4DF r0 = V4DF_LD(data);
+        V4DF r1 = V4DF_LD(data + 2*N/4);
+        V4DF r2 = V4DF_LD(data + 4*N/4);
+        V4DF r3 = V4DF_LD(data + 6*N/4);
+
+        V4DF_K_N(inv, V4DF_LD(LUT), V4DF_LD(LUT + 4), &r0, &r1, &r2, &r3);
+
+        V4DF_ST(data        , r0);
+        V4DF_ST(data + 2*N/4, r1);
+        V4DF_ST(data + 4*N/4, r2);
+        V4DF_ST(data + 6*N/4, r3);
+
+        LUT += 8;
+        data += 4;
+    }
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_X_4(int inv,
          float *FFTS_RESTRICT data,
@@ -536,6 +773,68 @@ V4SF_X_8(int inv,
     }
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_X_8(int inv,
+         double *FFTS_RESTRICT data0,
+         size_t N,
+         const double *FFTS_RESTRICT LUT)
+{
+    double *data1 = data0 + 1*N/4;
+    double *data2 = data0 + 2*N/4;
+    double *data3 = data0 + 3*N/4;
+    double *data4 = data0 + 4*N/4;
+    double *data5 = data0 + 5*N/4;
+    double *data6 = data0 + 6*N/4;
+    double *data7 = data0 + 7*N/4;
+    size_t i;
+
+    for (i = 0; i < N/16; i++) {
+        V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+
+        r0 = V4DF_LD(data0);
+        r1 = V4DF_LD(data1);
+        r2 = V4DF_LD(data2);
+        r3 = V4DF_LD(data3);
+
+        V4DF_K_N(inv, V4DF_LD(LUT), V4DF_LD(LUT + 4), &r0, &r1, &r2, &r3);
+        r4 = V4DF_LD(data4);
+        r6 = V4DF_LD(data6);
+
+        V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r0, &r2, &r4, &r6);
+        r5 = V4DF_LD(data5);
+        r7 = V4DF_LD(data7);
+
+        V4DF_K_N(inv, V4DF_LD(LUT + 16), V4DF_LD(LUT + 20), &r1, &r3, &r5, &r7);
+        LUT += 24;
+
+        V4DF_ST(data0, r0);
+        data0 += 4;
+
+        V4DF_ST(data1, r1);
+        data1 += 4;
+
+        V4DF_ST(data2, r2);
+        data2 += 4;
+
+        V4DF_ST(data3, r3);
+        data3 += 4;
+
+        V4DF_ST(data4, r4);
+        data4 += 4;
+
+        V4DF_ST(data5, r5);
+        data5 += 4;
+
+        V4DF_ST(data6, r6);
+        data6 += 4;
+
+        V4DF_ST(data7, r7);
+        data7 += 4;
+    }
+}
+#endif
+
 static FFTS_INLINE void
 ffts_static_firstpass_odd_32f(float *const FFTS_RESTRICT out,
                               const float *FFTS_RESTRICT in,
@@ -569,6 +868,41 @@ ffts_static_firstpass_odd_32f(float *const FFTS_RESTRICT out,
     }
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+ffts_static_firstpass_odd_64f(double *const FFTS_RESTRICT out,
+                              const double *FFTS_RESTRICT in,
+                              const ffts_plan_t *FFTS_RESTRICT p,
+                              int inv)
+{   /* double-precision first pass: applies the leaf kernels in the order EE, OO, OE, EE2 */
+    size_t i, i0 = p->i0, i1 = p->i1; /* leaf counts taken from the plan */
+    const ptrdiff_t *is = (const ptrdiff_t*) p->is; /* input offset table passed unchanged to every leaf */
+    const ptrdiff_t *os = (const ptrdiff_t*) p->offsets; /* output offset table, consumed two entries per leaf */
+
+    for (i = i0; i > 0; --i) { /* i0 EE leaves */
+        V4DF_LEAF_EE(out, os, in, is, inv);
+        in += 4; /* each leaf advances the input by 4 doubles ... */
+        os += 2; /* ... and the output offset table by 2 entries */
+    }
+
+    for (i = i1; i > 0; --i) { /* i1 OO leaves */
+        V4DF_LEAF_OO(out, os, in, is, inv);
+        in += 4;
+        os += 2;
+    }
+
+    V4DF_LEAF_OE(out, os, in, is, inv); /* exactly one OE leaf between the OO and EE2 runs */
+    in += 4;
+    os += 2;
+
+    for (i = i1; i > 0; --i) { /* i1 EE2 leaves */
+        V4DF_LEAF_EE2(out, os, in, is, inv);
+        in += 4;
+        os += 2;
+    }
+}
+#endif
+
 void
 ffts_small_2_32f(ffts_plan_t *p, const void *in, void *out)
 {
@@ -789,23 +1123,23 @@ ffts_small_forward8_32f(ffts_plan_t *p, const void *in, void *out)
     V4SF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
 }
 
+#ifdef FFTS_DOUBLE
 void
 ffts_small_forward8_64f(ffts_plan_t *p, const void *in, void *out)
 {
+    const double *FFTS_RESTRICT lut = ffts_constants_small_64f;
     const double *din = (const double*) in;
     double *dout = (double*) out;
-//  V4SF r0_1, r2_3, r4_5, r6_7;
-//  double *LUT8 = (double*) p->ws + p->ws_is[0];
+    V4DF r0_1, r2_3, r4_5, r6_7;
+    
+    /* unreferenced parameter */
     (void) p;
-    (void) din;
-    (void) dout;
 
-#if MACROS_READY
-    L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
-    K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
-    S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
-#endif
+    V4DF_L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
+    V4DF_K_N(0, V4DF_LD(lut), V4DF_LD(lut + 4), &r0_1, &r2_3, &r4_5, &r6_7);
+    V4DF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
 }
+#endif
 
 void
 ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out)
@@ -823,24 +1157,23 @@ ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out)
     V4SF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
 }
 
+#ifdef FFTS_DOUBLE /* double-precision counterpart of ffts_small_backward8_32f */
 void
 ffts_small_backward8_64f(ffts_plan_t *p, const void *in, void *out)
 {
+    const double *FFTS_RESTRICT lut = ffts_constants_small_inv_64f; /* constant table; ffts_small_forward8_64f reads ffts_constants_small_64f instead */
     const double *din = (const double*) in;
     double *dout = (double*) out;
-//  V4SF r0_1, r2_3, r4_5, r6_7;
-//  double *LUT8 = (double*) p->ws + p->ws_is[0];
-    (void) p;
-    (void) din;
-    (void) dout;
+    V4DF r0_1, r2_3, r4_5, r6_7; /* vector temporaries: written through pointers by V4DF_L_4_2/V4DF_K_N, passed by value to V4DF_S_4 */
 
+    /* unreferenced parameter */
+    (void) p;
 
-#if MACROS_READY
-    L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
-    K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
-    S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
-#endif
+    V4DF_L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); /* reads din at offsets 0, 8, 4, 12; leading argument is 1 here and 0 in the forward variant */
+    V4DF_K_N(1, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7); /* combines the temporaries with values loaded from lut and lut + 4 */
+    V4DF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); /* writes the four temporaries to dout at offsets 0, 4, 8, 12 */
 }
+#endif /* FFTS_DOUBLE */
 
 void
 ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out)
@@ -862,27 +1195,27 @@ ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out)
     V4SF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
 }
 
+#ifdef FFTS_DOUBLE /* double-precision counterpart of ffts_small_forward16_32f */
 void
 ffts_small_forward16_64f(ffts_plan_t *p, const void *in, void *out)
 {
+    const double *FFTS_RESTRICT lut = ffts_constants_small_64f; /* constant table; ffts_small_backward16_64f reads ffts_constants_small_inv_64f instead */
     const double *din = (const double*) in;
     double *dout = (double*) out;
-//  double *LUT8 = (double*) p->ws;
-//  V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
+    V4DF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; /* eight vector temporaries shared by the load, V4DF_K_N and store steps below */
+
+    /* unreferenced parameter */
     (void) p;
-    (void) din;
-    (void) dout;
-
-#ifdef MACROS_READY
-    L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
-    L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
-    K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
-    K_N(0, VLD(LUT8+8), VLD(LUT8+12), &r0_1, &r4_5, &r8_9, &r12_13);
-    S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
-    K_N(0, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15);
-    S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
-#endif
+
+    V4DF_L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); /* reads din at offsets 0, 16, 8, 24 */
+    V4DF_L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13); /* reads din at offsets 4, 20, 28, 12; note r14_15 is passed before r12_13 */
+    V4DF_K_N(0, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7); /* uses table values at lut and lut + 4 */
+    V4DF_K_N(0, V4DF_LD(lut+8), V4DF_LD(lut+12), &r0_1, &r4_5, &r8_9, &r12_13); /* uses table values at lut + 8 and lut + 12 */
+    V4DF_S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24); /* stores to dout at offsets 0, 8, 16, 24 */
+    V4DF_K_N(0, V4DF_LD(lut+16), V4DF_LD(lut+20), &r2_3, &r6_7, &r10_11, &r14_15); /* uses table values at lut + 16 and lut + 20 */
+    V4DF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); /* stores to dout at offsets 4, 12, 20, 28 */
 }
+#endif /* FFTS_DOUBLE */
 
 void
 ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out)
@@ -904,27 +1237,27 @@ ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out)
     V4SF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
 }
 
+#ifdef FFTS_DOUBLE /* double-precision counterpart of ffts_small_backward16_32f */
 void
 ffts_small_backward16_64f(ffts_plan_t *p, const void *in, void *out)
 {
+    const double *FFTS_RESTRICT lut = ffts_constants_small_inv_64f; /* constant table; ffts_small_forward16_64f reads ffts_constants_small_64f instead */
     const double *din = (const double*) in;
     double *dout = (double*) out;
-//  double *LUT8 = (double*) p->ws;
-//  V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
+    V4DF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; /* eight vector temporaries shared by the load, V4DF_K_N and store steps below */
+    
+    /* unreferenced parameter */
     (void) p;
-    (void) din;
-    (void) dout;
-
-#ifdef MACROS_READY
-    L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
-    L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
-    K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
-    K_N(1, VLD(LUT8+8), VLD(LUT8+12),&r0_1, &r4_5, &r8_9, &r12_13);
-    S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
-    K_N(1, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15);
-    S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
-#endif
+
+    V4DF_L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); /* reads din at offsets 0, 16, 8, 24 */
+    V4DF_L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13); /* reads din at offsets 4, 20, 28, 12; note r14_15 is passed before r12_13 */
+    V4DF_K_N(1, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7); /* uses table values at lut and lut + 4 */
+    V4DF_K_N(1, V4DF_LD(lut+8), V4DF_LD(lut+12), &r0_1, &r4_5, &r8_9, &r12_13); /* uses table values at lut + 8 and lut + 12 */
+    V4DF_S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24); /* stores to dout at offsets 0, 8, 16, 24 */
+    V4DF_K_N(1, V4DF_LD(lut+16), V4DF_LD(lut+20), &r2_3, &r6_7, &r10_11, &r14_15); /* uses table values at lut + 16 and lut + 20 */
+    V4DF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); /* stores to dout at offsets 4, 12, 20, 28 */
 }
+#endif /* FFTS_DOUBLE */
 
 static FFTS_INLINE void
 ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out,
@@ -959,6 +1292,41 @@ ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out,
     }
 }
 
+#ifdef FFTS_DOUBLE /* double-precision first pass used when the transforms below find an even N_log_2 */
+static FFTS_INLINE void
+ffts_static_firstpass_even_64f(double *FFTS_RESTRICT out,
+                               const double *FFTS_RESTRICT in,
+                               const ffts_plan_t *FFTS_RESTRICT p,
+                               int inv) /* forwarded unchanged to every V4DF_LEAF_* macro; callers pass 0 (transform_f) or 1 (transform_i) */
+{
+    size_t i, i0 = p->i0, i1 = p->i1; /* loop counts taken from the plan */
+    const ptrdiff_t *is = (const ptrdiff_t*) p->is; /* handed to the leaf macros as-is; never advanced here */
+    const ptrdiff_t *os = (const ptrdiff_t*) p->offsets; /* advanced by 2 entries after every leaf */
+
+    for(i = i0; i > 0; --i) { /* i0 EE leaves */
+        V4DF_LEAF_EE(out, os, in, is, inv);
+        in += 4; /* every leaf consumes 4 doubles of input position */
+        os += 2;
+    }
+
+    V4DF_LEAF_EO(out, os, in, is, inv); /* exactly one EO leaf between the EE and OO runs */
+    in += 4;
+    os += 2;
+
+    for (i = i1; i > 0; --i) { /* i1 OO leaves */
+        V4DF_LEAF_OO(out, os, in, is, inv);
+        in += 4;
+        os += 2;
+    }
+
+    for (i = i1; i > 0; --i) { /* i1 EE2 leaves */
+        V4DF_LEAF_EE2(out, os, in, is, inv);
+        in += 4;
+        os += 2;
+    }
+}
+#endif /* FFTS_DOUBLE */
+
 static void
 ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
 {
@@ -1035,6 +1403,47 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
 #endif
 }
 
+#ifdef FFTS_DOUBLE /* recursive double-precision passes; every V4DF_X_* call here passes 0 as its first argument */
+static void
+ffts_static_rec_f_64f(const ffts_plan_t *p, double *data, size_t N)
+{
+    const double *ws = (const double*) p->ws; /* base of the table that the p->ws_is offsets below index into */
+
+    if (N > 128) { /* general case: five recursive calls, then one V4DF_X_8 over the whole range */
+        const size_t N1 = N >> 1; /* N / 2 */
+        const size_t N2 = N >> 2; /* N / 4 */
+        const size_t N3 = N >> 3; /* N / 8 */
+
+        ffts_static_rec_f_64f(p, data              , N2);
+        ffts_static_rec_f_64f(p, data +     N1     , N3);
+        ffts_static_rec_f_64f(p, data +     N1 + N2, N3);
+        ffts_static_rec_f_64f(p, data + N          , N2);
+        ffts_static_rec_f_64f(p, data + N + N1     , N2);
+
+        V4DF_X_8(0, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1)); /* ws_is entry picked by ffts_ctzl(N) - 4 (presumably log2(N) - 4; confirm against ffts_ctzl), doubled to form the ws offset */
+    } else if (N == 128) { /* same five sub-ranges as above (offsets 0, 64, 96, 128, 192), handled inline instead of by recursion */
+        const double *ws1 = ws + (p->ws_is[1] << 1);
+
+        V4DF_X_8(0, data +   0,  32, ws1);
+        V4DF_X_4(0, data +  64,  16, ws);
+        V4DF_X_4(0, data +  96,  16, ws);
+        V4DF_X_8(0, data + 128,  32, ws1);
+        V4DF_X_8(0, data + 192,  32, ws1);
+
+        V4DF_X_8(0, data, 128, ws + (p->ws_is[3] << 1));
+    } else if (N == 64) { /* three V4DF_X_4 steps followed by one V4DF_X_8 over the whole range */
+        V4DF_X_4(0, data +  0, 16, ws);
+        V4DF_X_4(0, data + 64, 16, ws);
+        V4DF_X_4(0, data + 96, 16, ws);
+
+        V4DF_X_8(0, data, 64, ws + (p->ws_is[2] << 1));
+    } else { /* only N == 32 is expected to reach this branch */
+        assert(N == 32);
+        V4DF_X_8(0, data, 32, ws + (p->ws_is[1] << 1));
+    }
+}
+#endif /* FFTS_DOUBLE */
+
 static void
 ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
 {
@@ -1111,6 +1520,47 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
 #endif
 }
 
+#ifdef FFTS_DOUBLE /* recursive double-precision passes; every V4DF_X_* call here passes 1 as its first argument */
+static void
+ffts_static_rec_i_64f(const ffts_plan_t *p, double *data, size_t N)
+{
+    const double *ws = (const double*) p->ws; /* base of the table that the p->ws_is offsets below index into */
+
+    if (N > 128) { /* general case: five recursive calls, then one V4DF_X_8 over the whole range */
+        const size_t N1 = N >> 1; /* N / 2 */
+        const size_t N2 = N >> 2; /* N / 4 */
+        const size_t N3 = N >> 3; /* N / 8 */
+
+        ffts_static_rec_i_64f(p, data              , N2);
+        ffts_static_rec_i_64f(p, data +     N1     , N3);
+        ffts_static_rec_i_64f(p, data +     N1 + N2, N3);
+        ffts_static_rec_i_64f(p, data + N          , N2);
+        ffts_static_rec_i_64f(p, data + N + N1     , N2);
+
+        V4DF_X_8(1, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1)); /* ws_is entry picked by ffts_ctzl(N) - 4 (presumably log2(N) - 4; confirm against ffts_ctzl), doubled to form the ws offset */
+    } else if (N == 128) { /* same five sub-ranges as above (offsets 0, 64, 96, 128, 192), handled inline instead of by recursion */
+        const double *ws1 = ws + (p->ws_is[1] << 1);
+
+        V4DF_X_8(1, data +   0, 32, ws1);
+        V4DF_X_4(1, data +  64, 16, ws);
+        V4DF_X_4(1, data +  96, 16, ws);
+        V4DF_X_8(1, data + 128, 32, ws1);
+        V4DF_X_8(1, data + 192, 32, ws1);
+
+        V4DF_X_8(1, data, 128, ws + (p->ws_is[3] << 1));
+    } else if (N == 64) { /* three V4DF_X_4 steps followed by one V4DF_X_8 over the whole range */
+        V4DF_X_4(1, data +  0, 16, ws);
+        V4DF_X_4(1, data + 64, 16, ws);
+        V4DF_X_4(1, data + 96, 16, ws);
+
+        V4DF_X_8(1, data, 64, ws + (p->ws_is[2] << 1));
+    } else { /* only N == 32 is expected to reach this branch */
+        assert(N == 32);
+        V4DF_X_8(1, data, 32, ws + (p->ws_is[1] << 1));
+    }
+}
+#endif /* FFTS_DOUBLE */
+
 void
 ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
 {
@@ -1172,6 +1622,26 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
 #endif
 }
 
+#ifdef FFTS_DOUBLE /* double-precision counterpart of ffts_static_transform_f_32f */
+void
+ffts_static_transform_f_64f(ffts_plan_t *p, const void *in, void *out)
+{
+    const double *din = (const double*) in; /* input is treated as an array of double */
+    double *dout = (double*) out; /* output is treated as an array of double */
+
+    const size_t N = p->N; /* transform size stored in the plan */
+    const int N_log_2 = ffts_ctzl(N); /* presumably log2(N) for a power-of-two N; confirm against ffts_ctzl */
+
+    if (N_log_2 & 1) { /* parity of N_log_2 selects the odd or the even first pass */
+        ffts_static_firstpass_odd_64f(dout, din, p, 0); /* last argument 0; ffts_static_transform_i_64f passes 1 */
+    } else {
+        ffts_static_firstpass_even_64f(dout, din, p, 0);
+    }
+
+    ffts_static_rec_f_64f(p, dout, N); /* remaining passes receive only dout, i.e. they work on the output buffer */
+}
+#endif /* FFTS_DOUBLE */
+
 void
 ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
 {
@@ -1231,4 +1701,24 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
 
     ffts_static_rec_i_32f(p, dout, N);
 #endif
-}
\ No newline at end of file
+}
+
+#ifdef FFTS_DOUBLE /* double-precision counterpart of ffts_static_transform_i_32f */
+void
+ffts_static_transform_i_64f(ffts_plan_t *p, const void *in, void *out)
+{
+    const double *din = (const double*) in; /* input is treated as an array of double */
+    double *dout = (double*) out; /* output is treated as an array of double */
+
+    const size_t N = p->N; /* transform size stored in the plan */
+    const int N_log_2 = ffts_ctzl(N); /* presumably log2(N) for a power-of-two N; confirm against ffts_ctzl */
+
+    if (N_log_2 & 1) { /* parity of N_log_2 selects the odd or the even first pass */
+        ffts_static_firstpass_odd_64f(dout, din, p, 1); /* last argument 1; ffts_static_transform_f_64f passes 0 */
+    } else {
+        ffts_static_firstpass_even_64f(dout, din, p, 1);
+    }
+
+    ffts_static_rec_i_64f(p, dout, N); /* remaining passes receive only dout, i.e. they work on the output buffer */
+}
+#endif /* FFTS_DOUBLE */
\ No newline at end of file
diff --git a/lib/ffts/src/ffts_static.h b/lib/ffts/src/ffts_static.h
index 5a42fc2..5de0059 100644
--- a/lib/ffts/src/ffts_static.h
+++ b/lib/ffts/src/ffts_static.h
@@ -43,49 +43,73 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 void
 ffts_small_2_32f(ffts_plan_t *p, const void *in, void *out);
 
+#ifdef FFTS_DOUBLE /* every *_64f prototype in this header is declared only when FFTS_DOUBLE is defined */
 void
 ffts_small_2_64f(ffts_plan_t *p, const void *in, void *out);
+#endif /* FFTS_DOUBLE */
 
 void
 ffts_small_forward4_32f(ffts_plan_t *p, const void *in, void *out);
 
+#ifdef FFTS_DOUBLE
 void
 ffts_small_forward4_64f(ffts_plan_t *p, const void *in, void *out);
+#endif /* FFTS_DOUBLE */
 
 void
 ffts_small_backward4_32f(ffts_plan_t *p, const void *in, void *out);
 
+#ifdef FFTS_DOUBLE
 void
 ffts_small_backward4_64f(ffts_plan_t *p, const void *in, void *out);
+#endif /* FFTS_DOUBLE */
 
 void
 ffts_small_forward8_32f(ffts_plan_t *p, const void *in, void *out);
 
+#ifdef FFTS_DOUBLE
 void
 ffts_small_forward8_64f(ffts_plan_t *p, const void *in, void *out);
+#endif /* FFTS_DOUBLE */
 
 void
 ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out);
 
+#ifdef FFTS_DOUBLE
 void
 ffts_small_backward8_64f(ffts_plan_t *p, const void *in, void *out);
+#endif /* FFTS_DOUBLE */
 
 void
 ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out);
 
+#ifdef FFTS_DOUBLE
 void
 ffts_small_forward16_64f(ffts_plan_t *p, const void *in, void *out);
+#endif /* FFTS_DOUBLE */
 
 void
 ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out);
 
+#ifdef FFTS_DOUBLE
 void
 ffts_small_backward16_64f(ffts_plan_t *p, const void *in, void *out);
+#endif /* FFTS_DOUBLE */
 
 void
 ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out);
 
+#ifdef FFTS_DOUBLE
+void
+ffts_static_transform_f_64f(ffts_plan_t *p, const void *in, void *out); /* same signature as ffts_static_transform_f_32f */
+#endif /* FFTS_DOUBLE */
+
 void
 ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out);
 
+#ifdef FFTS_DOUBLE
+void
+ffts_static_transform_i_64f(ffts_plan_t *p, const void *in, void *out); /* same signature as ffts_static_transform_i_32f */
+#endif /* FFTS_DOUBLE */
+
 #endif /* FFTS_STATIC_H */
diff --git a/lib/ffts/src/ffts_trig.c b/lib/ffts/src/ffts_trig.c
index 74ebfd2..65efa86 100644
--- a/lib/ffts/src/ffts_trig.c
+++ b/lib/ffts/src/ffts_trig.c
@@ -2,7 +2,7 @@
 
 This file is part of FFTS -- The Fastest Fourier Transform in the South
 
-Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+Copyright (c) 2015-2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
 
 All rights reserved.
 
@@ -33,193 +33,707 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "ffts_trig.h"
 #include "ffts_dd.h"
 
+/*
+*  For more information on the algorithms:
+*
+*  D. Potts, G. Steidl, M. Tasche, Numerical stability of fast
+*  trigonometric transforms - a worst case study,
+*  J. Concrete Appl. Math. 1 (2003) 1-36
+*
+*  O. Buneman, Stable on-line creation of sines and cosines of
+*  successive angles, Proc. IEEE 75, 1434-1435 (1987).
+*/
+
+/* A union to initialize doubles from their byte representation
+*  while avoiding violations of the strict-aliasing rules
+*/
+
+/* TODO: we need macros to take care of endianness */
+typedef union ffts_double { /* lets the constant tables spell each double as two raw 32-bit words */
+    uint32_t i[2]; /* raw words, low word first in the tables; unsigned because initializers such as 0xc9be45de exceed INT32_MAX */
+    double   d;    /* the same storage read as a double */
+} ffts_double_t;
+
 /* 1/(2*cos(pow(2,-p)*pi)) */
-static const FFTS_ALIGN(16) unsigned int half_secant[132] = {
-    0x00000000, 0x3fe00000, 0xc9be45de, 0x3be3bd3c,
-    0x00000000, 0x3fe00000, 0xc9be45de, 0x3c03bd3c,
-    0x00000000, 0x3fe00000, 0xc9be45de, 0x3c23bd3c,
-    0x00000000, 0x3fe00000, 0xc9be45de, 0x3c43bd3c,
-    0x00000000, 0x3fe00000, 0xc9be45de, 0x3c63bd3c,
-    0x00000000, 0x3fe00000, 0xc9be45df, 0x3c83bd3c,
-    0x00000001, 0x3fe00000, 0x4df22efd, 0x3c7de9e6,
-    0x00000005, 0x3fe00000, 0x906e8725, 0xbc60b0cd,
-    0x00000014, 0x3fe00000, 0x906e8357, 0xbc80b0cd,
-    0x0000004f, 0x3fe00000, 0x0dce83c9, 0xbc5619b2,
-    0x0000013c, 0x3fe00000, 0x0dc6e79a, 0xbc7619b2,
-    0x000004ef, 0x3fe00000, 0xe4af1240, 0x3c83cc9b,
-    0x000013bd, 0x3fe00000, 0x2d14c08a, 0x3c7e64df,
-    0x00004ef5, 0x3fe00000, 0x47a85465, 0xbc59b20b,
-    0x00013bd4, 0x3fe00000, 0xab79c897, 0xbc79b203,
-    0x0004ef4f, 0x3fe00000, 0x15019a96, 0x3c79386b,
-    0x0013bd3d, 0x3fe00000, 0x7d6dbf4b, 0xbc7b16b7,
-    0x004ef4f3, 0x3fe00000, 0xf30832e0, 0x3c741ee4,
-    0x013bd3cd, 0x3fe00000, 0xd3bcd4bb, 0xbc83f41e,
-    0x04ef4f34, 0x3fe00000, 0xdd75aebb, 0xbc82ef06,
-    0x13bd3cde, 0x3fe00000, 0xb2b41b3d, 0x3c52d979,
-    0x4ef4f46c, 0x3fe00000, 0x4f0fb458, 0xbc851db3,
-    0x3bd3e0e7, 0x3fe00001, 0x8a0ce3f0, 0x3c58dbab,
-    0xef507722, 0x3fe00004, 0x2a8ec295, 0x3c83e351,
-    0xbd5114f9, 0x3fe00013, 0xc4c0d92d, 0x3c8b3ca4,
-    0xf637de7d, 0x3fe0004e, 0xb74de729, 0x3c45974e,
-    0xe8190891, 0x3fe0013b, 0x26edf4da, 0xbc814c20,
-    0x9436640e, 0x3fe004f0, 0xe2b34b50, 0x3c8091ab,
-    0x9c61d971, 0x3fe013d1, 0x6ce01b8e, 0x3c7f7df7,
-    0xd17cba53, 0x3fe0503e, 0x74ad7633, 0xbc697609,
-    0x7bdb3895, 0x3fe1517a, 0x82f9091b, 0xbc8008d1,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000
+static const FFTS_ALIGN(16) ffts_double_t half_secant[66] = {
+    { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3be3bd3c } },
+    { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3c03bd3c } },
+    { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3c23bd3c } },
+    { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3c43bd3c } },
+    { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3c63bd3c } },
+    { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45df, 0x3c83bd3c } },
+    { { 0x00000001, 0x3fe00000 } }, { { 0x4df22efd, 0x3c7de9e6 } },
+    { { 0x00000005, 0x3fe00000 } }, { { 0x906e8725, 0xbc60b0cd } },
+    { { 0x00000014, 0x3fe00000 } }, { { 0x906e8357, 0xbc80b0cd } },
+    { { 0x0000004f, 0x3fe00000 } }, { { 0x0dce83c9, 0xbc5619b2 } },
+    { { 0x0000013c, 0x3fe00000 } }, { { 0x0dc6e79a, 0xbc7619b2 } },
+    { { 0x000004ef, 0x3fe00000 } }, { { 0xe4af1240, 0x3c83cc9b } },
+    { { 0x000013bd, 0x3fe00000 } }, { { 0x2d14c08a, 0x3c7e64df } },
+    { { 0x00004ef5, 0x3fe00000 } }, { { 0x47a85465, 0xbc59b20b } },
+    { { 0x00013bd4, 0x3fe00000 } }, { { 0xab79c897, 0xbc79b203 } },
+    { { 0x0004ef4f, 0x3fe00000 } }, { { 0x15019a96, 0x3c79386b } },
+    { { 0x0013bd3d, 0x3fe00000 } }, { { 0x7d6dbf4b, 0xbc7b16b7 } },
+    { { 0x004ef4f3, 0x3fe00000 } }, { { 0xf30832e0, 0x3c741ee4 } },
+    { { 0x013bd3cd, 0x3fe00000 } }, { { 0xd3bcd4bb, 0xbc83f41e } },
+    { { 0x04ef4f34, 0x3fe00000 } }, { { 0xdd75aebb, 0xbc82ef06 } },
+    { { 0x13bd3cde, 0x3fe00000 } }, { { 0xb2b41b3d, 0x3c52d979 } },
+    { { 0x4ef4f46c, 0x3fe00000 } }, { { 0x4f0fb458, 0xbc851db3 } },
+    { { 0x3bd3e0e7, 0x3fe00001 } }, { { 0x8a0ce3f0, 0x3c58dbab } },
+    { { 0xef507722, 0x3fe00004 } }, { { 0x2a8ec295, 0x3c83e351 } },
+    { { 0xbd5114f9, 0x3fe00013 } }, { { 0xc4c0d92d, 0x3c8b3ca4 } },
+    { { 0xf637de7d, 0x3fe0004e } }, { { 0xb74de729, 0x3c45974e } },
+    { { 0xe8190891, 0x3fe0013b } }, { { 0x26edf4da, 0xbc814c20 } },
+    { { 0x9436640e, 0x3fe004f0 } }, { { 0xe2b34b50, 0x3c8091ab } },
+    { { 0x9c61d971, 0x3fe013d1 } }, { { 0x6ce01b8e, 0x3c7f7df7 } },
+    { { 0xd17cba53, 0x3fe0503e } }, { { 0x74ad7633, 0xbc697609 } },
+    { { 0x7bdb3895, 0x3fe1517a } }, { { 0x82f9091b, 0xbc8008d1 } },
+    { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000 } },
+    { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000 } }
 };
 
 /* cos(pow(2,-p)*pi), sin(pow(2,-p)*pi) */
-static const FFTS_ALIGN(16) unsigned int cos_sin_pi_table[264] = {
-    0x00000000, 0x3ff00000, 0x54442d18, 0x3df921fb,
-    0xc9be45de, 0xbbf3bd3c, 0xbb77974f, 0x3a91a390,
-    0x00000000, 0x3ff00000, 0x54442d18, 0x3e0921fb,
-    0xc9be45de, 0xbc13bd3c, 0x54a14928, 0x3aa19bd0,
-    0x00000000, 0x3ff00000, 0x54442d18, 0x3e1921fb,
-    0xc9be45de, 0xbc33bd3c, 0xb948108a, 0x3ab17cce,
-    0x00000000, 0x3ff00000, 0x54442d18, 0x3e2921fb,
-    0xc9be45de, 0xbc53bd3c, 0x4be32e14, 0x3ac100c8,
-    0x00000000, 0x3ff00000, 0x54442d18, 0x3e3921fb,
-    0xc9be45de, 0xbc73bd3c, 0x2c9f4879, 0x3ace215d,
-    0xffffffff, 0x3fefffff, 0x54442d18, 0x3e4921fb,
-    0x6c837443, 0x3c888586, 0x0005f376, 0x3acd411f,
-    0xfffffffe, 0x3fefffff, 0x54442d18, 0x3e5921fb,
-    0x4df22ef1, 0xbc8de9e6, 0x9937209e, 0xbaf7b153,
-    0xfffffff6, 0x3fefffff, 0x54442d16, 0x3e6921fb,
-    0x906e88aa, 0x3c70b0cd, 0xfe19968a, 0xbb03b7c0,
-    0xffffffd9, 0x3fefffff, 0x54442d0e, 0x3e7921fb,
-    0xdf22ed26, 0xbc8e9e64, 0x8d1b6ffb, 0xbaee8bb4,
-    0xffffff62, 0x3fefffff, 0x54442cef, 0x3e8921fb,
-    0x0dd18f0f, 0x3c6619b2, 0x7f2b20fb, 0xbb00e133,
-    0xfffffd88, 0x3fefffff, 0x54442c73, 0x3e9921fb,
-    0x0dd314b2, 0x3c8619b2, 0x619fdf6e, 0xbb174e98,
-    0xfffff621, 0x3fefffff, 0x54442a83, 0x3ea921fb,
-    0x3764acf5, 0x3c8866c8, 0xf5b2407f, 0xbb388215,
-    0xffffd886, 0x3fefffff, 0x544422c2, 0x3eb921fb,
-    0x20e7a944, 0xbc8e64df, 0x7b9b9f23, 0x3b5a0961,
-    0xffff6216, 0x3fefffff, 0x544403c1, 0x3ec921fb,
-    0x52ee25ea, 0x3c69b20e, 0x4df6a86a, 0xbb5999d9,
-    0xfffd8858, 0x3fefffff, 0x544387ba, 0x3ed921fb,
-    0xd8910ead, 0x3c89b20f, 0x0809d04d, 0x3b77d9db,
-    0xfff62162, 0x3fefffff, 0x544197a1, 0x3ee921fb,
-    0x438d3925, 0xbc8937a8, 0xa5d27f7a, 0xbb858b02,
-    0xffd88586, 0x3fefffff, 0x5439d73a, 0x3ef921fb,
-    0x94b3ddd2, 0x3c8b22e4, 0xf8a3b73d, 0xbb863c7f,
-    0xff62161a, 0x3fefffff, 0x541ad59e, 0x3f0921fb,
-    0x7ea469b2, 0xbc835c13, 0xb8cee262, 0x3bae9860,
-    0xfd885867, 0x3fefffff, 0x539ecf31, 0x3f1921fb,
-    0x23a32e63, 0xbc77d556, 0xfcd23a30, 0x3b96b111,
-    0xf621619c, 0x3fefffff, 0x51aeb57c, 0x3f2921fb,
-    0xbbbd8fe6, 0xbc87507d, 0x4916c435, 0xbbca6e1d,
-    0xd8858675, 0x3fefffff, 0x49ee4ea6, 0x3f3921fb,
-    0x54748eab, 0xbc879f0e, 0x744a453e, 0x3bde894d,
-    0x62161a34, 0x3fefffff, 0x2aecb360, 0x3f4921fb,
-    0xb1f9b9c4, 0xbc6136dc, 0x7e566b4c, 0x3be87615,
-    0x88586ee6, 0x3feffffd, 0xaee6472e, 0x3f5921fa,
-    0xf173ae5b, 0x3c81af64, 0x284a9df8, 0xbbfee52e,
-    0x21621d02, 0x3feffff6, 0xbecca4ba, 0x3f6921f8,
-    0xebc82813, 0xbc76acfc, 0x7bcab5b2, 0x3c02ba40,
-    0x858e8a92, 0x3fefffd8, 0xfe670071, 0x3f7921f0,
-    0x1883bcf7, 0x3c8359c7, 0xfe6b7a9b, 0x3bfab967,
-    0x169b92db, 0x3fefff62, 0xfcdec784, 0x3f8921d1,
-    0xc81fbd0d, 0x3c85dda3, 0xbe836d9d, 0x3c29878e,
-    0x6084cd0d, 0x3feffd88, 0xf7a3667e, 0x3f992155,
-    0x4556e4cb, 0xbc81354d, 0x091a0130, 0xbbfb1d63,
-    0xe3796d7e, 0x3feff621, 0xf10dd814, 0x3fa91f65,
-    0x2e24aa15, 0xbc6c57bc, 0x0d569a90, 0xbc2912bd,
-    0xa3d12526, 0x3fefd88d, 0xbc29b42c, 0x3fb917a6,
-    0x378811c7, 0xbc887df6, 0xd26ed688, 0xbc3e2718,
-    0xcff75cb0, 0x3fef6297, 0x3c69a60b, 0x3fc8f8b8,
-    0x2a361fd3, 0x3c756217, 0xb9ff8d82, 0xbc626d19,
-    0xcf328d46, 0x3fed906b, 0xa6aea963, 0x3fd87de2,
-    0x10231ac2, 0x3c7457e6, 0xd3d5a610, 0xbc672ced,
-    0x667f3bcd, 0x3fe6a09e, 0x667f3bcd, 0x3fe6a09e,
-    0x13b26456, 0xbc8bdd34, 0x13b26456, 0xbc8bdd34,
-    0x00000000, 0x00000000, 0x00000000, 0x3ff00000,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000
+static const FFTS_ALIGN(32) ffts_double_t cos_sin_pi_table[132] = {
+    { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3df921fb } },
+    { { 0xc9be45de, 0xbbf3bd3c } }, { { 0xbb77974f, 0x3a91a390 } },
+    { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3e0921fb } },
+    { { 0xc9be45de, 0xbc13bd3c } }, { { 0x54a14928, 0x3aa19bd0 } },
+    { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3e1921fb } },
+    { { 0xc9be45de, 0xbc33bd3c } }, { { 0xb948108a, 0x3ab17cce } },
+    { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3e2921fb } },
+    { { 0xc9be45de, 0xbc53bd3c } }, { { 0x4be32e14, 0x3ac100c8 } },
+    { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3e3921fb } },
+    { { 0xc9be45de, 0xbc73bd3c } }, { { 0x2c9f4879, 0x3ace215d } },
+    { { 0xffffffff, 0x3fefffff } }, { { 0x54442d18, 0x3e4921fb } },
+    { { 0x6c837443, 0x3c888586 } }, { { 0x0005f376, 0x3acd411f } },
+    { { 0xfffffffe, 0x3fefffff } }, { { 0x54442d18, 0x3e5921fb } },
+    { { 0x4df22ef1, 0xbc8de9e6 } }, { { 0x9937209e, 0xbaf7b153 } },
+    { { 0xfffffff6, 0x3fefffff } }, { { 0x54442d16, 0x3e6921fb } },
+    { { 0x906e88aa, 0x3c70b0cd } }, { { 0xfe19968a, 0xbb03b7c0 } },
+    { { 0xffffffd9, 0x3fefffff } }, { { 0x54442d0e, 0x3e7921fb } },
+    { { 0xdf22ed26, 0xbc8e9e64 } }, { { 0x8d1b6ffb, 0xbaee8bb4 } },
+    { { 0xffffff62, 0x3fefffff } }, { { 0x54442cef, 0x3e8921fb } },
+    { { 0x0dd18f0f, 0x3c6619b2 } }, { { 0x7f2b20fb, 0xbb00e133 } },
+    { { 0xfffffd88, 0x3fefffff } }, { { 0x54442c73, 0x3e9921fb } },
+    { { 0x0dd314b2, 0x3c8619b2 } }, { { 0x619fdf6e, 0xbb174e98 } },
+    { { 0xfffff621, 0x3fefffff } }, { { 0x54442a83, 0x3ea921fb } },
+    { { 0x3764acf5, 0x3c8866c8 } }, { { 0xf5b2407f, 0xbb388215 } },
+    { { 0xffffd886, 0x3fefffff } }, { { 0x544422c2, 0x3eb921fb } },
+    { { 0x20e7a944, 0xbc8e64df } }, { { 0x7b9b9f23, 0x3b5a0961 } },
+    { { 0xffff6216, 0x3fefffff } }, { { 0x544403c1, 0x3ec921fb } },
+    { { 0x52ee25ea, 0x3c69b20e } }, { { 0x4df6a86a, 0xbb5999d9 } },
+    { { 0xfffd8858, 0x3fefffff } }, { { 0x544387ba, 0x3ed921fb } },
+    { { 0xd8910ead, 0x3c89b20f } }, { { 0x0809d04d, 0x3b77d9db } },
+    { { 0xfff62162, 0x3fefffff } }, { { 0x544197a1, 0x3ee921fb } },
+    { { 0x438d3925, 0xbc8937a8 } }, { { 0xa5d27f7a, 0xbb858b02 } },
+    { { 0xffd88586, 0x3fefffff } }, { { 0x5439d73a, 0x3ef921fb } },
+    { { 0x94b3ddd2, 0x3c8b22e4 } }, { { 0xf8a3b73d, 0xbb863c7f } },
+    { { 0xff62161a, 0x3fefffff } }, { { 0x541ad59e, 0x3f0921fb } },
+    { { 0x7ea469b2, 0xbc835c13 } }, { { 0xb8cee262, 0x3bae9860 } },
+    { { 0xfd885867, 0x3fefffff } }, { { 0x539ecf31, 0x3f1921fb } },
+    { { 0x23a32e63, 0xbc77d556 } }, { { 0xfcd23a30, 0x3b96b111 } },
+    { { 0xf621619c, 0x3fefffff } }, { { 0x51aeb57c, 0x3f2921fb } },
+    { { 0xbbbd8fe6, 0xbc87507d } }, { { 0x4916c435, 0xbbca6e1d } },
+    { { 0xd8858675, 0x3fefffff } }, { { 0x49ee4ea6, 0x3f3921fb } },
+    { { 0x54748eab, 0xbc879f0e } }, { { 0x744a453e, 0x3bde894d } },
+    { { 0x62161a34, 0x3fefffff } }, { { 0x2aecb360, 0x3f4921fb } },
+    { { 0xb1f9b9c4, 0xbc6136dc } }, { { 0x7e566b4c, 0x3be87615 } },
+    { { 0x88586ee6, 0x3feffffd } }, { { 0xaee6472e, 0x3f5921fa } },
+    { { 0xf173ae5b, 0x3c81af64 } }, { { 0x284a9df8, 0xbbfee52e } },
+    { { 0x21621d02, 0x3feffff6 } }, { { 0xbecca4ba, 0x3f6921f8 } },
+    { { 0xebc82813, 0xbc76acfc } }, { { 0x7bcab5b2, 0x3c02ba40 } },
+    { { 0x858e8a92, 0x3fefffd8 } }, { { 0xfe670071, 0x3f7921f0 } },
+    { { 0x1883bcf7, 0x3c8359c7 } }, { { 0xfe6b7a9b, 0x3bfab967 } },
+    { { 0x169b92db, 0x3fefff62 } }, { { 0xfcdec784, 0x3f8921d1 } },
+    { { 0xc81fbd0d, 0x3c85dda3 } }, { { 0xbe836d9d, 0x3c29878e } },
+    { { 0x6084cd0d, 0x3feffd88 } }, { { 0xf7a3667e, 0x3f992155 } },
+    { { 0x4556e4cb, 0xbc81354d } }, { { 0x091a0130, 0xbbfb1d63 } },
+    { { 0xe3796d7e, 0x3feff621 } }, { { 0xf10dd814, 0x3fa91f65 } },
+    { { 0x2e24aa15, 0xbc6c57bc } }, { { 0x0d569a90, 0xbc2912bd } },
+    { { 0xa3d12526, 0x3fefd88d } }, { { 0xbc29b42c, 0x3fb917a6 } },
+    { { 0x378811c7, 0xbc887df6 } }, { { 0xd26ed688, 0xbc3e2718 } },
+    { { 0xcff75cb0, 0x3fef6297 } }, { { 0x3c69a60b, 0x3fc8f8b8 } },
+    { { 0x2a361fd3, 0x3c756217 } }, { { 0xb9ff8d82, 0xbc626d19 } },
+    { { 0xcf328d46, 0x3fed906b } }, { { 0xa6aea963, 0x3fd87de2 } },
+    { { 0x10231ac2, 0x3c7457e6 } }, { { 0xd3d5a610, 0xbc672ced } },
+    { { 0x667f3bcd, 0x3fe6a09e } }, { { 0x667f3bcd, 0x3fe6a09e } },
+    { { 0x13b26456, 0xbc8bdd34 } }, { { 0x13b26456, 0xbc8bdd34 } },
+    { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x3ff00000 } },
+    { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000 } }
+};
+
+#define COS_SIN_TABLE_SIZE 260
+
+/* cos(pi*k/256), sin(pi*k/256) */
+static const FFTS_ALIGN(32) ffts_double_t cos_sin_table[COS_SIN_TABLE_SIZE] = {
+    { { 0x00000000, 0x3FF00000 } }, { { 0x00000000, 0x00000000 } },
+    { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000 } },
+    { { 0x169B92DB, 0x3FEFFF62 } }, { { 0xFCDEC784, 0x3F8921D1 } },
+    { { 0xC81FBD0D, 0x3C85DDA3 } }, { { 0xBE836D9D, 0x3C29878E } },
+    { { 0x6084CD0D, 0x3FEFFD88 } }, { { 0xF7A3667E, 0x3F992155 } },
+    { { 0x4556E4CB, 0xBC81354D } }, { { 0x091A0130, 0xBBFB1D63 } },
+    { { 0xEFFEF75D, 0x3FEFFA72 } }, { { 0x759455CD, 0x3FA2D865 } },
+    { { 0xCDB25956, 0xBC88B4CD } }, { { 0x5BA93AC0, 0x3C2686F6 } },
+    { { 0xE3796D7E, 0x3FEFF621 } }, { { 0xF10DD814, 0x3FA91F65 } },
+    { { 0x2E24AA15, 0xBC6C57BC } }, { { 0x0D569A90, 0xBC2912BD } },
+    { { 0x658E71AD, 0x3FEFF095 } }, { { 0x79F820E0, 0x3FAF656E } },
+    { { 0xE18A4B9E, 0x3C801A8C } }, { { 0xE392BFFE, 0xBC22E1EB } },
+    { { 0xAD01883A, 0x3FEFE9CD } }, { { 0x92CE19F6, 0x3FB2D520 } },
+    { { 0xD0C67E35, 0x3C6521EC } }, { { 0xA8BF6B2C, 0xBC49A088 } },
+    { { 0xFCBD5B09, 0x3FEFE1CA } }, { { 0x0A9AA419, 0x3FB5F6D0 } },
+    { { 0x202A884E, 0x3C6A23E3 } }, { { 0xD03F6C9A, 0xBC4F4022 } },
+    { { 0xA3D12526, 0x3FEFD88D } }, { { 0xBC29B42C, 0x3FB917A6 } },
+    { { 0x378811C7, 0xBC887DF6 } }, { { 0xD26ED688, 0xBC3E2718 } },
+    { { 0xFD6DA67B, 0x3FEFCE15 } }, { { 0xC79EC2D5, 0x3FBC3785 } },
+    { { 0x830D4C09, 0xBC75DD6F } }, { { 0xF133FB21, 0xBC24F39D } },
+    { { 0x70E19FD3, 0x3FEFC264 } }, { { 0x56A9730E, 0x3FBF564E } },
+    { { 0x68ECACEE, 0x3C81EC86 } }, { { 0x729AE56D, 0x3C4A2704 } },
+    { { 0x7195D741, 0x3FEFB579 } }, { { 0xCEDAF577, 0x3FC139F0 } },
+    { { 0x7397CC08, 0x3C71BFAC } }, { { 0x4D1B3CFA, 0xBC652343 } },
+    { { 0x7F08A517, 0x3FEFA755 } }, { { 0x6E8E613A, 0x3FC2C810 } },
+    { { 0xCA13571F, 0xBC87A0A8 } }, { { 0xA89A11E0, 0x3C513000 } },
+    { { 0x24C9099B, 0x3FEF97F9 } }, { { 0xB1293E5A, 0x3FC45576 } },
+    { { 0xEEA5963B, 0xBC8E2AE0 } }, { { 0x4119F7B1, 0xBC5285A2 } },
+    { { 0xFA714BA9, 0x3FEF8764 } }, { { 0x448B3FC6, 0x3FC5E214 } },
+    { { 0x778FFCB6, 0x3C7AB256 } }, { { 0x779DDAC6, 0x3C6531FF } },
+    { { 0xA3A12077, 0x3FEF7599 } }, { { 0xDE50BF31, 0x3FC76DD9 } },
+    { { 0xD743195C, 0x3C884F31 } }, { { 0xEC501B2F, 0x3C61D5EE } },
+    { { 0xCFF75CB0, 0x3FEF6297 } }, { { 0x3C69A60B, 0x3FC8F8B8 } },
+    { { 0x2A361FD3, 0x3C756217 } }, { { 0xB9FF8D82, 0xBC626D19 } },
+    { { 0x3B0B2F2D, 0x3FEF4E60 } }, { { 0x25B00451, 0x3FCA82A0 } },
+    { { 0xE695AC05, 0xBC78EE01 } }, { { 0xFFD084AD, 0xBC687905 } },
+    { { 0xAC64E589, 0x3FEF38F3 } }, { { 0x6A7E4F63, 0x3FCC0B82 } },
+    { { 0xB51F72E6, 0xBC7D7BAF } }, { { 0x9E521935, 0xBC1AF143 } },
+    { { 0xF7763ADA, 0x3FEF2252 } }, { { 0xE5454311, 0x3FCD934F } },
+    { { 0x1C8D94AB, 0xBC820CB8 } }, { { 0x277107AD, 0x3C675B92 } },
+    { { 0xFB9230D7, 0x3FEF0A7E } }, { { 0x7B215F1B, 0x3FCF19F9 } },
+    { { 0xDC6B4989, 0x3C752C7A } }, { { 0xF11DA2C4, 0xBC642DEE } },
+    { { 0xA3E473C2, 0x3FEEF178 } }, { { 0x0E37FDAE, 0x3FD04FB8 } },
+    { { 0x67FE774F, 0x3C86310A } }, { { 0xB72583CC, 0xBC0412CD } },
+    { { 0xE7684963, 0x3FEED740 } }, { { 0x62B1F677, 0x3FD111D2 } },
+    { { 0x91F59CC2, 0x3C7E82C7 } }, { { 0x0AB7AA9A, 0x3C7824C2 } },
+    { { 0xC8DF0B74, 0x3FEEBBD8 } }, { { 0x3F4CDB3E, 0x3FD1D344 } },
+    { { 0x615E7277, 0x3C7C6C8C } }, { { 0x1C13519E, 0xBC6720D4 } },
+    { { 0x56C62DDA, 0x3FEE9F41 } }, { { 0x2ED59F06, 0x3FD29406 } },
+    { { 0xE2E3F81E, 0x3C8760B1 } }, { { 0xA2C4612D, 0xBC75D28D } },
+    { { 0xAB4CD10D, 0x3FEE817B } }, { { 0xC2E18152, 0x3FD35410 } },
+    { { 0x686B5E0A, 0xBC7D0AFE } }, { { 0x2F96E062, 0xBC73CB00 } },
+    { { 0xEC48E112, 0x3FEE6288 } }, { { 0x94176601, 0x3FD4135C } },
+    { { 0xF2847754, 0xBC616B56 } }, { { 0x4AFA2518, 0x3C70C97C } },
+    { { 0x4B2BC17E, 0x3FEE426A } }, { { 0x4278E76A, 0x3FD4D1E2 } },
+    { { 0x89744882, 0x3C8A8738 } }, { { 0x18792858, 0x3C624172 } },
+    { { 0x04F686E5, 0x3FEE2121 } }, { { 0x75AB1FDD, 0x3FD58F9A } },
+    { { 0x6C126527, 0xBC8014C7 } }, { { 0xD58CF620, 0xBC1EFDC0 } },
+    { { 0x622DBE2B, 0x3FEDFEAE } }, { { 0xDD3F27C6, 0x3FD64C7D } },
+    { { 0x88425567, 0xBC8514EA } }, { { 0x4A664121, 0x3C510D2B } },
+    { { 0xB6CCC23C, 0x3FEDDB13 } }, { { 0x30FA459F, 0x3FD70885 } },
+    { { 0xC6107DB3, 0x3C883C37 } }, { { 0xE0864C5D, 0xBC744B19 } },
+    { { 0x6238A09B, 0x3FEDB652 } }, { { 0x311DCCE7, 0x3FD7C3A9 } },
+    { { 0xEAE69460, 0xBC7ADEE7 } }, { { 0x1EF3E8D9, 0x3C19A3F2 } },
+    { { 0xCF328D46, 0x3FED906B } }, { { 0xA6AEA963, 0x3FD87DE2 } },
+    { { 0x10231AC2, 0x3C7457E6 } }, { { 0xD3D5A610, 0xBC672CED } },
+    { { 0x73C9E68B, 0x3FED6961 } }, { { 0x63BC93D7, 0x3FD9372A } },
+    { { 0xC6393D55, 0xBC7E8C61 } }, { { 0x9E5AD5B1, 0x3C668431 } },
+    { { 0xD14DC93A, 0x3FED4134 } }, { { 0x43A8ED8A, 0x3FD9EF79 } },
+    { { 0x95D25AF2, 0xBC84EF52 } }, { { 0x290BDBAB, 0x3C66DA81 } },
+    { { 0x743E35DC, 0x3FED17E7 } }, { { 0x2B6D3FCA, 0x3FDAA6C8 } },
+    { { 0x3540130A, 0xBC5101DA } }, { { 0x6EE5CCF7, 0xBC7D5F10 } },
+    { { 0xF43CC773, 0x3FECED7A } }, { { 0x09E15CC0, 0x3FDB5D10 } },
+    { { 0xB5AB58AE, 0xBC5E7B6B } }, { { 0xCB974183, 0x3C65B362 } },
+    { { 0xF3FCFC5C, 0x3FECC1F0 } }, { { 0xD8011EE7, 0x3FDC1249 } },
+    { { 0x3B68F6AB, 0x3C7E5761 } }, { { 0xBB515206, 0xBC7813AA } },
+    { { 0x213411F5, 0x3FEC954B } }, { { 0x9931C45E, 0x3FDCC66E } },
+    { { 0x1E946603, 0xBC52FB76 } }, { { 0x59C37F8F, 0x3C56850E } },
+    { { 0x3488739B, 0x3FEC678B } }, { { 0x5B86E389, 0x3FDD7977 } },
+    { { 0xC7C5FF5B, 0x3C6D86CA } }, { { 0x87BC0575, 0x3C7550EC } },
+    { { 0xF180BDB1, 0x3FEC38B2 } }, { { 0x3806F63B, 0x3FDE2B5D } },
+    { { 0x757C8D07, 0xBC76E0B1 } }, { { 0x1D3C6841, 0x3C5E0D89 } },
+    { { 0x26725549, 0x3FEC08C4 } }, { { 0x52EF78D6, 0x3FDEDC19 } },
+    { { 0xD80E2946, 0x3C5B157F } }, { { 0xC33EDEE6, 0xBC7DD0F7 } },
+    { { 0xAC6F952A, 0x3FEBD7C0 } }, { { 0xDBF89ABA, 0x3FDF8BA4 } },
+    { { 0x32AC700A, 0xBC8825A7 } }, { { 0xC1B776B8, 0xBC32EC1F } },
+    { { 0x673590D2, 0x3FEBA5AA } }, { { 0x874C3EB7, 0x3FE01CFC } },
+    { { 0x370753B6, 0x3C87EA4E } }, { { 0xE7C2368C, 0xBC734A35 } },
+    { { 0x45196E3E, 0x3FEB7283 } }, { { 0x9922FFEE, 0x3FE07387 } },
+    { { 0x324E6D61, 0xBC8BC69F } }, { { 0x4347406C, 0xBC8A5A01 } },
+    { { 0x3EF55712, 0x3FEB3E4D } }, { { 0x4D5D898F, 0x3FE0C970 } },
+    { { 0xBF11A493, 0xBC8EB6B8 } }, { { 0xDE6EE9B2, 0xBC88D3D7 } },
+    { { 0x58150200, 0x3FEB090A } }, { { 0x541B4B23, 0x3FE11EB3 } },
+    { { 0x300FFCCE, 0xBC8926DA } }, { { 0x69ABE4F1, 0xBC8EF23B } },
+    { { 0x9E21D511, 0x3FEAD2BC } }, { { 0x63DEDB49, 0x3FE1734D } },
+    { { 0x07BEA548, 0xBC847FBE } }, { { 0xCCC50575, 0xBC87EEF2 } },
+    { { 0x290EA1A3, 0x3FEA9B66 } }, { { 0x39AE68C8, 0x3FE1C73B } },
+    { { 0xE8B6DAC8, 0x3C39F630 } }, { { 0x267F6600, 0x3C8B25DD } },
+    { { 0x1B02FAE2, 0x3FEA6309 } }, { { 0x9933EB59, 0x3FE21A79 } },
+    { { 0x52248D10, 0xBC7E9111 } }, { { 0x77C68FB2, 0xBC83A7B1 } },
+    { { 0xA0462782, 0x3FEA29A7 } }, { { 0x4CDD12DF, 0x3FE26D05 } },
+    { { 0x015DF175, 0xBC7128BB } }, { { 0x3EF3770C, 0xBC85DA74 } },
+    { { 0xEF29AF94, 0x3FE9EF43 } }, { { 0x25FAF3EA, 0x3FE2BEDB } },
+    { { 0xB60445C2, 0x3C7B1DFC } }, { { 0xC796EE46, 0xBC514981 } },
+    { { 0x47F38741, 0x3FE9B3E0 } }, { { 0xFCE17035, 0x3FE30FF7 } },
+    { { 0x86712474, 0xBC830EE2 } }, { { 0x26F74A6F, 0xBC6EFCC6 } },
+    { { 0xF4C7D742, 0x3FE9777E } }, { { 0xB10659F3, 0x3FE36058 } },
+    { { 0xA240665E, 0xBC815479 } }, { { 0xA35857E7, 0xBC81FCB3 } },
+    { { 0x499263FB, 0x3FE93A22 } }, { { 0x292050B9, 0x3FE3AFFA } },
+    { { 0xA920DF0B, 0x3C83D419 } }, { { 0xE3954964, 0x3C7E3E25 } },
+    { { 0xA3EF940D, 0x3FE8FBCC } }, { { 0x534556D4, 0x3FE3FED9 } },
+    { { 0x9C86F2F1, 0xBC66DFA9 } }, { { 0x608C5061, 0x3C836916 } },
+    { { 0x6B151741, 0x3FE8BC80 } }, { { 0x25091DD6, 0x3FE44CF3 } },
+    { { 0x2ED1336D, 0xBC82C5E1 } }, { { 0x2CFDC6B3, 0x3C68076A } },
+    { { 0x0FBA2EBF, 0x3FE87C40 } }, { { 0x9B9B0939, 0x3FE49A44 } },
+    { { 0x0C3F64CD, 0xBC82DABC } }, { { 0x6D719B94, 0xBC827EE1 } },
+    { { 0x0BFF976E, 0x3FE83B0E } }, { { 0xBBE3E5E9, 0x3FE4E6CA } },
+    { { 0xF8EA3475, 0xBC76F420 } }, { { 0xEDCEB327, 0x3C63C293 } },
+    { { 0xE3571771, 0x3FE7F8EC } }, { { 0x92A35596, 0x3FE53282 } },
+    { { 0xCE93C917, 0xBC89C8D8 } }, { { 0x89DA0257, 0xBC7A12EB } },
+    { { 0x226AAFAF, 0x3FE7B5DF } }, { { 0x348CECA0, 0x3FE57D69 } },
+    { { 0xACDF0AD7, 0xBC70F537 } }, { { 0x992BFBB2, 0xBC875720 } },
+    { { 0x5F037261, 0x3FE771E7 } }, { { 0xBE65018C, 0x3FE5C77B } },
+    { { 0x8D84068F, 0x3C75CFCE } }, { { 0x9C0BC32A, 0x3C8069EA } },
+    { { 0x37EFFF96, 0x3FE72D08 } }, { { 0x551D2CDF, 0x3FE610B7 } },
+    { { 0x0F1D915C, 0x3C80D4EF } }, { { 0x52FF2A37, 0xBC7251B3 } },
+    { { 0x54EAA8AF, 0x3FE6E744 } }, { { 0x25F0783D, 0x3FE65919 } },
+    { { 0xC84E226E, 0xBC8DBC03 } }, { { 0xFBF5DE23, 0x3C8C3D64 } },
+    { { 0x667F3BCD, 0x3FE6A09E } }, { { 0x667F3BCD, 0x3FE6A09E } },
+    { { 0x13B26456, 0xBC8BDD34 } }, { { 0x13B26456, 0xBC8BDD34 } }
 };
 
+/* cos(pi * x), x=[0;1/512]: coefficients of z, z^2, z^3 with z = x * x; the evaluator adds the constant term 1.0 */
+static const FFTS_ALIGN(16) ffts_double_t cos_coeff[3] = {
+    { { 0xC9BE45DE, 0xC013BD3C } },
+    { { 0x081749FA, 0x40103C1F } },
+    { { 0x047EE98B, 0xBFF55D10 } }
+};
+
+/* sin(pi * x), x=[0;1/512]: coefficients of 1, z, z^2, z^3 with z = x * x; the evaluator multiplies the sum by x */
+static const FFTS_ALIGN(16) ffts_double_t sin_coeff[4] = {
+    { { 0x54442D18, 0x400921FB } },
+    { { 0xE62154CA, 0xC014ABBC } },
+    { { 0xCEF16BFE, 0x40046675 } },
+    { { 0xADE54A87, 0x40339228 } }
+};
+
+#ifndef M_1_256
+#define M_1_256 3.90625e-3 /* 1/256 */
+#endif
+
+static int
+ffts_cexp_32f64f(size_t n, size_t d, double *out); /* forward declaration, defined after ffts_cexp_32f */
+
+/* calculate cos(2 * pi * n / d) and sin(2 * pi * n / d) with maximum error less than 1 ULP, average ~0.5 ULP; stores cosine in output[0] and sine in output[1]; returns -1 if d is zero or output is NULL, otherwise 0 */
 int
-ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, int table_size)
+ffts_cexp_32f(size_t n, size_t d, float *output)
 {
-    double alpha, beta;
-    double c[2], s[2];
-    double x, z;
-    int i;
+    double FFTS_ALIGN(16) z[2];
 
-    if (!table || !table_size) {
+    if (!d || !output)
         return -1;
+
+    /* reduction: bring n into [0, d), which the double precision helper assumes */
+    if (FFTS_UNLIKELY(n >= d))
+        n %= d;
+
+    ffts_cexp_32f64f(n, d, z);
+
+    output[0] = (float) z[0];
+    output[1] = (float) z[1];
+    return 0;
+}
+
+/* calculate cos(2 * pi * n / d) into output[0] and sin(2 * pi * n / d) into output[1] in double precision, requires n < d; used as intermediate result for single precision calculations */
+static int
+ffts_cexp_32f64f(size_t n, size_t d, double *output)
+{
+    const ffts_double_t *ct = (const ffts_double_t*) FFTS_ASSUME_ALIGNED_32(cos_sin_table);
+    const ffts_double_t *cc = (const ffts_double_t*) FFTS_ASSUME_ALIGNED_16(cos_coeff);
+    const ffts_double_t *sc = (const ffts_double_t*) FFTS_ASSUME_ALIGNED_16(sin_coeff);
+    double *out = FFTS_ASSUME_ALIGNED_16(output);
+    double c, s, cos_a, cos_b, sin_a, sin_b;
+    double cos_sign, sin_sign, sign, x, z;
+    int i, j, swap;
+
+    /* preconditions the callers are expected to satisfy */
+    FFTS_ASSUME(d > 0);
+    FFTS_ASSUME(n < d);
+
+    /* determine octant: first fold n/d > 1/2 back to d - n, which only negates the sine */
+    if (n > d - n) {
+        sin_sign = -1.0;
+        n = d - n;
+    } else {
+        sin_sign = 1.0;
     }
 
-    /* the first */
-    table[0][0] =  1.0f;
-    table[0][1] = -0.0f;
+    n <<= 1;
+    if (n > d - n) {
+        cos_sign = -1.0;
+        swap = 1;
+        n += n - d;
+    } else {
+        cos_sign = 1.0;
+        swap = 0;
+        n <<= 1;
+    }
 
-    if (FFTS_UNLIKELY(table_size == 1)) {
-        goto exit;
+    if (n > d - n) {
+        swap ^= 1;
+        n = d - n;
     }
 
-    if (FFTS_UNLIKELY(table_size == 2)) {
-        /* skip over */
-        i = 1;
-        goto mid_point;
+    /* "binary long division": six quotient bits of n / d become the table index i */
+    for (i = 0, j = (1 << 5), n <<= 1; j && n; j >>= 1) {
+        if (n > d - n) {
+            n += n - d;
+            i += j;
+        } else {
+            n <<= 1;
+        }
+    }
+
+    /* decide between two table values */
+    if (n > d - n) {
+        i++;
+        n = d - n;
+        sign = -1.0;
+    } else {
+        sign = 1.0;
     }
 
-    /* polynomial approximations calculated using Sollya */
-    x = 1.0 / table_size;
+    /* divide by 256 is exact (as is the multiply with its reciprocal) */
+    x = ((double) n / d) * M_1_256;
+
+    /* 0 <= x <= 1/512 */
     z = x * x;
 
-    /* alpha = 2 * sin(M_PI_4 / m) * sin(M_PI_4 / m) */
-    alpha = x * (1.1107207345394952717884501203293686870741139540138 +
-        z * (-0.114191397993514079911985272577099412137126013186879 +
-        z * 3.52164670852685621720746817665316575239342815885835e-3));
-    alpha = alpha * alpha;
+    /* table lookup: entries are { cos hi, sin hi, cos lo, sin lo }, so the sine is at +1 (+2 is the low word of the cosine) */
+    cos_a = ct[4 * i + 0].d;
+    sin_a = ct[4 * i + 1].d;
 
-    /* beta = sin(M_PI_2 / m) */
-    beta = x * (1.57079632679489455959753740899031981825828552246094 +
-        z * (-0.64596409735041482313988581154262647032737731933593 +
-        z * 7.9690915468332887416913479228242067620158195495605e-2));
+    /* evaluate polynomials */
+    cos_b = 1.0 + z * (cc[0].d + z * (cc[1].d + z * cc[2].d));
+    sin_b = x * (sc[0].d + z * (sc[1].d + z * (sc[2].d + z * sc[3].d)));
 
-    /* cos(0) = 1.0, sin(0) = 0.0 */
-    c[0] = 1.0;
-    s[0] = 0.0;
+    /* sum or difference of angles */
+    c = cos_a * cos_b - sign * sin_a * sin_b;
+    s = sin_a * cos_b + sign * cos_a * sin_b;
+
+    if (swap) {
+        double tmp = c;
+        c = s;
+        s = tmp;
+    }
+
+    out[0] = cos_sign * c;
+    out[1] = sin_sign * s;
+    return 0;
+}
+
+int
+ffts_generate_chirp_32f(ffts_cpx_32f *const table, size_t table_size)
+{
+    ffts_cpx_32f *lut;
+    size_t i, j, n;
 
-    /* generate sine and cosine tables with maximum error less than 1 ULP */
-    for (i = 1; i < (table_size + 1)/2; i++) {
-        c[1] = c[0] - ((alpha * c[0]) + (beta * s[0]));
-        s[1] = s[0] - ((alpha * s[0]) - (beta * c[0]));
+    if (!table || !table_size)
+        return -1;
+
+    n = 2 * table_size;
+    lut = ffts_aligned_malloc(n * sizeof(*lut));
+    if (!lut)
+        return -1;
 
-        table[i          + 0][0] = (float)  c[1];
-        table[i          + 0][1] = (float) -s[1];
-        table[table_size - i][0] = (float)  s[1];
-        table[table_size - i][1] = (float) -c[1];
+    /* initialize LUT with n = 2 * table_size entries; the generator's return value is not checked here */
+    ffts_generate_cosine_sine_32f(lut, n);
 
-        c[0] = c[1];
-        s[0] = s[1];
+    /* generate CZT sequence: table[i] = lut[(i * i) mod n], j walks the squares by adding successive odd numbers */
+    for (i = 0, j = 0; i < table_size; ++i) {
+        table[i][0] = lut[j][0];
+        table[i][1] = lut[j][1];
+
+        j += 2 * i + 1;
+        if (j >= n)
+            j -= n;
     }
 
-    if (FFTS_UNLIKELY(table_size & 1)) {
+    ffts_aligned_free(lut);
+    return 0;
+}
+
+/* generate cosine and sine tables with maximum error less than 1 ULP, average ~0.5 ULP
+*  using repeated subvector scaling algorithm, 16 - 20 times faster than
+*  direct library calling algorithm. Returns -1 if table is NULL or table_size is zero; power-of-two sizes return the result of ffts_generate_cosine_sine_pow2_32f.
+*/
+int
+ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, size_t table_size)
+{
+    ffts_cpx_64f *const tmp = (ffts_cpx_64f *const) table;
+    double FFTS_ALIGN(16) z[2], zz[2], x[2], xx[2];
+    size_t i, j, k, len;
+
+    if (!table || !table_size)
+        return -1;
+
+    if (FFTS_UNLIKELY(table_size == 1))
         goto exit;
-    }
 
-mid_point:
-    table[i][0] =  0.70710677f;
-    table[i][1] = -0.70710677f;
+    /* check if table size is a power of two; NOTE(review): the pow2 generators in this file appear to store (cos, -sin) over a quarter circle, unlike the full-circle (cos, +sin) layout produced below -- confirm callers expect this */
+    if (!(table_size & (table_size - 1)))
+        return ffts_generate_cosine_sine_pow2_32f(table, table_size);
+
+    if (!(table_size & 1)) {
+        /* even table size -- check if multiple of four */
+        if (!(table_size & 3)) {
+            /* multiple of four: work on the first quarter, tmp[len - j] mirrors tmp[j] with cosine and sine exchanged */
+            len = table_size >> 2;
+            for (j = 1; 4 * j <= len; j <<= 1) {
+                ffts_cexp_32f64f(j, table_size, z);
+
+                tmp[j][0] = z[0];
+                tmp[j][1] = z[1];
+
+                tmp[len - j][0] = z[1];
+                tmp[len - j][1] = z[0];
+
+                for (i = 1; i < j; i++) {
+                    zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+                    zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+                    tmp[j + i][0] = zz[0];
+                    tmp[j + i][1] = zz[1];
+
+                    tmp[len - j - i][0] = zz[1];
+                    tmp[len - j - i][1] = zz[0];
+                }
+            }
+
+            /* this loops zero or one times */
+            for (k = j << 1; k <= len; j <<= 1) {
+                ffts_cexp_32f64f(j, table_size, z);
+
+                tmp[j][0] = z[0];
+                tmp[j][1] = z[1];
+                if (k++ == len)
+                    break;
+
+                tmp[len - j][0] = z[1];
+                tmp[len - j][1] = z[0];
+                if (k++ == len)
+                    break;
+
+                for (i = 1; i < j; i++) {
+                    zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+                    zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+                    tmp[j + i][0] = zz[0];
+                    tmp[j + i][1] = zz[1];
+                    if (k++ == len)
+                        break;
+
+                    tmp[len - j - i][0] = zz[1];
+                    tmp[len - j - i][1] = zz[0];
+                    if (k++ == len)
+                        break;
+                }
+            }
+
+            /* convert doubles to floats in place (tmp aliases table); presumably relies on table[i] only overwriting doubles that were already read -- TODO confirm */
+            for (i = 1; i < len; i++) {
+                table[i][0] = (float) tmp[i][0];
+                table[i][1] = (float) tmp[i][1];
+            }
+
+            table[len][0] = 0.0f;
+            table[len][1] = 1.0f;
+
+            for (i = 1; i <= len; i++) {
+                table[len + i][0] = -table[i][1];
+                table[len + i][1] =  table[i][0];
+            }
+        } else {
+            /* multiple of two (but not of four): work on the first half, tmp[len - j] mirrors tmp[j] with the cosine negated */
+            len = table_size >> 1;
+            for (j = 1; 4 * j <= len; j <<= 1) {
+                ffts_cexp_32f64f(j, table_size, z);
+
+                tmp[j][0] = z[0];
+                tmp[j][1] = z[1];
+
+                tmp[len - j][0] = -z[0];
+                tmp[len - j][1] = z[1];
+
+                for (i = 1; i < j; i++) {
+                    zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+                    zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+                    tmp[j + i][0] = zz[0];
+                    tmp[j + i][1] = zz[1];
+
+                    tmp[len - j - i][0] = -zz[0];
+                    tmp[len - j - i][1] = zz[1];
+                }
+            }
+
+            /* this loops zero or one times */
+            for (k = j << 1; k <= len; j <<= 1) {
+                ffts_cexp_32f64f(j, table_size, z);
+
+                tmp[j][0] = z[0];
+                tmp[j][1] = z[1];
+                if (k++ == len)
+                    break;
+
+                tmp[len - j][0] = -z[0];
+                tmp[len - j][1] = z[1];
+                if (k++ == len)
+                    break;
+
+                for (i = 1; i < j; i++) {
+                    zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+                    zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+                    tmp[j + i][0] = zz[0];
+                    tmp[j + i][1] = zz[1];
+                    if (k++ == len)
+                        break;
+
+                    tmp[len - j - i][0] = -zz[0];
+                    tmp[len - j - i][1] = zz[1];
+                    if (k++ == len)
+                        break;
+                }
+            }
+
+            /* convert doubles to floats in place (tmp aliases table); presumably relies on table[i] only overwriting doubles that were already read -- TODO confirm */
+            for (i = 1; i < len; i++) {
+                table[i][0] = (float) tmp[i][0];
+                table[i][1] = (float) tmp[i][1];
+            }
+
+            table[len][0] = -1.0f;
+            table[len][1] = 0.0f;
+        }
+
+        /* duplicate lower half to higher as complex conjugates */
+        len = table_size >> 1;
+        for (i = 1; i < len; i++) {
+            table[table_size - i][0] = table[i][0];
+            table[table_size - i][1] = -table[i][1];
+        }
+    } else {
+        /* odd table size */
+
+        /* to avoid using temporary tables, generate the first 1/8 of table in
+        *  double precision on lower half (and using the symmetry store
+        *  the last 1/8 of table in single precision on higher half)
+        */
+        for (j = 1; 8 * j < table_size; j <<= 1) {
+            ffts_cexp_32f64f(j, table_size, z);
+
+            /* store double precision to lower half */
+            tmp[j][0] = z[0];
+            tmp[j][1] = z[1];
+
+            /* store single precision to higher half */
+            table[table_size - j][0] = (float) z[0];
+            table[table_size - j][1] = (float) -z[1];
+
+            for (i = 1; i < j; i++) {
+                /* use double precision for intermediate calculations */
+                zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+                zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+                tmp[i + j][0] = zz[0];
+                tmp[i + j][1] = zz[1];
+
+                table[table_size - i - j][0] = (float) zz[0];
+                table[table_size - i - j][1] = (float) -zz[1];
+            }
+        }
+
+        /* now generate 1/2 of table in single precision on higher half */
+        k = j << 1;
+        ffts_cexp_32f64f(j, table_size, z);
+        ffts_cexp_32f64f(k, table_size, x);
+
+        /* store single precision to higher half */
+        table[table_size - j][0] = (float) z[0];
+        table[table_size - j][1] = (float) -z[1];
+
+        table[table_size - k][0] = (float) x[0];
+        table[table_size - k][1] = (float) -x[1];
+
+        i = 1;
+        len = ((table_size + 1) >> 1) - k;
+        if (len > j) {
+            len -= j;
+
+            xx[0] = x[0] * z[0] - x[1] * z[1];
+            xx[1] = x[1] * z[0] + x[0] * z[1];
+
+            table[table_size - k - j][0] = (float) xx[0];
+            table[table_size - k - j][1] = (float) -xx[1];
+
+            for (; i < len; i++) {
+                zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+                zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+                table[table_size - i - j][0] = (float) zz[0];
+                table[table_size - i - j][1] = (float) -zz[1];
+
+                xx[0] = x[0] * tmp[i][0] - x[1] * tmp[i][1];
+                xx[1] = x[1] * tmp[i][0] + x[0] * tmp[i][1];
+
+                table[table_size - i - k][0] = (float) xx[0];
+                table[table_size - i - k][1] = (float) -xx[1];
+
+                xx[0] = x[0] * zz[0] - x[1] * zz[1];
+                xx[1] = x[1] * zz[0] + x[0] * zz[1];
+
+                table[table_size - i - k - j][0] = (float) xx[0];
+                table[table_size - i - k - j][1] = (float) -xx[1];
+            }
+
+            len = j;
+        }
+
+        for (; i < len; i++) {
+            zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+            zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+            table[table_size - i - j][0] = (float) zz[0];
+            table[table_size - i - j][1] = (float) -zz[1];
+
+            xx[0] = x[0] * tmp[i][0] - x[1] * tmp[i][1];
+            xx[1] = x[1] * tmp[i][0] + x[0] * tmp[i][1];
+
+            table[table_size - i - k][0] = (float) xx[0];
+            table[table_size - i - k][1] = (float) -xx[1];
+        }
+
+        for (; i < j; i++) {
+            zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+            zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+            table[table_size - i - j][0] = (float) zz[0];
+            table[table_size - i - j][1] = (float) -zz[1];
+        }
+
+        /* duplicate higher half to lower as complex conjugates */
+        len = table_size >> 1;
+        for (i = 1; i <= len; i++) {
+            table[i][0] = table[table_size - i][0];
+            table[i][1] = -table[table_size - i][1];
+        }
+    }
 
 exit:
+    /* cos(0) = 1.0, sin(0) = 0.0 */
+    table[0][0] = 1.0f;
+    table[0][1] = 0.0f;
     return 0;
 }
 
 /* Oscar Buneman's method for generating a sequence of sines and cosines.
 *  Expired US Patent 4,878,187 A
-*
-*  D. Potts, G. Steidl, M. Tasche, Numerical stability of fast
-*  trigonometric transforms � a worst case study,
-*  J. Concrete Appl. Math. 1 (2003) 1�36
-*
-*  O. Buneman, Stable on�line creation of sines and cosines of
-*  successive angles, Proc. IEEE 75, 1434 � 1435 (1987).
 */
 #if HAVE_SSE2
 int
@@ -227,10 +741,11 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
 {
     static const __m128d sign_swap = { 0.0, -0.0 };
     const __m128d *FFTS_RESTRICT ct;
-    const double *FFTS_RESTRICT hs;
+    const ffts_double_t *FFTS_RESTRICT cst;
+    const ffts_double_t *FFTS_RESTRICT hs;
     __m128d FFTS_ALIGN(16) w[32];
     __m128d FFTS_ALIGN(16) h[32];
-    int i, log_2, offset;
+    int i, log_2, offset, step;
 
     /* size must be a power of two */
     if (!table || !table_size || (table_size & (table_size - 1))) {
@@ -251,21 +766,42 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
         goto mid_point;
     }
 
+    cst = (const ffts_double_t*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_table);
+
+    /* generate small tables from lookup table */
+    if (table_size <= 128) {
+        step = 128 / table_size;
+
+        for (i = 1; i < table_size/2; i++) {
+            float cosine = (float) cst[4 * i * step + 0].d;
+            float sine   = (float) cst[4 * i * step + 1].d;
+
+            table[i          + 0][0] = cosine;
+            table[i          + 0][1] = -sine;
+            table[table_size - i][0] = sine;
+            table[table_size - i][1] = -cosine;
+        }
+
+        goto mid_point;
+    }
+
     /* calculate table offset */
-    FFTS_ASSUME(table_size/2 > 1);
+    FFTS_ASSUME(table_size/2 > 64);
     log_2 = ffts_ctzl(table_size);
     FFTS_ASSUME(log_2 > 1);
     offset = 32 - log_2;
+    step = log_2 - 8;
     ct = (const __m128d*)
-        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
-    hs = (const double*) &half_secant[4 * offset];
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]);
+    hs = FFTS_ASSUME_ALIGNED_16(&half_secant[2 * offset]);
 
     /* initialize from lookup table */
     for (i = 0; i <= log_2; i++) {
         w[i] = ct[2*i];
 
         /* duplicate the high part */
-        h[i] = _mm_set1_pd(hs[2*i]);
+        h[i] = _mm_set1_pd(hs[2*i].d);
     }
 
     /* generate sine and cosine tables with maximum error less than 0.5 ULP */
@@ -279,9 +815,20 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
         _mm_storel_pi((__m64*) &table[table_size - i], _mm_cvtpd_ps(
             _mm_or_pd(_mm_shuffle_pd(w[log_2], w[log_2], 1), sign_swap)));
 
-        /* skip and find next trailing zero */
-        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
-        w[log_2] = _mm_mul_pd(h[log_2], _mm_add_pd(w[log_2 + 1], w[offset]));
+        /* use lookup table when possible */
+        if (log_2 > step) {
+            offset = ((2 * i) >> step) + (4 << (log_2 - step));
+            if (offset >= COS_SIN_TABLE_SIZE) {
+                offset = COS_SIN_TABLE_SIZE - (2 << (log_2 - step)) - 4;
+                w[log_2] = _mm_loadr_pd(&cst[offset].d);
+            } else {
+                w[log_2] = _mm_load_pd(&cst[offset].d);
+            }
+        } else {
+            /* skip and find next trailing zero */
+            offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+            w[log_2] = _mm_mul_pd(h[log_2], _mm_add_pd(w[log_2 + 1], w[offset]));
+        }
     }
 
 mid_point:
@@ -297,11 +844,12 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
 {
     static const __m128d sign_swap = { 0.0, -0.0 };
     const struct ffts_dd2_t *FFTS_RESTRICT ct;
-    const double *FFTS_RESTRICT hs;
+    const ffts_double_t *FFTS_RESTRICT cst;
+    const ffts_double_t *FFTS_RESTRICT hs;
     struct ffts_dd2_t FFTS_ALIGN(16) w[32];
     struct ffts_dd2_t FFTS_ALIGN(16) h[32];
     struct ffts_dd2_t FFTS_ALIGN(16) sum;
-    int i, log_2, offset;
+    int i, log_2, offset, step;
 
     /* size must be a power of two */
     if (!table || !table_size || (table_size & (table_size - 1))) {
@@ -322,22 +870,43 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
         goto mid_point;
     }
 
+    cst = (const ffts_double_t*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_table);
+
+    /* generate small tables from lookup table */
+    if (table_size <= 128) {
+        step = 128 / table_size;
+
+        for (i = 1; i < table_size/2; i++) {
+            double cosine = cst[4 * i * step + 0].d;
+            double sine   = cst[4 * i * step + 1].d;
+
+            table[i          + 0][0] = cosine;
+            table[i          + 0][1] = -sine;
+            table[table_size - i][0] = sine;
+            table[table_size - i][1] = -cosine;
+        }
+
+        goto mid_point;
+    }
+
     /* calculate table offset */
-    FFTS_ASSUME(table_size/2 > 1);
+    FFTS_ASSUME(table_size/2 > 64);
     log_2 = ffts_ctzl(table_size);
     FFTS_ASSUME(log_2 > 1);
     offset = 32 - log_2;
+    step = log_2 - 8;
     ct = (const struct ffts_dd2_t*)
-        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
-    hs = (const double*) &half_secant[4 * offset];
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]);
+    hs = FFTS_ASSUME_ALIGNED_16(&half_secant[2 * offset]);
 
     /* initialize from lookup table */
     for (i = 0; i <= log_2; i++) {
         w[i] = ct[i];
 
         /* duplicate the high and low parts */
-        h[i].hi = _mm_set1_pd(hs[2*i + 0]);
-        h[i].lo = _mm_set1_pd(hs[2*i + 1]);
+        h[i].hi = _mm_set1_pd(hs[2*i + 0].d);
+        h[i].lo = _mm_set1_pd(hs[2*i + 1].d);
     }
 
     /* generate sine and cosine tables with maximum error less than 0.5 ULP */
@@ -351,10 +920,23 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
         _mm_store_pd((double*) &table[table_size - i],
             _mm_or_pd(_mm_shuffle_pd(w[log_2].hi, w[log_2].hi, 1), sign_swap));
 
-        /* skip and find next trailing zero */
-        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
-        sum = ffts_dd2_add_dd2_unnormalized(&w[log_2 + 1], &w[offset]);
-        w[log_2] = ffts_dd2_mul_dd2(&h[log_2], &sum);
+        /* use lookup table when possible */
+        if (log_2 > step) {
+            offset = ((2 * i) >> step) + (4 << (log_2 - step));
+            if (offset >= COS_SIN_TABLE_SIZE) {
+                offset = COS_SIN_TABLE_SIZE - (2 << (log_2 - step)) - 4;
+                w[log_2].hi = _mm_loadr_pd(&cst[offset + 0].d);
+                w[log_2].lo = _mm_loadr_pd(&cst[offset + 2].d);
+            } else {
+                w[log_2].hi = _mm_load_pd(&cst[offset + 0].d);
+                w[log_2].lo = _mm_load_pd(&cst[offset + 2].d);
+            }
+        } else {
+            /* skip and find next trailing zero */
+            offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+            sum = ffts_dd2_add_dd2_unnormalized(&w[log_2 + 1], &w[offset]);
+            w[log_2] = ffts_dd2_mul_dd2(&h[log_2], &sum);
+        }
     }
 
 mid_point:
@@ -369,9 +951,10 @@ int
 ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
 {
     const ffts_cpx_64f *FFTS_RESTRICT ct;
-    const double *FFTS_RESTRICT hs;
+    const ffts_double_t *FFTS_RESTRICT cst;
+    const ffts_double_t *FFTS_RESTRICT hs;
     ffts_cpx_64f FFTS_ALIGN(16) w[32];
-    int i, log_2, offset;
+    int i, log_2, offset, step;
 
     /* size must be a power of two */
     if (!table || !table_size || (table_size & (table_size - 1))) {
@@ -392,14 +975,35 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
         goto mid_point;
     }
 
+    cst = (const ffts_double_t*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_table);
+
+    /* generate small tables from lookup table */
+    if (table_size <= 128) {
+        step = 128 / table_size;
+
+        for (i = 1; i < table_size/2; i++) {
+            float cosine = (float) cst[4 * i * step + 0].d;
+            float sine   = (float) cst[4 * i * step + 1].d;
+
+            table[i          + 0][0] = cosine;
+            table[i          + 0][1] = -sine;
+            table[table_size - i][0] = sine;
+            table[table_size - i][1] = -cosine;
+        }
+
+        goto mid_point;
+    }
+
     /* calculate table offset */
-    FFTS_ASSUME(table_size/2 > 1);
+    FFTS_ASSUME(table_size/2 > 64);
     log_2 = ffts_ctzl(table_size);
     FFTS_ASSUME(log_2 > 1);
     offset = 32 - log_2;
+    step = log_2 - 8;
     ct = (const ffts_cpx_64f*)
-        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
-    hs = (const double*) &half_secant[4 * offset];
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]);
+    hs = FFTS_ASSUME_ALIGNED_16(&half_secant[2 * offset]);
 
     /* initialize from lookup table */
     for (i = 0; i <= log_2; i++) {
@@ -417,10 +1021,23 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
         table[table_size - i][0] = (float)  w[log_2][1];
         table[table_size - i][1] = (float) -w[log_2][0];
 
-        /* skip and find next trailing zero */
-        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
-        w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
-        w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+        /* use lookup table when possible; past the table end the mirrored entry is read with cosine and sine exchanged, as the SSE2 and 64f variants do */
+        if (log_2 > step) {
+            offset = ((2 * i) >> step) + (4 << (log_2 - step));
+            if (offset >= 260) {
+                offset = 260 - (2 << (log_2 - step)) - 4;
+                w[log_2][0] = cst[offset + 1].d;
+                w[log_2][1] = cst[offset + 0].d;
+            } else {
+                w[log_2][0] = cst[offset + 0].d;
+                w[log_2][1] = cst[offset + 1].d;
+            }
+        } else {
+            /* skip and find next trailing zero */
+            offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+            w[log_2][0] = hs[2 * log_2].d * (w[log_2 + 1][0] + w[offset][0]);
+            w[log_2][1] = hs[2 * log_2].d * (w[log_2 + 1][1] + w[offset][1]);
+        }
     }
 
 mid_point:
@@ -435,9 +1052,10 @@ int
 ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
 {
     const struct ffts_dd_t *FFTS_RESTRICT ct;
+    const ffts_double_t *FFTS_RESTRICT cst;
     const struct ffts_dd_t *FFTS_RESTRICT hs;
     struct ffts_dd_t FFTS_ALIGN(16) w[32][2];
-    int i, log_2, offset;
+    int i, log_2, offset, step;
 
     /* size must be a power of two */
     if (!table || !table_size || (table_size & (table_size - 1))) {
@@ -458,14 +1076,35 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
         goto mid_point;
     }
 
+    cst = (const ffts_double_t*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_table);
+
+    /* generate small tables from lookup table */
+    if (table_size <= 128) {
+        step = 128 / table_size;
+
+        for (i = 1; i < table_size/2; i++) {
+            double cosine = cst[4 * i * step + 0].d;
+            double sine   = cst[4 * i * step + 1].d;
+
+            table[i          + 0][0] = cosine;
+            table[i          + 0][1] = -sine;
+            table[table_size - i][0] = sine;
+            table[table_size - i][1] = -cosine;
+        }
+
+        goto mid_point;
+    }
+
     /* calculate table offset */
-    FFTS_ASSUME(table_size/2 > 1);
+    FFTS_ASSUME(table_size/2 > 64);
     log_2 = ffts_ctzl(table_size);
     FFTS_ASSUME(log_2 > 1);
     offset = 32 - log_2;
+    step = log_2 - 8;
     ct = (const struct ffts_dd_t*)
-        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
-    hs = (const struct ffts_dd_t*) &half_secant[4 * offset];
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]);
+    hs = (const struct ffts_dd_t*) &half_secant[2 * offset];
 
     /* initialize from lookup table */
     for (i = 0; i <= log_2; i++) {
@@ -486,12 +1125,29 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
         table[table_size - i][0] =  w[log_2][1].hi;
         table[table_size - i][1] = -w[log_2][0].hi;
 
-        /* skip and find next trailing zero */
-        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
-        w[log_2][0] = ffts_dd_mul_dd(hs[log_2],
-            ffts_dd_add_dd_unnormalized(w[log_2 + 1][0], w[offset][0]));
-        w[log_2][1] = ffts_dd_mul_dd(hs[log_2],
-            ffts_dd_add_dd_unnormalized(w[log_2 + 1][1], w[offset][1]));
+        /* use lookup table when possible */
+        if (log_2 > step) {
+            offset = ((2 * i) >> step) + (4 << (log_2 - step));
+            if (offset >= 260) {
+                offset = 260 - (2 << (log_2 - step)) - 4;
+                w[log_2][0].hi = cst[offset + 1].d;
+                w[log_2][1].hi = cst[offset + 0].d;
+                w[log_2][0].lo = cst[offset + 3].d;
+                w[log_2][1].lo = cst[offset + 2].d;
+            } else {
+                w[log_2][0].hi = cst[offset + 0].d;
+                w[log_2][1].hi = cst[offset + 1].d;
+                w[log_2][0].lo = cst[offset + 2].d;
+                w[log_2][1].lo = cst[offset + 3].d;
+            }
+        } else {
+            /* skip and find next trailing zero */
+            offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+            w[log_2][0] = ffts_dd_mul_dd(hs[log_2],
+                ffts_dd_add_dd_unnormalized(w[log_2 + 1][0], w[offset][0]));
+            w[log_2][1] = ffts_dd_mul_dd(hs[log_2],
+                ffts_dd_add_dd_unnormalized(w[log_2 + 1][1], w[offset][1]));
+        }
     }
 
 mid_point:
@@ -509,7 +1165,7 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
                                 int invert)
 {
     const ffts_cpx_64f *FFTS_RESTRICT ct;
-    const double *FFTS_RESTRICT hs;
+    const ffts_double_t *FFTS_RESTRICT hs;
     ffts_cpx_64f FFTS_ALIGN(16) w[32];
     int i, log_2, offset, N;
     float *A, *B;
@@ -547,8 +1203,8 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
     FFTS_ASSUME(log_2 > 2);
     offset = 34 - log_2;
     ct = (const ffts_cpx_64f*)
-        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
-    hs = (const double*) &half_secant[4 * offset];
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]);
+    hs = FFTS_ASSUME_ALIGNED_16(&half_secant[2 * offset]);
 
     /* initialize from lookup table */
     for (i = 0; i <= log_2; i++) {
@@ -556,7 +1212,6 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
         w[i][1] = ct[2*i][1];
     }
 
-    /* generate sine and cosine tables with maximum error less than 0.5 ULP */
     if (sign < 0) {
         for (i = 1; i < N/4; i++) {
             float t0, t1, t2; 
@@ -580,8 +1235,8 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
 
             /* skip and find next trailing zero */
             offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
-            w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
-            w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+            w[log_2][0] = hs[2 * log_2].d * (w[log_2 + 1][0] + w[offset][0]);
+            w[log_2][1] = hs[2 * log_2].d * (w[log_2 + 1][1] + w[offset][1]);
         }
     } else {
         for (i = 1; i < N/4; i++) {
@@ -606,8 +1261,8 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
 
             /* skip and find next trailing zero */
             offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
-            w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
-            w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+            w[log_2][0] = hs[2 * log_2].d * (w[log_2 + 1][0] + w[offset][0]);
+            w[log_2][1] = hs[2 * log_2].d * (w[log_2 + 1][1] + w[offset][1]);
         }
     }
 
@@ -625,4 +1280,4 @@ last:
     }
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/lib/ffts/src/ffts_trig.h b/lib/ffts/src/ffts_trig.h
index 0b22738..f988340 100644
--- a/lib/ffts/src/ffts_trig.h
+++ b/lib/ffts/src/ffts_trig.h
@@ -2,7 +2,7 @@
 
 This file is part of FFTS -- The Fastest Fourier Transform in the South
 
-Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+Copyright (c) 2015-2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
 
 All rights reserved.
 
@@ -39,8 +39,16 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "ffts_internal.h"
 
+/* calculate cos(pi * n / d) and sin(pi * n / d) with maximum error less than 1 ULP, average ~0.5 ULP */
 int
-ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, int table_size);
+ffts_cexp_32f(size_t n, size_t d, float *output);
+
+int
+ffts_generate_chirp_32f(ffts_cpx_32f *const table, size_t table_size);
+
+/* generate cosine and sine tables with maximum error less than 1 ULP, average ~0.5 ULP */
+int
+ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, size_t table_size);
 
 int
 ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size);
diff --git a/lib/ffts/src/macros-alpha.h b/lib/ffts/src/macros-alpha.h
index f7795d4..c32d1e9 100644
--- a/lib/ffts/src/macros-alpha.h
+++ b/lib/ffts/src/macros-alpha.h
@@ -58,9 +58,6 @@ typedef union {
     uint32_t u[4];
 } V4SF;
 
-#define FFTS_MALLOC(d,a) (malloc(d))
-#define FFTS_FREE(d) (free(d))
-
 static FFTS_ALWAYS_INLINE V4SF
 V4SF_LIT4(float f3, float f2, float f1, float f0)
 {
diff --git a/lib/ffts/src/macros-altivec.h b/lib/ffts/src/macros-altivec.h
index 28f552f..33f2346 100644
--- a/lib/ffts/src/macros-altivec.h
+++ b/lib/ffts/src/macros-altivec.h
@@ -4,6 +4,7 @@
   
  Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> 
  Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2019, Timothy Pearson <tpearson@raptorengineering.com>
  
  All rights reserved.
 
@@ -39,99 +40,89 @@
 
 #define restrict
 
-typedef vector float V;
+typedef vector float V4SF;
 typedef vector unsigned char VUC;
 
-#ifdef __apple__
-#define FFTS_MALLOC(d,a) vec_malloc(d)
-#define FFTS_FREE(d) vec_free(d)
-#else
-/* It appears vec_malloc() and friends are not implemented on Linux */
-#include <malloc.h>
-#define FFTS_MALLOC(d,a) memalign(16,d)
-#define FFTS_FREE(d) free(d)
-#endif
-
-#define VLIT4(f0,f1,f2,f3) ((V){f0, f1, f2, f3})
+#define V4SF_LIT4(f0,f1,f2,f3) ((V4SF){f0, f1, f2, f3})
 
-#define VADD(x,y) vec_add(x,y)
-#define VSUB(x,y) vec_sub(x,y)
-#define VMUL(x,y) vec_madd(x,y,(V){0})
-#define VMULADD(x,y,z) vec_madd(x,y,z)
-#define VNMULSUB(x,y,z) vec_nmsub(x,y,z)
-#define VXOR(x,y) vec_xor((x),(y))
-#define VSWAPPAIRS(x)						\
+#define V4SF_ADD(x,y) vec_add(x,y)
+#define V4SF_SUB(x,y) vec_sub(x,y)
+#define V4SF_MUL(x,y) vec_madd(x,y,(V4SF){0})
+#define V4SF_MULADD(x,y,z) vec_madd(x,y,z)
+#define V4SF_NMULSUB(x,y,z) vec_nmsub(x,y,z)
+#define V4SF_XOR(x,y) vec_xor((x),(y))
+#define V4SF_SWAPPAIRS(x)					\
     vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03,	\
 		       0x0c,0x0d,0x0e,0x0f,0x08,0x09,0x0a,0x0b})
 
-#define VBLEND(x,y)						\
+#define V4SF_BLEND(x,y)						\
     vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,	\
 		       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
 
-#define VUNPACKHI(x,y)						\
+#define V4SF_UNPACK_HI(x,y)					\
     vec_perm(x,y,(VUC){0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,	\
 		       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
 
-#define VUNPACKLO(x,y)						\
+#define V4SF_UNPACK_LO(x,y)					\
     vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,	\
 		       0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17})
 
-#define VDUPRE(x)						\
+#define V4SF_DUPLICATE_RE(x)					\
     vec_perm(x,x,(VUC){0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03,	\
 		       0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b})
 
-#define VDUPIM(x)						\
+#define V4SF_DUPLICATE_IM(x)					\
     vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07,	\
 		       0x1c,0x1d,0x1e,0x1f,0x1c,0x1d,0x1e,0x1f})
 
 
-static inline V IMUL(V d, V re, V im)
+static inline V4SF V4SF_IMUL(V4SF d, V4SF re, V4SF im)
 {
-    im = VMUL(im, VSWAPPAIRS(d));
-    re = VMUL(re, d);
-    return VSUB(re, im);  
+    im = V4SF_MUL(im, V4SF_SWAPPAIRS(d));
+    re = V4SF_MUL(re, d);
+    return V4SF_SUB(re, im);
 }
 
 
-static inline V IMULJ(V d, V re, V im)
+static inline V4SF V4SF_IMULJ(V4SF d, V4SF re, V4SF im)
 {
-    im = VMUL(im, VSWAPPAIRS(d));
-    return VMULADD(re, d, im);
+    im = V4SF_MUL(im, V4SF_SWAPPAIRS(d));
+    return V4SF_MULADD(re, d, im);
 }
 
 #ifndef __GNUC__
 /* gcc (4.6 and 4.7) ICEs on this code! */
-static inline V MULI(int inv, V x)
+static inline V4SF MULI(int inv, V4SF x)
 {
-    return VXOR(x, inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f));
+    return V4SF_XOR(x, inv ? V4SF_LIT4(-0.0f,0.0f,-0.0f,0.0f) : V4SF_LIT4(0.0f,-0.0f,0.0f,-0.0f));
 }
 #else
 /* but compiles this fine... */
-static inline V MULI(int inv, V x)
+static inline V4SF MULI(int inv, V4SF x)
 {
-    V t;
-    t = inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f);
-    return VXOR(x, t);
+    V4SF t;
+    t = inv ? V4SF_LIT4(-0.0f,0.0f,-0.0f,0.0f) : V4SF_LIT4(0.0f,-0.0f,0.0f,-0.0f);
+    return V4SF_XOR(x, t);
 }
 #endif
 
 
-static inline V IMULI(int inv, V x)
+static inline V4SF V4SF_IMULI(int inv, V4SF x)
 {
-    return VSWAPPAIRS(MULI(inv, x));
+    return V4SF_SWAPPAIRS(MULI(inv, x));
 }
 
 
-static inline V VLD(const void *s)
+static inline V4SF V4SF_LD(const void *s)
 {
-    V *d = (V *)s;
+    const V4SF *d = (const V4SF *)s; /* load only: keep the const qualifier */
     return *d;
 }
 
 
-static inline void VST(void *d, V s)
+static inline void V4SF_ST(void *d, V4SF s)
 {
-    V *r = (V *)d;
+    V4SF *r = (V4SF *)d;
     *r = s;
 }
 #endif
diff --git a/lib/ffts/src/macros-neon.h b/lib/ffts/src/macros-neon.h
index 29aa49f..f0d1fff 100644
--- a/lib/ffts/src/macros-neon.h
+++ b/lib/ffts/src/macros-neon.h
@@ -39,9 +39,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <stdlib.h>
 #endif
 
-#define FFTS_MALLOC(d,a) (valloc(d))
-#define FFTS_FREE(d) (free(d))
-
 typedef float32x4_t   V4SF;
 typedef float32x4x2_t V4SF2;
 
diff --git a/lib/ffts/src/macros-sse.h b/lib/ffts/src/macros-sse.h
index 827aa67..46e1f29 100644
--- a/lib/ffts/src/macros-sse.h
+++ b/lib/ffts/src/macros-sse.h
@@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
 
 Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
 Copyright (c) 2012, The University of Waikato
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
 
 All rights reserved.
 
@@ -40,9 +41,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <xmmintrin.h>
 
-#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
-#define FFTS_FREE(d) (_mm_free(d))
-
 typedef __m128 V4SF;
 
 #define V4SF_ADD  _mm_add_ps
@@ -56,8 +54,9 @@ typedef __m128 V4SF;
 #define V4SF_SWAP_PAIRS(x) \
     (_mm_shuffle_ps(x, x, _MM_SHUFFLE(2,3,0,1)))
 
+/* note: _mm_movehl_ps(a, b) yields {b2, b3, a2, a3}, so the operands are passed swapped */
 #define V4SF_UNPACK_HI(x,y) \
-    (_mm_shuffle_ps(x, y, _MM_SHUFFLE(3,2,3,2)))
+    (_mm_movehl_ps(y, x))
 
 #define V4SF_UNPACK_LO(x,y) \
     (_mm_movelh_ps(x, y))
@@ -97,4 +96,220 @@ V4SF_IMULJ(V4SF d, V4SF re, V4SF im)
     return V4SF_ADD(re, im);
 }
 
+#ifdef FFTS_DOUBLE
+typedef union {
+    struct {
+        double r1;
+        double i1;
+        double r2;
+        double i2;
+    } r;
+    uint32_t u[8];
+} V4DF;
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_LIT4(double f3, double f2, double f1, double f0)
+{
+    V4DF z;
+
+    z.r.r1 = f0;
+    z.r.i1 = f1;
+    z.r.r2 = f2;
+    z.r.i2 = f3;
+
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_ADD(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r1 + y.r.r1;
+    z.r.i1 = x.r.i1 + y.r.i1;
+    z.r.r2 = x.r.r2 + y.r.r2;
+    z.r.i2 = x.r.i2 + y.r.i2;
+
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_SUB(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r1 - y.r.r1;
+    z.r.i1 = x.r.i1 - y.r.i1;
+    z.r.r2 = x.r.r2 - y.r.r2;
+    z.r.i2 = x.r.i2 - y.r.i2;
+
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_MUL(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r1 * y.r.r1;
+    z.r.i1 = x.r.i1 * y.r.i1;
+    z.r.r2 = x.r.r2 * y.r.r2;
+    z.r.i2 = x.r.i2 * y.r.i2;
+
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_XOR(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.u[0] = x.u[0] ^ y.u[0];
+    z.u[1] = x.u[1] ^ y.u[1];
+    z.u[2] = x.u[2] ^ y.u[2];
+    z.u[3] = x.u[3] ^ y.u[3];
+    z.u[4] = x.u[4] ^ y.u[4];
+    z.u[5] = x.u[5] ^ y.u[5];
+    z.u[6] = x.u[6] ^ y.u[6];
+    z.u[7] = x.u[7] ^ y.u[7];
+
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_SWAP_PAIRS(V4DF x)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.i1;
+    z.r.i1 = x.r.r1;
+    z.r.r2 = x.r.i2;
+    z.r.i2 = x.r.r2;
+
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_BLEND(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r1;
+    z.r.i1 = x.r.i1;
+    z.r.r2 = y.r.r2;
+    z.r.i2 = y.r.i2;
+
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_UNPACK_HI(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r2;
+    z.r.i1 = x.r.i2;
+    z.r.r2 = y.r.r2;
+    z.r.i2 = y.r.i2;
+
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_UNPACK_LO(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r1;
+    z.r.i1 = x.r.i1;
+    z.r.r2 = y.r.r1;
+    z.r.i2 = y.r.i1;
+
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_DUPLICATE_RE(V4DF x)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r1;
+    z.r.i1 = x.r.r1;
+    z.r.r2 = x.r.r2;
+    z.r.i2 = x.r.r2;
+
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_DUPLICATE_IM(V4DF x)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.i1;
+    z.r.i1 = x.r.i1;
+    z.r.r2 = x.r.i2;
+    z.r.i2 = x.r.i2;
+
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_IMUL(V4DF d, V4DF re, V4DF im)
+{
+    re = V4DF_MUL(re, d);
+    im = V4DF_MUL(im, V4DF_SWAP_PAIRS(d));
+    return V4DF_SUB(re, im);
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_IMULJ(V4DF d, V4DF re, V4DF im)
+{
+    re = V4DF_MUL(re, d);
+    im = V4DF_MUL(im, V4DF_SWAP_PAIRS(d));
+    return V4DF_ADD(re, im);
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_MULI(int inv, V4DF x)
+{
+    V4DF z;
+
+    if (inv) {
+        z.r.r1 = -x.r.r1;
+        z.r.i1 =  x.r.i1;
+        z.r.r2 = -x.r.r2;
+        z.r.i2 =  x.r.i2;
+    } else {
+        z.r.r1 =  x.r.r1;
+        z.r.i1 = -x.r.i1;
+        z.r.r2 =  x.r.r2;
+        z.r.i2 = -x.r.i2;
+    }
+
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_IMULI(int inv, V4DF x)
+{
+    return V4DF_SWAP_PAIRS(V4DF_MULI(inv, x));
+}
+
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_LD(const void *s)
+{
+    V4DF z;
+    memcpy(&z, s, sizeof(z));
+    return z;
+}
+
+static FFTS_ALWAYS_INLINE void
+V4DF_ST(void *d, V4DF s)
+{
+    V4DF *r = (V4DF*) d;
+    *r = s;
+}
+#endif
+
 #endif /* FFTS_MACROS_SSE_H */
diff --git a/lib/ffts/src/macros.h b/lib/ffts/src/macros.h
index e7e349f..99b0c53 100644
--- a/lib/ffts/src/macros.h
+++ b/lib/ffts/src/macros.h
@@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
 
 Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
 Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
 
 All rights reserved.
 
@@ -41,14 +42,29 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef HAVE_NEON
 #include "macros-neon.h"
 #elif HAVE_SSE
+#ifdef HAVE_AVX
+#include "macros-avx.h"
+#else
 #include "macros-sse.h"
+#endif
-// NOTE: AltiVec support disabled until updated to provide new V4SF variable type
+// NOTE: AltiVec support relies on macros-altivec.h providing the V4SF type and V4SF_* macros
-//#elif __powerpc__
-//#include "macros-altivec.h"
+#elif __powerpc__
+#include "macros-altivec.h"
 #else
 #include "macros-alpha.h"
 #endif
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_TX2(V4DF *a, V4DF *b)
+{
+    V4DF t0 = V4DF_UNPACK_LO(*a, *b);
+    V4DF t1 = V4DF_UNPACK_HI(*a, *b);
+    *a = t0;
+    *b = t1;
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_TX2(V4SF *a, V4SF *b)
 {
@@ -58,6 +74,34 @@ V4SF_TX2(V4SF *a, V4SF *b)
     *b = t1;
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_K_N(int inv,
+         V4DF re,
+         V4DF im,
+         V4DF *r0,
+         V4DF *r1,
+         V4DF *r2,
+         V4DF *r3)
+{
+    V4DF uk, uk2, zk_p, zk_n, zk, zk_d;
+
+    uk  = *r0;
+    uk2 = *r1;
+
+    zk_p = V4DF_IMUL(*r2, re, im);
+    zk_n = V4DF_IMULJ(*r3, re, im);
+
+    zk   = V4DF_ADD(zk_p, zk_n);
+    zk_d = V4DF_IMULI(inv, V4DF_SUB(zk_p, zk_n));
+
+    *r2 = V4DF_SUB(uk, zk);
+    *r0 = V4DF_ADD(uk, zk);
+    *r3 = V4DF_ADD(uk2, zk_d);
+    *r1 = V4DF_SUB(uk2, zk_d);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_K_N(int inv,
          V4SF re,
@@ -84,6 +128,45 @@ V4SF_K_N(int inv,
     *r1 = V4SF_SUB(uk2, zk_d);
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_L_2_4(int inv,
+           const double *FFTS_RESTRICT i0,
+           const double *FFTS_RESTRICT i1,
+           const double *FFTS_RESTRICT i2,
+           const double *FFTS_RESTRICT i3,
+           V4DF *r0,
+           V4DF *r1,
+           V4DF *r2,
+           V4DF *r3)
+{
+    V4DF t0, t1, t2, t3, t4, t5, t6, t7;
+
+    t0 = V4DF_LD(i0);
+    t1 = V4DF_LD(i1);
+    t2 = V4DF_LD(i2);
+    t3 = V4DF_LD(i3);
+
+    t4 = V4DF_ADD(t0, t1);
+    t5 = V4DF_SUB(t0, t1);
+    t6 = V4DF_ADD(t2, t3);
+    t7 = V4DF_SUB(t2, t3);
+
+    *r0 = V4DF_UNPACK_LO(t4, t5);
+    *r1 = V4DF_UNPACK_LO(t6, t7);
+
+    t5 = V4DF_IMULI(inv, t5);
+
+    t0 = V4DF_ADD(t6, t4);
+    t2 = V4DF_SUB(t6, t4);
+    t1 = V4DF_SUB(t7, t5);
+    t3 = V4DF_ADD(t7, t5);
+
+    *r3 = V4DF_UNPACK_HI(t0, t1);
+    *r2 = V4DF_UNPACK_HI(t2, t3);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_L_2_4(int inv,
            const float *FFTS_RESTRICT i0,
@@ -121,6 +204,46 @@ V4SF_L_2_4(int inv,
     *r2 = V4SF_UNPACK_HI(t2, t3);
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_L_4_4(int inv,
+           const double *FFTS_RESTRICT i0,
+           const double *FFTS_RESTRICT i1,
+           const double *FFTS_RESTRICT i2,
+           const double *FFTS_RESTRICT i3,
+           V4DF *r0,
+           V4DF *r1,
+           V4DF *r2,
+           V4DF *r3)
+{
+    V4DF t0, t1, t2, t3, t4, t5, t6, t7;
+
+    t0 = V4DF_LD(i0);
+    t1 = V4DF_LD(i1);
+    t2 = V4DF_LD(i2);
+    t3 = V4DF_LD(i3);
+
+    t4 = V4DF_ADD(t0, t1);
+    t5 = V4DF_SUB(t0, t1);
+    t6 = V4DF_ADD(t2, t3);
+
+    t7 = V4DF_IMULI(inv, V4DF_SUB(t2, t3));
+
+    t0 = V4DF_ADD(t4, t6);
+    t2 = V4DF_SUB(t4, t6);
+    t1 = V4DF_SUB(t5, t7);
+    t3 = V4DF_ADD(t5, t7);
+
+    V4DF_TX2(&t0, &t1);
+    V4DF_TX2(&t2, &t3);
+
+    *r0 = t0;
+    *r2 = t1;
+    *r1 = t2;
+    *r3 = t3;
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_L_4_4(int inv,
            const float *FFTS_RESTRICT i0,
@@ -159,6 +282,48 @@ V4SF_L_4_4(int inv,
     *r3 = t3;
 }
 
+#ifdef FFTS_DOUBLE
+static FFTS_INLINE void
+V4DF_L_4_2(int inv,
+           const double *FFTS_RESTRICT i0,
+           const double *FFTS_RESTRICT i1,
+           const double *FFTS_RESTRICT i2,
+           const double *FFTS_RESTRICT i3,
+           V4DF *r0,
+           V4DF *r1,
+           V4DF *r2,
+           V4DF *r3)
+{
+    V4DF t0, t1, t2, t3, t4, t5, t6, t7;
+
+    t0 = V4DF_LD(i0);
+    t1 = V4DF_LD(i1);
+    t6 = V4DF_LD(i2);
+    t7 = V4DF_LD(i3);
+
+    t2 = V4DF_BLEND(t6, t7);
+    t3 = V4DF_BLEND(t7, t6);
+
+    t4 = V4DF_ADD(t0, t1);
+    t5 = V4DF_SUB(t0, t1);
+    t6 = V4DF_ADD(t2, t3);
+    t7 = V4DF_SUB(t2, t3);
+
+    *r2 = V4DF_UNPACK_HI(t4, t5);
+    *r3 = V4DF_UNPACK_HI(t6, t7);
+
+    t7 = V4DF_IMULI(inv, t7);
+
+    t0 = V4DF_ADD(t4, t6);
+    t2 = V4DF_SUB(t4, t6);
+    t1 = V4DF_SUB(t5, t7);
+    t3 = V4DF_ADD(t5, t7);
+
+    *r0 = V4DF_UNPACK_LO(t0, t1);
+    *r1 = V4DF_UNPACK_LO(t2, t3);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_L_4_2(int inv,
            const float *FFTS_RESTRICT i0,
@@ -199,6 +364,9 @@ V4SF_L_4_2(int inv,
     *r1 = V4SF_UNPACK_LO(t2, t3);
 }
 
+#define V4DF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \
+    V4DF_ST(o0, r0); V4DF_ST(o1, r1); V4DF_ST(o2, r2); V4DF_ST(o3, r3);
+
 #define V4SF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \
     V4SF_ST(o0, r0); V4SF_ST(o1, r1); V4SF_ST(o2, r2); V4SF_ST(o3, r3);
 
-- 
cgit v1.2.1