From 2ef6dba8728db2437def9a4fc1d3e20e0aa44c31 Mon Sep 17 00:00:00 2001 From: Timothy Pearson Date: Sat, 24 Aug 2024 13:04:45 -0500 Subject: Revup FFTS to latest upstream version Taken from https://github.com/linkotec/ffts Fixes ppc64el support and a handful of other bugs --- lib/ffts/CMakeLists.txt | 158 +++++- lib/ffts/config.guess | 463 ++++++++++-------- lib/ffts/config.sub | 139 +++--- lib/ffts/ffts.pc.cmake.in | 6 +- lib/ffts/include/ffts.h | 4 + lib/ffts/src/Makefile.am | 4 +- lib/ffts/src/codegen.c | 4 +- lib/ffts/src/codegen_sse.h | 64 +-- lib/ffts/src/ffts.c | 292 +++++++++++- lib/ffts/src/ffts_chirp_z.c | 225 +++++++++ lib/ffts/src/ffts_chirp_z.h | 45 ++ lib/ffts/src/ffts_cpu.c | 371 +++++++++++++++ lib/ffts/src/ffts_cpu.h | 54 +++ lib/ffts/src/ffts_internal.h | 123 ++++- lib/ffts/src/ffts_real.c | 218 +++++++-- lib/ffts/src/ffts_static.c | 586 +++++++++++++++++++++-- lib/ffts/src/ffts_static.h | 24 + lib/ffts/src/ffts_trig.c | 1057 +++++++++++++++++++++++++++++++++-------- lib/ffts/src/ffts_trig.h | 12 +- lib/ffts/src/macros-alpha.h | 3 - lib/ffts/src/macros-altivec.h | 77 ++- lib/ffts/src/macros-neon.h | 3 - lib/ffts/src/macros-sse.h | 223 ++++++++- lib/ffts/src/macros.h | 172 ++++++- 24 files changed, 3620 insertions(+), 707 deletions(-) create mode 100644 lib/ffts/src/ffts_chirp_z.c create mode 100644 lib/ffts/src/ffts_chirp_z.h create mode 100644 lib/ffts/src/ffts_cpu.c create mode 100644 lib/ffts/src/ffts_cpu.h diff --git a/lib/ffts/CMakeLists.txt b/lib/ffts/CMakeLists.txt index 459655e..748f412 100644 --- a/lib/ffts/CMakeLists.txt +++ b/lib/ffts/CMakeLists.txt @@ -7,7 +7,7 @@ set(FFTS_MAJOR 0) set(FFTS_MINOR 9) set(FFTS_MICRO 0) -set(FFTS_VERSION "ffts-${FFTS_MAJOR}.${FFTS_MINOR}.${FFTS_MICRO}") +set(FFTS_VERSION "${FFTS_MAJOR}.${FFTS_MINOR}.${FFTS_MICRO}") set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) set_property(GLOBAL PROPERTY USE_FOLDERS ON) @@ -22,6 +22,16 @@ set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/include/ffts) set(LIB_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib) # common options + +# !!!! FOR TESTING ONLY !!!! +option(ENABLE_AVX + "Enables AVX instructions." OFF +) +# !!!! FOR TESTING ONLY !!!! +option(ENABLE_DOUBLE + "Enables double precision" OFF +) + option(ENABLE_NEON "Enables the use of NEON instructions." OFF ) @@ -48,24 +58,36 @@ option(ENABLE_STATIC include(CheckCSourceCompiles) include(CheckCSourceRuns) +include(CheckFunctionExists) include(CheckIncludeFile) +include(CheckSymbolExists) # Ensure defined when building FFTS (as opposed to using it from # another project). Used to export functions from Windows DLL. add_definitions(-DFFTS_BUILD) # check existence of various headers -check_include_file(malloc.h HAVE_MALLOC_H) -check_include_file(stdint.h HAVE_STDINT_H) -check_include_file(stdlib.h HAVE_STDLIB_H) -check_include_file(string.h HAVE_STRING_H) -check_include_file(sys/mman.h HAVE_SYS_MMAN_H) -check_include_file(unistd.h HAVE_UNISTD_H) +check_include_file(inttypes.h HAVE_INTTYPES_H) +check_include_file(malloc.h HAVE_MALLOC_H) +check_include_file(mm_malloc.h HAVE_MM_MALLOC_H) +check_include_file(stdint.h HAVE_STDINT_H) +check_include_file(stdlib.h HAVE_STDLIB_H) +check_include_file(string.h HAVE_STRING_H) +check_include_file(sys/mman.h HAVE_SYS_MMAN_H) +check_include_file(unistd.h HAVE_UNISTD_H) + +if(HAVE_INTTYPES_H) + add_definitions(-DHAVE_INTTYPES_H) +endif(HAVE_INTTYPES_H) if(HAVE_MALLOC_H) add_definitions(-DHAVE_MALLOC_H) endif(HAVE_MALLOC_H) +if(HAVE_MM_MALLOC_H) + add_definitions(-DHAVE_MM_MALLOC_H) +endif(HAVE_MM_MALLOC_H) + if(HAVE_STDINT_H) add_definitions(-DHAVE_STDINT_H) endif(HAVE_STDINT_H) @@ -86,6 +108,50 @@ if(HAVE_UNISTD_H) add_definitions(-DHAVE_UNISTD_H) endif(HAVE_UNISTD_H) +# check existence of various declarations +check_symbol_exists(memalign malloc.h HAVE_DECL_MEMALIGN) +check_symbol_exists(posix_memalign stdlib.h HAVE_DECL_POSIX_MEMALIGN) +check_symbol_exists(valloc stdlib.h HAVE_DECL_VALLOC) +check_symbol_exists(_mm_malloc malloc.h HAVE_DECL__MM_MALLOC) + +if(HAVE_DECL_MEMALIGN) + add_definitions(-DHAVE_DECL_MEMALIGN) +endif(HAVE_DECL_MEMALIGN) + +if(HAVE_DECL_POSIX_MEMALIGN) + add_definitions(-DHAVE_DECL_POSIX_MEMALIGN) +endif(HAVE_DECL_POSIX_MEMALIGN) + +if(HAVE_DECL_VALLOC) + add_definitions(-DHAVE_DECL_VALLOC) +endif(HAVE_DECL_VALLOC) + +if(HAVE_DECL__MM_MALLOC) + add_definitions(-DHAVE_DECL__MM_MALLOC) +endif(HAVE_DECL__MM_MALLOC) + +# check existence of various functions +check_function_exists(memalign HAVE_MEMALIGN) +check_function_exists(posix_memalign HAVE_POSIX_MEMALIGN) +check_function_exists(valloc HAVE_VALLOC) +check_function_exists(_mm_malloc HAVE__MM_MALLOC) + +if(HAVE_MEMALIGN) + add_definitions(-DHAVE_MEMALIGN) +endif(HAVE_MEMALIGN) + +if(HAVE_POSIX_MEMALIGN) + add_definitions(-DHAVE_POSIX_MEMALIGN) +endif(HAVE_POSIX_MEMALIGN) + +if(HAVE_VALLOC) + add_definitions(-DHAVE_VALLOC) +endif(HAVE_VALLOC) + +if(HAVE__MM_MALLOC) + add_definitions(-DHAVE__MM_MALLOC) +endif(HAVE__MM_MALLOC) + # backup flags set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) @@ -246,6 +312,14 @@ if(NOT CMAKE_CROSSCOMPILING) if(HAVE_XMMINTRIN_H) add_definitions(-DHAVE_SSE) set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + + # TODO: not the right place + if(ENABLE_AVX) + add_definitions(-DHAVE_AVX) + endif(ENABLE_AVX) + if(ENABLE_DOUBLE) + add_definitions(-DFFTS_DOUBLE) + endif(ENABLE_DOUBLE) endif(HAVE_XMMINTRIN_H) # enable SSE2 code generation @@ -351,6 +425,10 @@ set(FFTS_HEADERS set(FFTS_SOURCES src/ffts_attributes.h src/ffts.c + src/ffts_chirp_z.c + src/ffts_chirp_z.h + src/ffts_cpu.c + src/ffts_cpu.h src/ffts_internal.h src/ffts_nd.c src/ffts_nd.h @@ -369,6 +447,17 @@ set(FFTS_SOURCES src/types.h ) +if(NOT DISABLE_DYNAMIC_CODE) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") + list(APPEND FFTS_SOURCES + src/codegen_sse.h + ) + else() + message(WARNING "Dynamic code is only supported with x64, disabling dynamic code.") + set(DISABLE_DYNAMIC_CODE ON) + endif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") +endif(NOT DISABLE_DYNAMIC_CODE) + if(ENABLE_NEON) list(APPEND FFTS_SOURCES src/neon.s @@ -393,19 +482,9 @@ elseif(HAVE_XMMINTRIN_H) add_definitions(-DHAVE_SSE) list(APPEND FFTS_SOURCES + src/macros-avx.h src/macros-sse.h ) - - if(NOT DISABLE_DYNAMIC_CODE) - if(CMAKE_SIZEOF_VOID_P EQUAL 8) - list(APPEND FFTS_SOURCES - src/codegen_sse.h - ) - else() - message(WARNING "Dynamic code is only supported with x64, disabling dynamic code.") - set(DISABLE_DYNAMIC_CODE ON) - endif(CMAKE_SIZEOF_VOID_P EQUAL 8) - endif(NOT DISABLE_DYNAMIC_CODE) endif(ENABLE_NEON) if(DISABLE_DYNAMIC_CODE) @@ -452,6 +531,41 @@ if(ENABLE_STATIC) endif(ENABLE_STATIC) if(ENABLE_STATIC OR ENABLE_SHARED) + find_path(MPFR_INCLUDES + NAMES mpfr.h + PATHS ${INCLUDE_INSTALL_DIR} + ) + find_library(MPFR_LIBRARIES mpfr PATHS ${LIB_INSTALL_DIR}) + find_package(OpenMP) + + if(MPFR_INCLUDES) + add_definitions(-DHAVE_MPFR_H) + include_directories(${MPFR_INCLUDES}) + endif(MPFR_INCLUDES) + + add_executable(ffts_trig_test + tests/trig_test.c + ) + + target_link_libraries(ffts_trig_test ffts) + if(MPFR_LIBRARIES) + target_link_libraries(ffts_trig_test ${MPFR_LIBRARIES}) + endif(MPFR_LIBRARIES) + + if(OPENMP_FOUND) + if(MSVC) + set_target_properties(ffts_trig_test PROPERTIES + COMPILE_FLAGS "${OpenMP_C_FLAGS}" + LINK_FLAGS "${OpenMP_EXE_LINKER_FLAGS}" + ) + else() + set_target_properties(ffts_trig_test PROPERTIES + COMPILE_FLAGS "${OpenMP_C_FLAGS}" + LINK_FLAGS "${OpenMP_C_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}" + ) + endif(MSVC) + endif(OPENMP_FOUND) + add_executable(ffts_test tests/test.c ) @@ -467,6 +581,14 @@ if(ENABLE_STATIC OR ENABLE_SHARED) ffts ${FFTS_EXTRA_LIBRARIES} ) + + add_executable(ffts_cpu_test + src/ffts_cpu.c + src/ffts_cpu.h + tests/cpu_test.c + ) + + set_target_properties(ffts_cpu_test PROPERTIES COMPILE_DEFINITIONS FFTS_BUILDING_CPU_TEST) endif(ENABLE_STATIC OR ENABLE_SHARED) # generate packageconfig file diff --git a/lib/ffts/config.guess b/lib/ffts/config.guess index 0967f2a..137bedf 100755 --- a/lib/ffts/config.guess +++ b/lib/ffts/config.guess @@ -1,12 +1,14 @@ #! /bin/sh # Attempt to guess a canonical system name. -# Copyright 1992-2016 Free Software Foundation, Inc. +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, +# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +# 2011, 2012 Free Software Foundation, Inc. -timestamp='2016-04-02' +timestamp='2012-08-14' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or +# the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but @@ -20,17 +22,19 @@ timestamp='2016-04-02' # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that -# program. This Exception is an additional permission under section 7 -# of the GNU General Public License, version 3 ("GPLv3"). +# the same distribution terms that you use for the rest of that program. + + +# Originally written by Per Bothner. Please send patches (context +# diff format) to and include a ChangeLog +# entry. # -# Originally written by Per Bothner; maintained since 2000 by Ben Elliston. +# This script attempts to guess a canonical system name similar to +# config.sub. If it succeeds, it prints the system name on stdout, and +# exits with 0. Otherwise, it exits with 1. # # You can get the latest version of this script from: -# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess -# -# Please send patches to . - +# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD me=`echo "$0" | sed -e 's,.*/,,'` @@ -50,7 +54,9 @@ version="\ GNU config.guess ($timestamp) Originally written by Per Bothner. -Copyright 1992-2016 Free Software Foundation, Inc. +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, +2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 +Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -132,27 +138,6 @@ UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown -case "${UNAME_SYSTEM}" in -Linux|GNU|GNU/*) - # If the system lacks a compiler, then just pick glibc. - # We could probably try harder. - LIBC=gnu - - eval $set_cc_for_build - cat <<-EOF > $dummy.c - #include - #if defined(__UCLIBC__) - LIBC=uclibc - #elif defined(__dietlibc__) - LIBC=dietlibc - #else - LIBC=gnu - #endif - EOF - eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'` - ;; -esac - # Note: order is significant - the case branches are not exclusive. case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in @@ -168,27 +153,20 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in # Note: NetBSD doesn't particularly care about the vendor # portion of the name. We always set it to "unknown". sysctl="sysctl -n hw.machine_arch" - UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \ - /sbin/$sysctl 2>/dev/null || \ - /usr/sbin/$sysctl 2>/dev/null || \ - echo unknown)` + UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ + /usr/sbin/$sysctl 2>/dev/null || echo unknown)` case "${UNAME_MACHINE_ARCH}" in armeb) machine=armeb-unknown ;; arm*) machine=arm-unknown ;; sh3el) machine=shl-unknown ;; sh3eb) machine=sh-unknown ;; sh5el) machine=sh5le-unknown ;; - earmv*) - arch=`echo ${UNAME_MACHINE_ARCH} | sed -e 's,^e\(armv[0-9]\).*$,\1,'` - endian=`echo ${UNAME_MACHINE_ARCH} | sed -ne 's,^.*\(eb\)$,\1,p'` - machine=${arch}${endian}-unknown - ;; *) machine=${UNAME_MACHINE_ARCH}-unknown ;; esac # The Operating System including object format, if it has switched # to ELF recently, or will in the future. case "${UNAME_MACHINE_ARCH}" in - arm*|earm*|i386|m68k|ns32k|sh3*|sparc|vax) + arm*|i386|m68k|ns32k|sh3*|sparc|vax) eval $set_cc_for_build if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ELF__ @@ -204,13 +182,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in os=netbsd ;; esac - # Determine ABI tags. - case "${UNAME_MACHINE_ARCH}" in - earm*) - expr='s/^earmv[0-9]/-eabi/;s/eb$//' - abi=`echo ${UNAME_MACHINE_ARCH} | sed -e "$expr"` - ;; - esac # The OS release # Debian GNU/NetBSD machines have a different userland, and # thus, need a distinct triplet. However, they do not need @@ -221,13 +192,13 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in release='-gnu' ;; *) - release=`echo ${UNAME_RELEASE} | sed -e 's/[-_].*//' | cut -d. -f1,2` + release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` ;; esac # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: # contains redundant information, the shorter form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. - echo "${machine}-${os}${release}${abi}" + echo "${machine}-${os}${release}" exit ;; *:Bitrig:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` @@ -237,10 +208,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} exit ;; - *:LibertyBSD:*:*) - UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'` - echo ${UNAME_MACHINE_ARCH}-unknown-libertybsd${UNAME_RELEASE} - exit ;; *:ekkoBSD:*:*) echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} exit ;; @@ -253,9 +220,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in *:MirBSD:*:*) echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} exit ;; - *:Sortix:*:*) - echo ${UNAME_MACHINE}-unknown-sortix - exit ;; alpha:OSF1:*:*) case $UNAME_RELEASE in *4.0) @@ -272,42 +236,42 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` case "$ALPHA_CPU_TYPE" in "EV4 (21064)") - UNAME_MACHINE=alpha ;; + UNAME_MACHINE="alpha" ;; "EV4.5 (21064)") - UNAME_MACHINE=alpha ;; + UNAME_MACHINE="alpha" ;; "LCA4 (21066/21068)") - UNAME_MACHINE=alpha ;; + UNAME_MACHINE="alpha" ;; "EV5 (21164)") - UNAME_MACHINE=alphaev5 ;; + UNAME_MACHINE="alphaev5" ;; "EV5.6 (21164A)") - UNAME_MACHINE=alphaev56 ;; + UNAME_MACHINE="alphaev56" ;; "EV5.6 (21164PC)") - UNAME_MACHINE=alphapca56 ;; + UNAME_MACHINE="alphapca56" ;; "EV5.7 (21164PC)") - UNAME_MACHINE=alphapca57 ;; + UNAME_MACHINE="alphapca57" ;; "EV6 (21264)") - UNAME_MACHINE=alphaev6 ;; + UNAME_MACHINE="alphaev6" ;; "EV6.7 (21264A)") - UNAME_MACHINE=alphaev67 ;; + UNAME_MACHINE="alphaev67" ;; "EV6.8CB (21264C)") - UNAME_MACHINE=alphaev68 ;; + UNAME_MACHINE="alphaev68" ;; "EV6.8AL (21264B)") - UNAME_MACHINE=alphaev68 ;; + UNAME_MACHINE="alphaev68" ;; "EV6.8CX (21264D)") - UNAME_MACHINE=alphaev68 ;; + UNAME_MACHINE="alphaev68" ;; "EV6.9A (21264/EV69A)") - UNAME_MACHINE=alphaev69 ;; + UNAME_MACHINE="alphaev69" ;; "EV7 (21364)") - UNAME_MACHINE=alphaev7 ;; + UNAME_MACHINE="alphaev7" ;; "EV7.9 (21364A)") - UNAME_MACHINE=alphaev79 ;; + UNAME_MACHINE="alphaev79" ;; esac # A Pn.n version is a patched version. # A Vn.n version is a released version. # A Tn.n version is a released field test version. # A Xn.n version is an unreleased experimental baselevel. # 1.2 uses "1.2" for uname -r. - echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` + echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` # Reset EXIT trap before exiting to avoid spurious non-zero exit code. exitcode=$? trap '' 0 @@ -342,7 +306,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) echo arm-acorn-riscix${UNAME_RELEASE} exit ;; - arm*:riscos:*:*|arm*:RISCOS:*:*) + arm:riscos:*:*|arm:RISCOS:*:*) echo arm-unknown-riscos exit ;; SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) @@ -380,16 +344,16 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in exit ;; i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) eval $set_cc_for_build - SUN_ARCH=i386 + SUN_ARCH="i386" # If there is a compiler, see if it is configured for 64-bit objects. # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. # This test works for both compilers. - if [ "$CC_FOR_BUILD" != no_compiler_found ]; then + if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_64BIT_ARCH >/dev/null then - SUN_ARCH=x86_64 + SUN_ARCH="x86_64" fi fi echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` @@ -414,7 +378,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in exit ;; sun*:*:4.2BSD:*) UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` - test "x${UNAME_RELEASE}" = x && UNAME_RELEASE=3 + test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 case "`/bin/arch`" in sun3) echo m68k-sun-sunos${UNAME_RELEASE} @@ -600,9 +564,8 @@ EOF else IBM_ARCH=powerpc fi - if [ -x /usr/bin/lslpp ] ; then - IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | - awk -F: '{ print $3 }' | sed s/[0-9]*$/0/` + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` else IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} fi @@ -639,13 +602,13 @@ EOF sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` case "${sc_cpu_version}" in - 523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0 - 528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1 + 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 + 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 532) # CPU_PA_RISC2_0 case "${sc_kernel_bits}" in - 32) HP_ARCH=hppa2.0n ;; - 64) HP_ARCH=hppa2.0w ;; - '') HP_ARCH=hppa2.0 ;; # HP-UX 10.20 + 32) HP_ARCH="hppa2.0n" ;; + 64) HP_ARCH="hppa2.0w" ;; + '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 esac ;; esac fi @@ -684,11 +647,11 @@ EOF exit (0); } EOF - (CCOPTS="" $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` + (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` test -z "$HP_ARCH" && HP_ARCH=hppa fi ;; esac - if [ ${HP_ARCH} = hppa2.0w ] + if [ ${HP_ARCH} = "hppa2.0w" ] then eval $set_cc_for_build @@ -701,12 +664,12 @@ EOF # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess # => hppa64-hp-hpux11.23 - if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | + if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | grep -q __LP64__ then - HP_ARCH=hppa2.0w + HP_ARCH="hppa2.0w" else - HP_ARCH=hppa64 + HP_ARCH="hppa64" fi fi echo ${HP_ARCH}-hp-hpux${HPUX_REV} @@ -811,14 +774,14 @@ EOF echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) - FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` - FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` + FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; 5000:UNIX_System_V:4.*:*) - FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'` + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) @@ -848,7 +811,7 @@ EOF *:MINGW*:*) echo ${UNAME_MACHINE}-pc-mingw32 exit ;; - *:MSYS*:*) + i*:MSYS*:*) echo ${UNAME_MACHINE}-pc-msys exit ;; i*:windows32*:*) @@ -896,21 +859,21 @@ EOF exit ;; *:GNU:*:*) # the GNU system - echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` exit ;; *:GNU/*:*:*) # other systems with GNU libc and userland - echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC} + echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu exit ;; i*86:Minix:*:*) echo ${UNAME_MACHINE}-pc-minix exit ;; aarch64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; aarch64_be:Linux:*:*) UNAME_MACHINE=aarch64_be - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; alpha:Linux:*:*) case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in @@ -923,60 +886,59 @@ EOF EV68*) UNAME_MACHINE=alphaev68 ;; esac objdump --private-headers /bin/sh | grep -q ld.so.1 - if test "$?" = 0 ; then LIBC=gnulibc1 ; fi - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} - exit ;; - arc:Linux:*:* | arceb:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi + echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} exit ;; arm*:Linux:*:*) eval $set_cc_for_build if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_EABI__ then - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu else if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_PCS_VFP then - echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi + echo ${UNAME_MACHINE}-unknown-linux-gnueabi else - echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf + echo ${UNAME_MACHINE}-unknown-linux-gnueabihf fi fi exit ;; avr32*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; cris:Linux:*:*) - echo ${UNAME_MACHINE}-axis-linux-${LIBC} + echo ${UNAME_MACHINE}-axis-linux-gnu exit ;; crisv32:Linux:*:*) - echo ${UNAME_MACHINE}-axis-linux-${LIBC} - exit ;; - e2k:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-axis-linux-gnu exit ;; frv:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; hexagon:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; i*86:Linux:*:*) - echo ${UNAME_MACHINE}-pc-linux-${LIBC} + LIBC=gnu + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #ifdef __dietlibc__ + LIBC=dietlibc + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` + echo "${UNAME_MACHINE}-pc-linux-${LIBC}" exit ;; ia64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} - exit ;; - k1om:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; m32r*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; m68*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; mips:Linux:*:* | mips64:Linux:*:*) eval $set_cc_for_build @@ -995,63 +957,54 @@ EOF #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; } + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } ;; - openrisc*:Linux:*:*) - echo or1k-unknown-linux-${LIBC} - exit ;; - or32:Linux:*:* | or1k*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + or32:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; padre:Linux:*:*) - echo sparc-unknown-linux-${LIBC} + echo sparc-unknown-linux-gnu exit ;; parisc64:Linux:*:* | hppa64:Linux:*:*) - echo hppa64-unknown-linux-${LIBC} + echo hppa64-unknown-linux-gnu exit ;; parisc:Linux:*:* | hppa:Linux:*:*) # Look for CPU level case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in - PA7*) echo hppa1.1-unknown-linux-${LIBC} ;; - PA8*) echo hppa2.0-unknown-linux-${LIBC} ;; - *) echo hppa-unknown-linux-${LIBC} ;; + PA7*) echo hppa1.1-unknown-linux-gnu ;; + PA8*) echo hppa2.0-unknown-linux-gnu ;; + *) echo hppa-unknown-linux-gnu ;; esac exit ;; ppc64:Linux:*:*) - echo powerpc64-unknown-linux-${LIBC} + echo powerpc64-unknown-linux-gnu exit ;; ppc:Linux:*:*) - echo powerpc-unknown-linux-${LIBC} - exit ;; - ppc64le:Linux:*:*) - echo powerpc64le-unknown-linux-${LIBC} - exit ;; - ppcle:Linux:*:*) - echo powerpcle-unknown-linux-${LIBC} + echo powerpc-unknown-linux-gnu exit ;; s390:Linux:*:* | s390x:Linux:*:*) - echo ${UNAME_MACHINE}-ibm-linux-${LIBC} + echo ${UNAME_MACHINE}-ibm-linux exit ;; sh64*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; sh*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; sparc:Linux:*:* | sparc64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; tile*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; vax:Linux:*:*) - echo ${UNAME_MACHINE}-dec-linux-${LIBC} + echo ${UNAME_MACHINE}-dec-linux-gnu exit ;; x86_64:Linux:*:*) - echo ${UNAME_MACHINE}-pc-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; xtensa*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; i*86:DYNIX/ptx:4*:*) # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. @@ -1127,7 +1080,7 @@ EOF # uname -m prints for DJGPP always 'pc', but it prints nothing about # the processor, so we play safe by assuming i586. # Note: whatever this is, it MUST be the same as what config.sub - # prints for the "djgpp" host, or else GDB configure will decide that + # prints for the "djgpp" host, or else GDB configury will decide that # this is a cross-build. echo i586-pc-msdosdjgpp exit ;; @@ -1276,9 +1229,6 @@ EOF SX-8R:SUPER-UX:*:*) echo sx8r-nec-superux${UNAME_RELEASE} exit ;; - SX-ACE:SUPER-UX:*:*) - echo sxace-nec-superux${UNAME_RELEASE} - exit ;; Power*:Rhapsody:*:*) echo powerpc-apple-rhapsody${UNAME_RELEASE} exit ;; @@ -1287,36 +1237,24 @@ EOF exit ;; *:Darwin:*:*) UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown - eval $set_cc_for_build - if test "$UNAME_PROCESSOR" = unknown ; then - UNAME_PROCESSOR=powerpc - fi - if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then - if [ "$CC_FOR_BUILD" != no_compiler_found ]; then - if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_64BIT_ARCH >/dev/null - then - case $UNAME_PROCESSOR in - i386) UNAME_PROCESSOR=x86_64 ;; - powerpc) UNAME_PROCESSOR=powerpc64 ;; - esac - fi - fi - elif test "$UNAME_PROCESSOR" = i386 ; then - # Avoid executing cc on OS X 10.9, as it ships with a stub - # that puts up a graphical alert prompting to install - # developer tools. Any system running Mac OS X 10.7 or - # later (Darwin 11 and later) is required to have a 64-bit - # processor. This is not true of the ARM version of Darwin - # that Apple uses in portable devices. - UNAME_PROCESSOR=x86_64 - fi + case $UNAME_PROCESSOR in + i386) + eval $set_cc_for_build + if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + UNAME_PROCESSOR="x86_64" + fi + fi ;; + unknown) UNAME_PROCESSOR=powerpc ;; + esac echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} exit ;; *:procnto*:*:* | *:QNX:[0123456789]*:*) UNAME_PROCESSOR=`uname -p` - if test "$UNAME_PROCESSOR" = x86; then + if test "$UNAME_PROCESSOR" = "x86"; then UNAME_PROCESSOR=i386 UNAME_MACHINE=pc fi @@ -1347,7 +1285,7 @@ EOF # "uname -m" is not consistent, so use $cputype instead. 386 # is converted to i386 for consistency with other x86 # operating systems. - if test "$cputype" = 386; then + if test "$cputype" = "386"; then UNAME_MACHINE=i386 else UNAME_MACHINE="$cputype" @@ -1389,7 +1327,7 @@ EOF echo i386-pc-xenix exit ;; i*86:skyos:*:*) - echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE} | sed -e 's/ .*$//'` + echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' exit ;; i*86:rdos:*:*) echo ${UNAME_MACHINE}-pc-rdos @@ -1400,11 +1338,156 @@ EOF x86_64:VMkernel:*:*) echo ${UNAME_MACHINE}-unknown-esx exit ;; - amd64:Isilon\ OneFS:*:*) - echo x86_64-unknown-onefs - exit ;; esac +eval $set_cc_for_build +cat >$dummy.c < +# include +#endif +main () +{ +#if defined (sony) +#if defined (MIPSEB) + /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, + I don't know.... */ + printf ("mips-sony-bsd\n"); exit (0); +#else +#include + printf ("m68k-sony-newsos%s\n", +#ifdef NEWSOS4 + "4" +#else + "" +#endif + ); exit (0); +#endif +#endif + +#if defined (__arm) && defined (__acorn) && defined (__unix) + printf ("arm-acorn-riscix\n"); exit (0); +#endif + +#if defined (hp300) && !defined (hpux) + printf ("m68k-hp-bsd\n"); exit (0); +#endif + +#if defined (NeXT) +#if !defined (__ARCHITECTURE__) +#define __ARCHITECTURE__ "m68k" +#endif + int version; + version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; + if (version < 4) + printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); + else + printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); + exit (0); +#endif + +#if defined (MULTIMAX) || defined (n16) +#if defined (UMAXV) + printf ("ns32k-encore-sysv\n"); exit (0); +#else +#if defined (CMU) + printf ("ns32k-encore-mach\n"); exit (0); +#else + printf ("ns32k-encore-bsd\n"); exit (0); +#endif +#endif +#endif + +#if defined (__386BSD__) + printf ("i386-pc-bsd\n"); exit (0); +#endif + +#if defined (sequent) +#if defined (i386) + printf ("i386-sequent-dynix\n"); exit (0); +#endif +#if defined (ns32000) + printf ("ns32k-sequent-dynix\n"); exit (0); +#endif +#endif + +#if defined (_SEQUENT_) + struct utsname un; + + uname(&un); + + if (strncmp(un.version, "V2", 2) == 0) { + printf ("i386-sequent-ptx2\n"); exit (0); + } + if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */ + printf ("i386-sequent-ptx1\n"); exit (0); + } + printf ("i386-sequent-ptx\n"); exit (0); + +#endif + +#if defined (vax) +# if !defined (ultrix) +# include +# if defined (BSD) +# if BSD == 43 + printf ("vax-dec-bsd4.3\n"); exit (0); +# else +# if BSD == 199006 + printf ("vax-dec-bsd4.3reno\n"); exit (0); +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# endif +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# else + printf ("vax-dec-ultrix\n"); exit (0); +# endif +#endif + +#if defined (alliant) && defined (i860) + printf ("i860-alliant-bsd\n"); exit (0); +#endif + + exit (1); +} +EOF + +$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + +# Apollos put the system type in the environment. + +test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } + +# Convex versions that predate uname can use getsysinfo(1) + +if [ -x /usr/convex/getsysinfo ] +then + case `getsysinfo -f cpu_type` in + c1*) + echo c1-convex-bsd + exit ;; + c2*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + c34*) + echo c34-convex-bsd + exit ;; + c38*) + echo c38-convex-bsd + exit ;; + c4*) + echo c4-convex-bsd + exit ;; + esac +fi + cat >&2 <. @@ -20,12 +26,11 @@ timestamp='2016-03-30' # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that -# program. This Exception is an additional permission under section 7 -# of the GNU General Public License, version 3 ("GPLv3"). +# the same distribution terms that you use for the rest of that program. -# Please send patches to . +# Please send patches to . Submit a context +# diff and a properly formatted GNU ChangeLog entry. # # Configuration subroutine to validate and canonicalize a configuration type. # Supply the specified configuration type as an argument. @@ -33,7 +38,7 @@ timestamp='2016-03-30' # Otherwise, we print the canonical config type on stdout and succeed. # You can get the latest version of this script from: -# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub +# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD # This file is supposed to be the same for all GNU packages # and recognize all the CPU types, system types and aliases @@ -53,7 +58,8 @@ timestamp='2016-03-30' me=`echo "$0" | sed -e 's,.*/,,'` usage="\ -Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS +Usage: $0 [OPTION] CPU-MFR-OPSYS + $0 [OPTION] ALIAS Canonicalize a configuration name. @@ -67,7 +73,9 @@ Report bugs and patches to ." version="\ GNU config.sub ($timestamp) -Copyright 1992-2016 Free Software Foundation, Inc. +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, +2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 +Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -116,7 +124,7 @@ maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` case $maybe_os in nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \ linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ - knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \ + knetbsd*-gnu* | netbsd*-gnu* | \ kopensolaris*-gnu* | \ storm-chaos* | os2-emx* | rtmk-nova*) os=-$maybe_os @@ -148,7 +156,7 @@ case $os in -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ - -apple | -axis | -knuth | -cray | -microblaze*) + -apple | -axis | -knuth | -cray | -microblaze) os= basic_machine=$1 ;; @@ -251,25 +259,21 @@ case $basic_machine in | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ | am33_2.0 \ - | arc | arceb \ - | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \ - | avr | avr32 \ - | ba \ - | be32 | be64 \ + | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \ + | be32 | be64 \ | bfin \ - | c4x | c8051 | clipper \ + | c4x | clipper \ | d10v | d30v | dlx | dsp16xx \ - | e2k | epiphany \ - | fido | fr30 | frv | ft32 \ + | epiphany \ + | fido | fr30 | frv \ | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ | hexagon \ | i370 | i860 | i960 | ia64 \ | ip2k | iq2000 \ - | k1om \ | le32 | le64 \ | lm32 \ | m32c | m32r | m32rle | m68000 | m68k | m88k \ - | maxq | mb | microblaze | microblazeel | mcore | mep | metag \ + | maxq | mb | microblaze | mcore | mep | metag \ | mips | mipsbe | mipseb | mipsel | mipsle \ | mips16 \ | mips64 | mips64el \ @@ -283,29 +287,26 @@ case $basic_machine in | mips64vr5900 | mips64vr5900el \ | mipsisa32 | mipsisa32el \ | mipsisa32r2 | mipsisa32r2el \ - | mipsisa32r6 | mipsisa32r6el \ | mipsisa64 | mipsisa64el \ | mipsisa64r2 | mipsisa64r2el \ - | mipsisa64r6 | mipsisa64r6el \ | mipsisa64sb1 | mipsisa64sb1el \ | mipsisa64sr71k | mipsisa64sr71kel \ - | mipsr5900 | mipsr5900el \ | mipstx39 | mipstx39el \ | mn10200 | mn10300 \ | moxie \ | mt \ | msp430 \ | nds32 | nds32le | nds32be \ - | nios | nios2 | nios2eb | nios2el \ + | nios | nios2 \ | ns16k | ns32k \ - | open8 | or1k | or1knd | or32 \ + | open8 \ + | or32 \ | pdp10 | pdp11 | pj | pjl \ | powerpc | powerpc64 | powerpc64le | powerpcle \ | pyramid \ - | riscv32 | riscv64 \ | rl78 | rx \ | score \ - | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ + | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ | sh64 | sh64le \ | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ @@ -313,7 +314,6 @@ case $basic_machine in | tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \ | ubicom32 \ | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \ - | visium \ | we32k \ | x86 | xc16x | xstormy16 | xtensa \ | z8k | z80) @@ -328,10 +328,7 @@ case $basic_machine in c6x) basic_machine=tic6x-unknown ;; - leon|leon[3-9]) - basic_machine=sparc-$basic_machine - ;; - m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip) + m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip) basic_machine=$basic_machine-unknown os=-none ;; @@ -373,29 +370,26 @@ case $basic_machine in | aarch64-* | aarch64_be-* \ | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ - | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \ + | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ | avr-* | avr32-* \ - | ba-* \ | be32-* | be64-* \ | bfin-* | bs2000-* \ | c[123]* | c30-* | [cjt]90-* | c4x-* \ - | c8051-* | clipper-* | craynv-* | cydra-* \ + | clipper-* | craynv-* | cydra-* \ | d10v-* | d30v-* | dlx-* \ - | e2k-* | elxsi-* \ + | elxsi-* \ | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ | h8300-* | h8500-* \ | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ | hexagon-* \ | i*86-* | i860-* | i960-* | ia64-* \ | ip2k-* | iq2000-* \ - | k1om-* \ | le32-* | le64-* \ | lm32-* \ | m32c-* | m32r-* | m32rle-* \ | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ - | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \ - | microblaze-* | microblazeel-* \ + | m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \ | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ | mips16-* \ | mips64-* | mips64el-* \ @@ -409,33 +403,28 @@ case $basic_machine in | mips64vr5900-* | mips64vr5900el-* \ | mipsisa32-* | mipsisa32el-* \ | mipsisa32r2-* | mipsisa32r2el-* \ - | mipsisa32r6-* | mipsisa32r6el-* \ | mipsisa64-* | mipsisa64el-* \ | mipsisa64r2-* | mipsisa64r2el-* \ - | mipsisa64r6-* | mipsisa64r6el-* \ | mipsisa64sb1-* | mipsisa64sb1el-* \ | mipsisa64sr71k-* | mipsisa64sr71kel-* \ - | mipsr5900-* | mipsr5900el-* \ | mipstx39-* | mipstx39el-* \ | mmix-* \ | mt-* \ | msp430-* \ | nds32-* | nds32le-* | nds32be-* \ - | nios-* | nios2-* | nios2eb-* | nios2el-* \ + | nios-* | nios2-* \ | none-* | np1-* | ns16k-* | ns32k-* \ | open8-* \ - | or1k*-* \ | orion-* \ | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \ | pyramid-* \ - | riscv32-* | riscv64-* \ | rl78-* | romp-* | rs6000-* | rx-* \ | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ | sparclite-* \ - | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \ + | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \ | tahoe-* \ | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ | tile*-* \ @@ -443,7 +432,6 @@ case $basic_machine in | ubicom32-* \ | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \ | vax-* \ - | visium-* \ | we32k-* \ | x86-* | x86_64-* | xc16x-* | xps100-* \ | xstormy16-* | xtensa*-* \ @@ -520,9 +508,6 @@ case $basic_machine in basic_machine=i386-pc os=-aros ;; - asmjs) - basic_machine=asmjs-unknown - ;; aux) basic_machine=m68k-apple os=-aux @@ -784,9 +769,6 @@ case $basic_machine in basic_machine=m68k-isi os=-sysv ;; - leon-*|leon[3-9]-*) - basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'` - ;; m68knommu) basic_machine=m68k-unknown os=-linux @@ -806,7 +788,7 @@ case $basic_machine in basic_machine=ns32k-utek os=-sysv ;; - microblaze*) + microblaze) basic_machine=microblaze-xilinx ;; mingw64) @@ -814,7 +796,7 @@ case $basic_machine in os=-mingw64 ;; mingw32) - basic_machine=i686-pc + basic_machine=i386-pc os=-mingw32 ;; mingw32ce) @@ -842,10 +824,6 @@ case $basic_machine in basic_machine=powerpc-unknown os=-morphos ;; - moxiebox) - basic_machine=moxie-unknown - os=-moxiebox - ;; msdos) basic_machine=i386-pc os=-msdos @@ -854,7 +832,7 @@ case $basic_machine in basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` ;; msys) - basic_machine=i686-pc + basic_machine=i386-pc os=-msys ;; mvs) @@ -1045,11 +1023,7 @@ case $basic_machine in basic_machine=i586-unknown os=-pw32 ;; - rdos | rdos64) - basic_machine=x86_64-pc - os=-rdos - ;; - rdos32) + rdos) basic_machine=i386-pc os=-rdos ;; @@ -1376,13 +1350,13 @@ case $os in -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ - | -sym* | -kopensolaris* | -plan9* \ + | -sym* | -kopensolaris* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ - | -aos* | -aros* | -cloudabi* | -sortix* \ + | -aos* | -aros* \ | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ - | -bitrig* | -openbsd* | -solidbsd* | -libertybsd* \ + | -bitrig* | -openbsd* | -solidbsd* \ | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ @@ -1391,15 +1365,14 @@ case $os in | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ | -linux-newlib* | -linux-musl* | -linux-uclibc* \ - | -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \ + | -uxpv* | -beos* | -mpeix* | -udk* \ | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ - | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \ - | -onefs* | -tirtos*) + | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*) # Remember, each alternative MUST END IN *, to match a version number. ;; -qnx*) @@ -1523,6 +1496,9 @@ case $os in -aros*) os=-aros ;; + -kaos*) + os=-kaos + ;; -zvmoe) os=-zvmoe ;; @@ -1531,8 +1507,6 @@ case $os in ;; -nacl*) ;; - -ios) - ;; -none) ;; *) @@ -1573,9 +1547,6 @@ case $basic_machine in c4x-* | tic4x-*) os=-coff ;; - c8051-*) - os=-elf - ;; hexagon-*) os=-elf ;; diff --git a/lib/ffts/ffts.pc.cmake.in b/lib/ffts/ffts.pc.cmake.in index 43f38e9..63d4cc0 100644 --- a/lib/ffts/ffts.pc.cmake.in +++ b/lib/ffts/ffts.pc.cmake.in @@ -1,7 +1,7 @@ prefix=@CMAKE_INSTALL_PREFIX@ -exec_prefix=${exec_prefix} -libdir=${libdir} -includedir=${includedir} +exec_prefix=${prefix} +libdir=${exec_prefix}/lib +includedir=${prefix}/include Name: @CMAKE_PROJECT_NAME@ Description: fast Fourier transform library diff --git a/lib/ffts/include/ffts.h b/lib/ffts/include/ffts.h index cc85a88..b13316f 100644 --- a/lib/ffts/include/ffts.h +++ b/lib/ffts/include/ffts.h @@ -3,6 +3,7 @@ This file is part of FFTS. Copyright (c) 2012, Anthony M. Blake + Copyright (c) 2018, Jukka Ojanen All rights reserved. Redistribution and use in source and binary forms, with or without @@ -75,6 +76,9 @@ typedef struct _ffts_plan_t ffts_plan_t; FFTS_API ffts_plan_t* ffts_init_1d(size_t N, int sign); +FFTS_API ffts_plan_t* +ffts_init_1d_64f(size_t N, int sign); + FFTS_API ffts_plan_t* ffts_init_2d(size_t N1, size_t N2, int sign); diff --git a/lib/ffts/src/Makefile.am b/lib/ffts/src/Makefile.am index 28c7879..ff6b0cc 100644 --- a/lib/ffts/src/Makefile.am +++ b/lib/ffts/src/Makefile.am @@ -2,7 +2,7 @@ lib_LTLIBRARIES = libffts.la -libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c ffts_transpose.c ffts_trig.c ffts_static.c +libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c ffts_transpose.c ffts_trig.c ffts_static.c ffts_chirp_z.c libffts_la_SOURCES += codegen.h codegen_arm.h codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h macros-neon.h macros-sse.h macros.h neon.h neon_float.h patterns.h types.h vfp.h if DYNAMIC_DISABLED @@ -14,7 +14,7 @@ endif libffts_includedir=$(includedir)/ffts libffts_include_HEADERS = ../include/ffts.h -AM_CFLAGS = -I$(top_srcdir)/include +AM_CFLAGS = -I$(top_srcdir)/include -DAUTOTOOLS_BUILD=yes if HAVE_VFP libffts_la_SOURCES += vfp.s diff --git a/lib/ffts/src/codegen.c b/lib/ffts/src/codegen.c index c4e19e6..0bce616 100644 --- a/lib/ffts/src/codegen.c +++ b/lib/ffts/src/codegen.c @@ -139,9 +139,9 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N #ifdef HAVE_SSE if (sign < 0) { - p->constants = sse_constants; + p->constants = (const void*) sse_constants; } else { - p->constants = sse_constants_inv; + p->constants = (const void*) sse_constants_inv; } #endif diff --git a/lib/ffts/src/codegen_sse.h b/lib/ffts/src/codegen_sse.h index e9819f1..2ca540e 100644 --- a/lib/ffts/src/codegen_sse.h +++ b/lib/ffts/src/codegen_sse.h @@ -488,7 +488,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[0], X64_RAX, 2); x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[2], X64_RAX, 2); - x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0); extend--; x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[3], X64_RAX, 2); @@ -507,14 +507,14 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10); x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[6], X64_RAX, 2); - x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0); extend--; x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[7], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM8); x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1); - x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0); extend--; x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2); @@ -530,7 +530,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9); x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12); - x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0 ? 8 : 0); extend--; x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10); @@ -538,10 +538,10 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15); x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM8); - x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0); + x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0); extend--; - x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0); + x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0 ? 8 : 0); extend--; x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1); @@ -551,7 +551,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1); - x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0 ? 8 : 0); extend--; x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0); @@ -580,7 +580,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8); x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE); - x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0 ? 8 : 0); extend--; x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12); @@ -588,7 +588,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7); x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE); - x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0 ? 8 : 0); extend--; x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13); @@ -620,7 +620,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[0], X64_RAX, 2); x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RSI, offsets[2], X64_RAX, 2); - x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0); extend--; x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[3], X64_RAX, 2); @@ -640,14 +640,14 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10); x64_sse_movaps_reg_memindex(ins, X64_XMM3, X64_RSI, offsets[6], X64_RAX, 2); - x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0); extend--; x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RSI, offsets[7], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM3); x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1); - x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0); extend--; x64_movsxd_reg_memindex(ins, X64_R11, X64_R8, 0, X64_RAX, 2); @@ -663,7 +663,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9); x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12); - x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0 ? 8 : 0); extend--; x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10); @@ -671,10 +671,10 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15); x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM3); - x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0); + x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0); extend--; - x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0); + x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0 ? 8 : 0); extend--; x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1); @@ -684,7 +684,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM3, 0xB1); - x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0 ? 8 : 0); extend--; x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0); @@ -713,7 +713,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8); x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE); - x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0 ? 8 : 0); extend--; x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12); @@ -721,7 +721,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7); x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE); - x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0 ? 8 : 0); extend--; x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13); @@ -1157,28 +1157,28 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[0], X64_RAX, 2); - x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0 ? 8 : 0); extend--; x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2); x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[2], X64_RAX, 2); - x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0); + x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0); extend--; - x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0); + x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0); extend--; x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10); x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RDX, offsets[4], X64_RAX, 2); - x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0); extend--; x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RDX, offsets[5], X64_RAX, 2); - x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0); extend--; x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[6], X64_RAX, 2); @@ -1206,7 +1206,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2); x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1); - x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0 ? 8 : 0); extend--; x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2); @@ -1218,7 +1218,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1); x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2); - x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0); + x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0 ? 8 : 0); extend--; x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14); @@ -1257,28 +1257,28 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RSI, offsets[0], X64_RAX, 2); - x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0 ? 8 : 0); extend--; x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[1], X64_RAX, 2); x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[2], X64_RAX, 2); - x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0); + x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0); extend--; - x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0); + x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0); extend--; x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RSI, offsets[3], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10); x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RSI, offsets[4], X64_RAX, 2); - x64_sse_movaps_reg_reg_size(ins, X64_XMM3, X64_XMM6, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM3, X64_XMM6, extend > 0 ? 8 : 0); extend--; x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RSI, offsets[5], X64_RAX, 2); - x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0); extend--; x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RSI, offsets[6], X64_RAX, 2); @@ -1306,7 +1306,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2); x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1); - x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0); + x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0 ? 8 : 0); extend--; x64_movsxd_reg_memindex(ins, X64_R12, X64_R8, 8, X64_RAX, 2); @@ -1318,7 +1318,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1); x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2); - x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0); + x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0 ? 8 : 0); extend--; x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14); diff --git a/lib/ffts/src/ffts.c b/lib/ffts/src/ffts.c index 7fa675a..35c5cad 100644 --- a/lib/ffts/src/ffts.c +++ b/lib/ffts/src/ffts.c @@ -34,6 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ffts.h" #include "ffts_internal.h" +#include "ffts_chirp_z.h" #include "ffts_static.h" #include "ffts_trig.h" #include "macros.h" @@ -76,7 +77,8 @@ static const FFTS_ALIGN(64) float w_data[16] = { }; #endif -static FFTS_INLINE int ffts_allow_execute(void *start, size_t len) +static FFTS_INLINE int +ffts_allow_execute(void *start, size_t len) { int result; @@ -90,7 +92,8 @@ static FFTS_INLINE int ffts_allow_execute(void *start, size_t len) return result; } -static FFTS_INLINE int ffts_deny_execute(void *start, size_t len) +static FFTS_INLINE int +ffts_deny_execute(void *start, size_t len) { int result; @@ -104,7 +107,8 @@ static FFTS_INLINE int ffts_deny_execute(void *start, size_t len) return result; } -static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length) +static FFTS_INLINE int +ffts_flush_instruction_cache(void *start, size_t length) { #ifdef _WIN32 return !FlushInstructionCache(GetCurrentProcess(), start, length); @@ -124,7 +128,8 @@ static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length) #endif } -static FFTS_INLINE void *ffts_vmem_alloc(size_t length) +static FFTS_INLINE void* +ffts_vmem_alloc(size_t length) { #if __APPLE__ return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0); @@ -139,7 +144,8 @@ static FFTS_INLINE void *ffts_vmem_alloc(size_t length) #endif } -static FFTS_INLINE void ffts_vmem_free(void *addr, size_t length) +static FFTS_INLINE void +ffts_vmem_free(void *addr, size_t length) { #ifdef _WIN32 (void) length; @@ -174,7 +180,8 @@ ffts_free(ffts_plan_t *p) } } -void ffts_free_1d(ffts_plan_t *p) +static void +ffts_free_1d(ffts_plan_t *p) { #if !defined(DYNAMIC_DISABLED) if (p->transform_base) { @@ -188,7 +195,7 @@ void ffts_free_1d(ffts_plan_t *p) } if (p->ws) { - FFTS_FREE(p->ws); + ffts_aligned_free(p->ws); } if (p->is) { @@ -233,7 +240,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f); #endif - p->ws = FFTS_MALLOC(lut_size, 32); + p->ws = ffts_aligned_malloc(lut_size); if (!p->ws) { goto cleanup; } @@ -253,7 +260,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) /* calculate factors */ m = leaf_N << (n_luts - 2); - tmp = FFTS_MALLOC(m * sizeof(ffts_cpx_32f), 32); + tmp = ffts_aligned_malloc(m * sizeof(ffts_cpx_32f)); ffts_generate_cosine_sine_pow2_32f(tmp, m); @@ -263,7 +270,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) p->ws_is[i] = w - (ffts_cpx_32f*) p->ws; if (!i) { - ffts_cpx_32f *w0 = FFTS_MALLOC(n/4 * sizeof(ffts_cpx_32f), 32); + ffts_cpx_32f *w0 = ffts_aligned_malloc(n/4 * sizeof(ffts_cpx_32f)); float *fw0 = (float*) w0; float *fw = (float*) w; @@ -300,11 +307,11 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) w += n/4 * 2; #endif - FFTS_FREE(w0); + ffts_aligned_free(w0); } else { - ffts_cpx_32f *w0 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32); - ffts_cpx_32f *w1 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32); - ffts_cpx_32f *w2 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32); + ffts_cpx_32f *w0 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f)); + ffts_cpx_32f *w1 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f)); + ffts_cpx_32f *w2 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f)); float *fw0 = (float*) w0; float *fw1 = (float*) w1; @@ -380,9 +387,9 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) w += n/8 * 3 * 2; #endif - FFTS_FREE(w0); - FFTS_FREE(w1); - FFTS_FREE(w2); + ffts_aligned_free(w0); + ffts_aligned_free(w1); + ffts_aligned_free(w2); } n *= 2; @@ -401,7 +408,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) } #endif - FFTS_FREE(tmp); + ffts_aligned_free(tmp); p->lastlut = w; p->n_luts = n_luts; @@ -411,18 +418,166 @@ cleanup: return -1; } +#ifdef FFTS_DOUBLE +static int +ffts_generate_luts_64f(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) +{ + V4DF MULI_SIGN; + size_t n_luts; + ffts_cpx_64f *w; + ffts_cpx_64f *tmp; + size_t i, j, m, n; + int stride; + + if (sign < 0) { + MULI_SIGN = V4DF_LIT4(-0.0, 0.0, -0.0, 0.0); + } else { + MULI_SIGN = V4DF_LIT4(0.0, -0.0, 0.0, -0.0); + } + + /* LUTS */ + n_luts = ffts_ctzl(N / leaf_N); + if (n_luts >= 32) { + n_luts = 0; + } + + if (n_luts) { + size_t lut_size; + + lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_64f); + + p->ws = ffts_aligned_malloc(lut_size); + if (!p->ws) { + goto cleanup; + } + + p->ws_is = (size_t*) malloc(n_luts * sizeof(*p->ws_is)); + if (!p->ws_is) { + goto cleanup; + } + } + + w = p->ws; + n = leaf_N * 2; + + /* calculate factors */ + m = leaf_N << (n_luts - 2); + tmp = ffts_aligned_malloc(m * sizeof(ffts_cpx_64f)); + + ffts_generate_cosine_sine_pow2_64f(tmp, m); + + /* generate lookup tables */ + stride = 1 << (n_luts - 1); + for (i = 0; i < n_luts; i++) { + p->ws_is[i] = w - (ffts_cpx_64f*) p->ws; + + if (!i) { + ffts_cpx_64f *w0 = ffts_aligned_malloc(n/4 * sizeof(ffts_cpx_64f)); + double *fw0 = (double*) w0; + double *fw = (double*) w; + + for (j = 0; j < n/4; j++) { + w0[j][0] = tmp[j * stride][0]; + w0[j][1] = tmp[j * stride][1]; + } + + for (j = 0; j < n/4; j += 2) { + V4DF re, im, temp0; + temp0 = V4DF_LD(fw0 + j*2); + re = V4DF_DUPLICATE_RE(temp0); + im = V4DF_DUPLICATE_IM(temp0); + im = V4DF_XOR(im, MULI_SIGN); + V4DF_ST(fw + j*4 + 0, re); + V4DF_ST(fw + j*4 + 4, im); + } + + w += n/4 * 2; + ffts_aligned_free(w0); + } else { + ffts_cpx_64f *w0 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f)); + ffts_cpx_64f *w1 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f)); + ffts_cpx_64f *w2 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f)); + + double *fw0 = (double*) w0; + double *fw1 = (double*) w1; + double *fw2 = (double*) w2; + + double *fw = (double*)w; + + for (j = 0; j < n/8; j++) { + w0[j][0] = tmp[2 * j * stride][0]; + w0[j][1] = tmp[2 * j * stride][1]; + + w1[j][0] = tmp[j * stride][0]; + w1[j][1] = tmp[j * stride][1]; + + w2[j][0] = tmp[(j + (n/8)) * stride][0]; + w2[j][1] = tmp[(j + (n/8)) * stride][1]; + } + + for (j = 0; j < n/8; j += 2) { + V4DF temp0, temp1, temp2, re, im; + + temp0 = V4DF_LD(fw0 + j*2); + re = V4DF_DUPLICATE_RE(temp0); + im = V4DF_DUPLICATE_IM(temp0); + im = V4DF_XOR(im, MULI_SIGN); + V4DF_ST(fw + j*2*6+0, re); + V4DF_ST(fw + j*2*6+4, im); + + temp1 = V4DF_LD(fw1 + j*2); + re = V4DF_DUPLICATE_RE(temp1); + im = V4DF_DUPLICATE_IM(temp1); + im = V4DF_XOR(im, MULI_SIGN); + V4DF_ST(fw + j*2*6+8 , re); + V4DF_ST(fw + j*2*6+12, im); + + temp2 = V4DF_LD(fw2 + j*2); + re = V4DF_DUPLICATE_RE(temp2); + im = V4DF_DUPLICATE_IM(temp2); + im = V4DF_XOR(im, MULI_SIGN); + V4DF_ST(fw + j*2*6+16, re); + V4DF_ST(fw + j*2*6+20, im); + } + + w += n/8 * 3 * 2; + ffts_aligned_free(w0); + ffts_aligned_free(w1); + ffts_aligned_free(w2); + } + + n *= 2; + stride >>= 1; + } + + ffts_aligned_free(tmp); + + p->lastlut = w; + p->n_luts = n_luts; + return 0; + +cleanup: + return -1; +} +#endif + FFTS_API ffts_plan_t* ffts_init_1d(size_t N, int sign) { const size_t leaf_N = 8; ffts_plan_t *p; - if (N < 2 || (N & (N - 1)) != 0) { - LOG("FFT size must be a power of two\n"); + if (N < 2) { + LOG("FFT size must be greater than 1"); return NULL; } - p = calloc(1, sizeof(*p)); + /* check if size is not a power of two */ + if (N & (N - 1)) { + return ffts_chirp_z_init(N, sign); + } + + p = (ffts_plan_t*) calloc(1, sizeof(*p)); if (!p) { return NULL; } @@ -537,3 +692,98 @@ cleanup: ffts_free_1d(p); return NULL; } + +#ifdef FFTS_DOUBLE +FFTS_API ffts_plan_t* +ffts_init_1d_64f(size_t N, int sign) +{ + const size_t leaf_N = 8; + ffts_plan_t *p; + + if (N < 2) { + LOG("FFT size must be greater than 1"); + return NULL; + } + + p = (ffts_plan_t*) calloc(1, sizeof(*p)); + if (!p) { + return NULL; + } + + p->destroy = ffts_free_1d; + p->N = N; + + if (N >= 32) { + /* generate lookup tables */ + if (ffts_generate_luts_64f(p, N, leaf_N, sign)) { + goto cleanup; + } + + p->offsets = ffts_init_offsets(N, leaf_N); + if (!p->offsets) { + goto cleanup; + } + + p->is = ffts_init_is(N, leaf_N, 1); + if (!p->is) { + goto cleanup; + } + + p->i0 = N/leaf_N/3 + 1; + p->i1 = p->i2 = N/leaf_N/3; + if ((N/leaf_N) % 3 > 1) { + p->i1++; + } + + p->i0 /= 2; + p->i1 /= 2; + + if (sign < 0) { + p->transform = ffts_static_transform_f_64f; + } else { + p->transform = ffts_static_transform_i_64f; + } + } else { + switch (N) { + case 2: + p->transform = &ffts_small_2_64f; + break; + case 4: + if (sign == -1) { + p->transform = &ffts_small_forward4_64f; + } else if (sign == 1) { + p->transform = &ffts_small_backward4_64f; + } + break; + case 8: + if (sign == -1) { + p->transform = &ffts_small_forward8_64f; + } else if (sign == 1) { + p->transform = &ffts_small_backward8_64f; + } + break; + case 16: + default: + if (sign == -1) { + p->transform = &ffts_small_forward16_64f; + } else { + p->transform = &ffts_small_backward16_64f; + } + break; + } + } + + return p; + +cleanup: + ffts_free_1d(p); + return NULL; +} +#else +FFTS_API ffts_plan_t* +ffts_init_1d_64f(size_t N, int sign) +{ + /* disabled */ + return NULL; +} +#endif \ No newline at end of file diff --git a/lib/ffts/src/ffts_chirp_z.c b/lib/ffts/src/ffts_chirp_z.c new file mode 100644 index 0000000..e463a55 --- /dev/null +++ b/lib/ffts/src/ffts_chirp_z.c @@ -0,0 +1,225 @@ +/* + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2016, Jukka Ojanen + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "ffts_chirp_z.h" + +#include "ffts_internal.h" +#include "ffts_trig.h" + +/* +* For more information on algorithms: +* +* L. I. Bluestein, A linear filtering approach to the computation of +* the discrete Fourier transform, 1968 NEREM Rec., pp. 218-219 +* +* Lawrence R. Rabiner, Ronald W. Schafer, Charles M. Rader, +* The Chirp z-Transform Algorithm and Its Application +* Bell Sys. Tech. J., vol. 48, pp. 1249-1292, May 1969. +* +* Rick Lyons, Four Ways to Compute an Inverse FFT Using the Forward FFT Algorithm +* https://www.dsprelated.com/showarticle/800.php, July 7, 2015 +*/ + +/* forward declarations */ +static void +ffts_chirp_z_transform_f_32f(struct _ffts_plan_t *p, const void *in, void *out); + +static void +ffts_chirp_z_transform_i_32f(struct _ffts_plan_t *p, const void *in, void *out); + +static void +ffts_chirp_z_free(ffts_plan_t *p) +{ + if (p->B) + ffts_aligned_free(p->B); + + if (p->A) + ffts_aligned_free(p->A); + + if (p->buf) + ffts_aligned_free(p->buf); + + if (p->plans[0]) + ffts_free(p->plans[0]); + + free(p); +} + +ffts_plan_t* +ffts_chirp_z_init(size_t N, int sign) +{ + float *A, *B, reciprocal_M, *tmp; + ffts_plan_t *p; + size_t i, M; + + FFTS_ASSUME(N > 2); + + p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans)); + if (!p) + return NULL; + + p->destroy = ffts_chirp_z_free; + p->N = N; + p->rank = 1; + p->plans = (ffts_plan_t**) &p[1]; + + if (sign < 0) + p->transform = ffts_chirp_z_transform_f_32f; + else + p->transform = ffts_chirp_z_transform_i_32f; + + /* determinate next power of two such that M >= 2*N-1 */ + M = ffts_next_power_of_2(2*N-1); + p->plans[0] = ffts_init_1d(M, FFTS_FORWARD); + if (!p->plans[0]) + goto cleanup; + + p->A = A = (float*) ffts_aligned_malloc(2 * N * sizeof(float)); + if (!p->A) + goto cleanup; + + p->B = B = (float*) ffts_aligned_malloc(2 * M * sizeof(float)); + if (!p->B) + goto cleanup; + + p->buf = tmp = (float*) ffts_aligned_malloc(2 * 2 * M * sizeof(float)); + + ffts_generate_chirp_32f((ffts_cpx_32f*) A, N); + + /* scale with reciprocal of length */ + reciprocal_M = 1.0f / M; + tmp[0] = A[0] * reciprocal_M; + tmp[1] = A[1] * reciprocal_M; + for (i = 1; i < N; ++i) { + tmp[2 * i + 0] = tmp[2 * (M - i) + 0] = A[2 * i + 0] * reciprocal_M; + tmp[2 * i + 1] = tmp[2 * (M - i) + 1] = A[2 * i + 1] * reciprocal_M; + } + + /* zero pad */ + for (; i <= M - N; ++i) + tmp[2 * i] = tmp[2 * i + 1] = 0.0f; + + /* FFT */ + p->plans[0]->transform(p->plans[0], tmp, B); + return p; + +cleanup: + ffts_chirp_z_free(p); + return NULL; +} + +static void +ffts_chirp_z_transform_f_32f(struct _ffts_plan_t *p, const void *in, void *out) +{ + const float *A = FFTS_ASSUME_ALIGNED_32(p->A); + const float *B = FFTS_ASSUME_ALIGNED_32(p->B); + size_t i, M = p->plans[0]->N, N = p->N; + float *t1 = (float*) FFTS_ASSUME_ALIGNED_32(p->buf); + float *t2 = FFTS_ASSUME_ALIGNED_32(&t1[2 * M]); + const float *din = (const float*) in; + float *dout = (float*) out; + + /* we know this */ + FFTS_ASSUME(M >= 8); + + /* multiply input with conjugated sequence */ + for (i = 0; i < N; ++i) { + t1[2 * i + 0] = din[2 * i + 0] * A[2 * i + 0] + din[2 * i + 1] * A[2 * i + 1]; + t1[2 * i + 1] = din[2 * i + 1] * A[2 * i + 0] - din[2 * i + 0] * A[2 * i + 1]; + } + + /* zero pad */ + for (; i < M; ++i) + t1[2 * i] = t1[2 * i + 1] = 0.0f; + + /* convolution using FFT */ + p->plans[0]->transform(p->plans[0], t1, t2); + + /* complex multiply */ + for (i = 0; i < M; ++i) { + t1[2 * i + 0] = t2[2 * i + 1] * B[2 * i + 0] + t2[2 * i + 0] * B[2 * i + 1]; + t1[2 * i + 1] = t2[2 * i + 0] * B[2 * i + 0] - t2[2 * i + 1] * B[2 * i + 1]; + } + + /* IFFT using FFT with real and imaginary parts swapped */ + p->plans[0]->transform(p->plans[0], t1, t2); + + /* multiply output with conjugated sequence */ + for (i = 0; i < N; ++i) { + dout[2 * i + 0] = t2[2 * i + 1] * A[2 * i + 0] + t2[2 * i + 0] * A[2 * i + 1]; + dout[2 * i + 1] = t2[2 * i + 0] * A[2 * i + 0] - t2[2 * i + 1] * A[2 * i + 1]; + } +} + +/* IFFT using FFT with real and imaginary parts swapped */ +static void +ffts_chirp_z_transform_i_32f(struct _ffts_plan_t *p, const void *in, void *out) +{ + const float *A = FFTS_ASSUME_ALIGNED_32(p->A); + const float *B = FFTS_ASSUME_ALIGNED_32(p->B); + size_t i, M = p->plans[0]->N, N = p->N; + float *t1 = (float*) FFTS_ASSUME_ALIGNED_32(p->buf); + float *t2 = FFTS_ASSUME_ALIGNED_32(&t1[2 * M]); + const float *din = (const float*) in; + float *dout = (float*) out; + + /* we know this */ + FFTS_ASSUME(M >= 8); + + /* multiply input with conjugated sequence */ + for (i = 0; i < N; ++i) { + t1[2 * i + 0] = din[2 * i + 1] * A[2 * i + 0] + din[2 * i + 0] * A[2 * i + 1]; + t1[2 * i + 1] = din[2 * i + 0] * A[2 * i + 0] - din[2 * i + 1] * A[2 * i + 1]; + } + + /* zero pad */ + for (; i < M; ++i) + t1[2 * i] = t1[2 * i + 1] = 0.0f; + + /* convolution using FFT */ + p->plans[0]->transform(p->plans[0], t1, t2); + + /* complex multiply */ + for (i = 0; i < M; ++i) { + t1[2 * i + 0] = t2[2 * i + 1] * B[2 * i + 0] + t2[2 * i + 0] * B[2 * i + 1]; + t1[2 * i + 1] = t2[2 * i + 0] * B[2 * i + 0] - t2[2 * i + 1] * B[2 * i + 1]; + } + + /* IFFT using FFT with real and imaginary parts swapped */ + p->plans[0]->transform(p->plans[0], t1, t2); + + /* multiply output with conjugated sequence */ + for (i = 0; i < N; ++i) { + dout[2 * i + 0] = t2[2 * i + 0] * A[2 * i + 0] - t2[2 * i + 1] * A[2 * i + 1]; + dout[2 * i + 1] = t2[2 * i + 1] * A[2 * i + 0] + t2[2 * i + 0] * A[2 * i + 1]; + } +} diff --git a/lib/ffts/src/ffts_chirp_z.h b/lib/ffts/src/ffts_chirp_z.h new file mode 100644 index 0000000..2a6aa7b --- /dev/null +++ b/lib/ffts/src/ffts_chirp_z.h @@ -0,0 +1,45 @@ +/* + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2016, Jukka Ojanen + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef FFTS_CHIRP_Z_H +#define FFTS_CHIRP_Z_H + +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif + +#include "ffts.h" + +ffts_plan_t* +ffts_chirp_z_init(size_t N, int sign); + +#endif /* FFTS_CHIRP_Z_H */ diff --git a/lib/ffts/src/ffts_cpu.c b/lib/ffts/src/ffts_cpu.c new file mode 100644 index 0000000..daf92c8 --- /dev/null +++ b/lib/ffts/src/ffts_cpu.c @@ -0,0 +1,371 @@ +/* + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2018, Jukka Ojanen + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "ffts_cpu.h" + +#if defined(FFTS_BUILDING_CPU_TEST) +#include +#endif + +#if defined(_WIN32) +#include +#include +#endif + +/* TODO: add detection/declaration of these to CMake phase */ +#if !defined(FFTS_CPU_X64) +#if defined(_M_AMD64) || defined(__amd64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64) || defined(__x86_64__) +/* 64 bit x86 detected */ +#define FFTS_CPU_X64 +#endif +#endif + +#if !defined(FFTS_CPU_X64) && !defined(FFTS_CPU_X86) +#if defined(i386) || defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__X86__) || defined(_X86_) +/* 32 bit x86 detected */ +#define FFTS_CPU_X86 +#endif +#endif + +/* check if build is 32 bit or 64 bit x86 */ +#if defined(FFTS_CPU_X64) || defined(FFTS_CPU_X86) + +/* Build and tested on +CentOS 6.8 2.6.32-642.11.1.el6.x86_64 - gcc version 4.4.7 20120313 +Mac OSX 10.9 - Apple Clang 6.0 +Ubuntu 14.04 LTS 4.2.0-42 x86_64 - gcc version 4.8.4 +Windows XP SP3 - Visual Studio 2005 SP1 x86/x64 +Windows Vista SP2 - Visual Studio 2010 SP1 x86/x64 +Windows 7 Ultimate SP1 - Visual Studio 2015 x86/x64 +Windows 7 Ultimate SP1 - gcc version 4.9.2 (i686-posix-dwarf-rev1) +Windows 7 Ultimate SP1 - gcc version 4.9.2 (x86_64-posix-seh-rev3) +Windows 10 Pro - Visual Studio 2017 x86/x64 +*/ + +/* Visual Studio 2010 SP1 or newer have _xgetbv intrinsic */ +#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219) +#define FFTS_HAVE_XGETBV +#endif + +#ifndef BIT +#define BIT(n) (1u << n) +#endif + +/* bit masks */ +#define FFTS_CPU_X86_SSE_BITS (BIT(0) | BIT(15) | BIT(23) | BIT(24) | BIT(25)) +#define FFTS_CPU_X86_SSE2_BITS (BIT(26)) +#define FFTS_CPU_X86_SSE3_BITS (BIT(0)) +#define FFTS_CPU_X86_SSSE3_BITS (BIT(9)) +#define FFTS_CPU_X86_SSE4_1_BITS (BIT(19)) +#define FFTS_CPU_X86_SSE4_2_BITS (BIT(20) | BIT(23)) +#define FFTS_CPU_X86_AVX_BITS (BIT(26) | BIT(27) | BIT(28)) +#define FFTS_CPU_X86_XCR0_BITS ( +#define FFTS_CPU_X86_AVX2_BITS (BIT(5)) +#define FFTS_CPU_X86_AVX512_BITS (BIT(16)) + +/* Visual Studio 2008 or older */ +#if defined(FFTS_CPU_X64) && defined(_MSC_VER) && _MSC_VER <= 1500 +#pragma optimize("", off) +static void __fastcall ffts_cpuidex(int subleaf, int regs[4], int leaf) +{ + /* x64 uses a four register fast-call calling convention by default and + arguments are passed in registers RCX, RDX, R8, and R9. By disabling + optimization and passing subleaf as first argument we get __cpuidex + */ + (void) subleaf; + __cpuid(regs, leaf); +} +#pragma optimize("", on) +#endif + +static FFTS_INLINE void ffts_cpuid(int regs[4], int leaf, int subleaf) +{ +#if defined(_MSC_VER) +#if defined(FFTS_CPU_X64) + /* Visual Studio 2010 or newer */ +#if _MSC_VER > 1500 + __cpuidex(regs, leaf, subleaf); +#else + ffts_cpuidex(subleaf, regs, leaf); +#endif +#else + __asm { + mov eax, leaf + mov ecx, subleaf + mov esi, regs + cpuid + mov [esi + 0x0], eax + mov [esi + 0x4], ebx + mov [esi + 0x8], ecx + mov [esi + 0xc], edx + } +#endif +#elif defined(__GNUC__) && __GNUC__ +#if defined(FFTS_CPU_X64) + __asm__ __volatile__( + "cpuid\n\t" + : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3]) + : "a"(leaf), "c"(subleaf)); +#elif defined(__PIC__) + __asm__ __volatile__( + "xchgl %%ebx, %1\n\t" + "cpuid \n\t" + "xchgl %%ebx, %1\n\t" + : "=a"(regs[0]), "=r"(regs[1]), "=c"(regs[2]), "=d"(regs[3]) + : "a"(leaf), "c"(subleaf)); +#else + __asm__ __volatile__( + "cpuid\n\t" + : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3]) + : "a"(leaf), "c"(subleaf)); +#endif +#else + /* unknown compiler for x86 */ + regs[0] = regs[1] = regs[2] = regs[3] = 0; +#endif +} + +/* at least Visual Studio 2010 generates invalidate optimized _xgetbv */ +#if defined(FFTS_HAVE_XGETBV) +#pragma optimize("", off) +#endif +static FFTS_INLINE unsigned int ffts_get_xcr0(void) +{ +#if defined(FFTS_HAVE_XGETBV) + return (unsigned int) _xgetbv(0); +#elif defined(_MSC_VER) +#if defined(FFTS_CPU_X64) + /* emulate xgetbv(0) on Windows 7 SP1 or newer */ + typedef DWORD64 (WINAPI *PGETENABLEDXSTATEFEATURES)(VOID); + PGETENABLEDXSTATEFEATURES pfnGetEnabledXStateFeatures = + (PGETENABLEDXSTATEFEATURES) GetProcAddress( + GetModuleHandle(TEXT("kernel32.dll")), "GetEnabledXStateFeatures"); + return pfnGetEnabledXStateFeatures ? (unsigned int) pfnGetEnabledXStateFeatures() : 0; +#else + /* note that we have to touch edx register to tell compiler it's used by emited xgetbv */ + unsigned __int32 hi, lo; + __asm { + xor ecx, ecx + _emit 0x0f + _emit 0x01 + _emit 0xd0 + mov lo, eax + mov hi, edx + } + return (unsigned int) lo; +#endif +#elif defined(__GNUC__) && __GNUC__ + unsigned int lo; + __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0\n" + : "=a"(lo) + : "c"(0) + : "edx"); + return lo; +#else + /* unknown x86 compiler */ + return 0; +#endif +} +#if defined(FFTS_HAVE_XGETBV) +#pragma optimize("", on) +#endif + +int +ffts_cpu_detect(int *extra_flags) +{ + static int cpu_flags = -1; + static int cpu_extra_flags = -1; + int max_basic_func; + int regs[4]; + unsigned int xcr0; + + if (cpu_flags >= 0) { + goto exit; + } + + /* initialize */ + cpu_flags = cpu_extra_flags = 0; + +#if defined(FFTS_BUILDING_CPU_TEST) + printf("cpuid check: "); +#endif +#if defined(FFTS_CPU_X64) + /* cpuid is always supported on x64 */ +#if defined(FFTS_BUILDING_CPU_TEST) + printf("skipped\n"); +#endif +#else +#if defined(_MSC_VER) + _asm { + pushfd + pop eax + mov ebx,eax + xor eax,200000h + push eax + popfd + pushfd + pop eax + push ebx + popfd + mov regs[0 * TYPE regs],eax + mov regs[1 * TYPE regs],ebx + } +#else + __asm__ ( + "pushfl\n\t" + "pop %0\n\t" + "movl %0,%1\n\t" + "xorl $0x200000,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "pushl %1\n\t" + "popfl\n\t" + : "=r" (regs[0]), "=r" (regs[1]) + ); +#endif + /* check CPUID bit (bit 21) in EFLAGS register can be toggled */ + if (((regs[0] ^ regs[1]) & 0x200000) == 0) { +#if defined(FFTS_BUILDING_CPU_TEST) + printf("not supported\n"); +#endif + goto exit; + } +#if defined(FFTS_BUILDING_CPU_TEST) + printf("supported\n"); +#endif +#endif + + /* get the number of basic functions */ + ffts_cpuid(regs, 0, 0); + max_basic_func = regs[0]; +#if defined(FFTS_BUILDING_CPU_TEST) + printf("cpuid eax=0, ecx=0: %d\n", max_basic_func); +#endif + if (max_basic_func == 0) + goto exit; + + /* get feature flags */ + ffts_cpuid(regs, 1, 0); + +#if defined(FFTS_BUILDING_CPU_TEST) + printf("cpuid eax=1, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]); +#endif + +#if defined(FFTS_CPU_X64) + /* minimum for any x64 */ + cpu_flags = FFTS_CPU_X86_SSE | FFTS_CPU_X86_SSE2; +#else + /* test if SSE is supported */ + if ((regs[3] & FFTS_CPU_X86_SSE_BITS) != FFTS_CPU_X86_SSE_BITS) + goto exit; + cpu_flags = FFTS_CPU_X86_SSE; + + /* test if SSE2 is supported */ + if (!(regs[3] & FFTS_CPU_X86_SSE2_BITS)) + goto exit; + cpu_flags |= FFTS_CPU_X86_SSE2; +#endif + + /* test if SSE3 is supported */ + if (!(regs[2] & FFTS_CPU_X86_SSE3_BITS)) + goto exit; + cpu_flags |= FFTS_CPU_X86_SSE3; + + /* test if SSSE3 is supported */ + if (!(regs[2] & FFTS_CPU_X86_SSSE3_BITS)) + goto exit; + cpu_flags |= FFTS_CPU_X86_SSSE3; + + /* test if SSE4.1 is supported */ + if (!(regs[2] & FFTS_CPU_X86_SSE4_1_BITS)) + goto exit; + cpu_flags |= FFTS_CPU_X86_SSE4_1; + + /* test if SSE4.2 is supported */ + if ((regs[2] & FFTS_CPU_X86_SSE4_2_BITS) != FFTS_CPU_X86_SSE4_2_BITS) + goto exit; + cpu_flags |= FFTS_CPU_X86_SSE4_2; + + /* test if AVX is supported */ + if ((regs[2] & FFTS_CPU_X86_AVX_BITS) != FFTS_CPU_X86_AVX_BITS) + goto exit; + + /* test if legaxy x87, 128-bit SSE and 256-bit AVX states are enabled in XCR0 */ + xcr0 = ffts_get_xcr0(); +#if defined(FFTS_BUILDING_CPU_TEST) + printf("xcr0: %u\n", xcr0); +#endif + if ((xcr0 & 0x6) != 0x6) + goto exit; + + cpu_flags |= FFTS_CPU_X86_AVX; + + /* check that cpuid extended features exist */ + if (max_basic_func < 7) + goto exit; + + /* get extended features */ + ffts_cpuid(regs, 7, 0); + +#if defined(FFTS_BUILDING_CPU_TEST) + printf("cpuid eax=7, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]); +#endif + + /* test if AVX2 is supported */ + if ((regs[1] & FFTS_CPU_X86_AVX2_BITS) != FFTS_CPU_X86_AVX2_BITS) + goto exit; + cpu_flags |= FFTS_CPU_X86_AVX2; + + /* test if AVX512 is supported */ + if ((regs[1] & FFTS_CPU_X86_AVX512_BITS) != FFTS_CPU_X86_AVX512_BITS) + goto exit; + cpu_flags |= FFTS_CPU_X86_AVX512; + +exit: + if (extra_flags) { + *extra_flags = cpu_extra_flags; + } + return cpu_flags; +} +#else +int +ffts_cpu_detect(int *extra_flags) +{ + /* not implemented */ +#if defined(FFTS_BUILDING_CPU_TEST) + printf("CPU detection not implemented!!\n"); +#endif + return 0; +} +#endif \ No newline at end of file diff --git a/lib/ffts/src/ffts_cpu.h b/lib/ffts/src/ffts_cpu.h new file mode 100644 index 0000000..37d77e4 --- /dev/null +++ b/lib/ffts/src/ffts_cpu.h @@ -0,0 +1,54 @@ +/* + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2018, Jukka Ojanen + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef FFTS_CPU_H + +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif + +#include "ffts_internal.h" + +#define FFTS_CPU_X86_SSE 0x001 +#define FFTS_CPU_X86_SSE2 0x002 +#define FFTS_CPU_X86_SSE3 0x004 +#define FFTS_CPU_X86_SSSE3 0x008 +#define FFTS_CPU_X86_SSE4_1 0x010 +#define FFTS_CPU_X86_SSE4_2 0x020 +#define FFTS_CPU_X86_AVX 0x040 +#define FFTS_CPU_X86_AVX2 0x080 +#define FFTS_CPU_X86_AVX512 0x100 + +int +ffts_cpu_detect(int *extra_flags); + +#endif /* FFTS_CPU_H */ diff --git a/lib/ffts/src/ffts_internal.h b/lib/ffts/src/ffts_internal.h index 157c283..04ebb9c 100644 --- a/lib/ffts/src/ffts_internal.h +++ b/lib/ffts/src/ffts_internal.h @@ -2,6 +2,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South +Copyright (c) 2015-2016, Jukka Ojanen Copyright (c) 2012, Anthony M. Blake Copyright (c) 2012, The University of Waikato @@ -34,7 +35,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef FFTS_INTERNAL_H #define FFTS_INTERNAL_H +#ifdef AUTOTOOLS_BUILD #include "config.h" +#endif + #include "ffts_attributes.h" #include "types.h" @@ -42,18 +46,59 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif +#ifdef HAVE_MM_ALLOC_H +#include +#ifndef HAVE__MM_MALLOC +#define HAVE__MM_MALLOC +#endif +#endif + #include -#ifdef HAVE_STDINT_H +#ifdef HAVE_INTTYPES_H +#include +#elif HAVE_STDINT_H #include +#elif _MSC_VER +typedef __int32 int32_t; +typedef __int64 int64_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; +#else +typedef signed long int int32_t; +typedef unsigned long int uint32_t; +typedef signed long long int int64_t; +typedef unsigned long long int uint64_t; #endif #ifdef HAVE_STDLIB_H #include #endif +#ifdef HAVE_STRING_H +#include +#endif + #include +#if defined(HAVE_DECL_MEMALIGN) && !HAVE_DECL_MEMALIGN +extern void *memalign(size_t, size_t); +#endif + +#if defined(HAVE_DECL_POSIX_MEMALIGN) && !HAVE_DECL_POSIX_MEMALIGN +extern int posix_memalign(void **, size_t, size_t); +#endif + +#if defined(HAVE_DECL_VALLOC) && !HAVE_DECL_VALLOC +extern void *valloc(size_t); +#endif + +#ifdef _mm_malloc +#ifndef HAVE__MM_MALLOC +#define HAVE__MM_MALLOC +#endif +#endif + #ifdef ENABLE_LOG #ifdef __ANDROID__ #include @@ -142,11 +187,9 @@ struct _ffts_plan_t { */ size_t transform_size; - /** - * Points to the cosnant variables used by - * the Assembly Code - */ - void *constants; + /* pointer to the constant variable used by SSE for sign change */ + /* TODO: #ifdef HAVE_SSE */ + const void *constants; // multi-dimensional stuff: struct _ffts_plan_t **plans; @@ -171,44 +214,96 @@ struct _ffts_plan_t { size_t i2; }; -static FFTS_INLINE void *ffts_aligned_malloc(size_t size) +static FFTS_INLINE void* +ffts_aligned_malloc(size_t size) { -#if defined(_WIN32) - return _aligned_malloc(size, 32); + void *p = NULL; + + /* various ways to allocate aligned memory in order of preferance */ +#if defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC) + p = (void*) _mm_malloc(size, 32); +#elif defined(HAVE_POSIX_MEMALIGN) + if (posix_memalign(&p, 32, size)) + p = NULL; +#elif defined(HAVE_MEMALIGN) + p = memalign(32, size); +#elif defined(__ALTIVEC__) + p = vec_malloc(size); +#elif defined(_MSC_VER) || defined(WIN32) + p = _aligned_malloc(size, 32); +#elif defined(HAVE_VALLOC) + p = valloc(size); #else - return valloc(size); + p = malloc(size); #endif + + return p; } -static FFTS_INLINE void ffts_aligned_free(void *p) +static FFTS_INLINE +void ffts_aligned_free(void *p) { -#if defined(_WIN32) + /* order must match with ffts_aligned_malloc */ +#if defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC) + _mm_free(p); +#elif defined(HAVE_POSIX_MEMALIGN) || defined(HAVE_MEMALIGN) + free(p); +#elif defined(__ALTIVEC__) + vec_free(p); +#elif defined(_MSC_VER) || defined(WIN32) _aligned_free(p); #else + /* valloc or malloc */ free(p); #endif } #if GCC_VERSION_AT_LEAST(3,3) #define ffts_ctzl __builtin_ctzl + +static FFTS_INLINE size_t +ffts_next_power_of_2(size_t N) +{ + return 1 << (32 - __builtin_clzl(N)); +} #elif defined(_MSC_VER) #include #ifdef _M_X64 #pragma intrinsic(_BitScanForward64) -static __inline unsigned long ffts_ctzl(size_t N) +static FFTS_INLINE unsigned long +ffts_ctzl(size_t N) { unsigned long count; _BitScanForward64((unsigned long*) &count, N); return count; } + +#pragma intrinsic(_BitScanReverse64) +static FFTS_INLINE size_t +ffts_next_power_of_2(size_t N) +{ + unsigned long log_2; + _BitScanReverse64((unsigned long*)&log_2, N); + return 1ULL << (log_2 + 1); +} #else #pragma intrinsic(_BitScanForward) -static __inline unsigned long ffts_ctzl(size_t N) +static FFTS_INLINE unsigned long +ffts_ctzl(size_t N) { unsigned long count; _BitScanForward((unsigned long*) &count, N); return count; } + +#pragma intrinsic(_BitScanReverse) +static FFTS_INLINE size_t +ffts_next_power_of_2(size_t N) +{ + unsigned long log_2; + _BitScanReverse((unsigned long*)&log_2, N); + return 1 << (log_2 + 1); +} #endif /* _WIN64 */ #endif /* _MSC_VER */ diff --git a/lib/ffts/src/ffts_real.c b/lib/ffts/src/ffts_real.c index 0f87a12..e0f0e1f 100644 --- a/lib/ffts/src/ffts_real.c +++ b/lib/ffts/src/ffts_real.c @@ -4,7 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South Copyright (c) 2012, Anthony M. Blake Copyright (c) 2012, The University of Waikato -Copyright (c) 2015, Jukka Ojanen +Copyright (c) 2015 - 2018, Jukka Ojanen All rights reserved. @@ -33,6 +33,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "ffts_real.h" +#include "ffts_cpu.h" #include "ffts_internal.h" #include "ffts_trig.h" @@ -46,7 +47,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #elif HAVE_INTRIN_H #include -#else +#endif + /* avoid using negative zero as some configurations have problems with those */ static const FFTS_ALIGN(16) unsigned int sign_mask_even[4] = { 0x80000000, 0, 0x80000000, 0 @@ -55,7 +57,6 @@ static const FFTS_ALIGN(16) unsigned int sign_mask_odd[4] = { 0, 0x80000000, 0, 0x80000000 }; #endif -#endif static void ffts_free_1d_real(ffts_plan_t *p) @@ -79,8 +80,9 @@ ffts_free_1d_real(ffts_plan_t *p) free(p); } +#ifdef __ARM_NEON__ static void -ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) +ffts_execute_1d_real_neon(ffts_plan_t *p, const void *input, void *output) { float *const FFTS_RESTRICT out = (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output); @@ -91,25 +93,19 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) const float *const FFTS_RESTRICT B = (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); const int N = (const int) p->N; - int i; - -#ifdef __ARM_NEON__ float *p_buf0 = buf; float *p_buf1 = buf + N - 2; float *p_out = out; -#endif + int i; /* we know this */ FFTS_ASSUME(N/2 > 0); p->plans[0]->transform(p->plans[0], input, buf); -#ifndef HAVE_SSE buf[N + 0] = buf[0]; buf[N + 1] = buf[1]; -#endif -#ifdef __ARM_NEON__ for (i = 0; i < N; i += 4) { __asm__ __volatile__ ( "vld1.32 {q8}, [%[pa]]!\n\t" @@ -151,7 +147,35 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -#elif HAVE_SSE3 + + out[N + 0] = buf[0] - buf[1]; + out[N + 1] = 0.0f; +} +#endif + +#if HAVE_SSE3 +static void +ffts_execute_1d_real_sse3(ffts_plan_t *p, const void *input, void *output) +{ + float *const FFTS_RESTRICT out = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; + + /* we know this */ + FFTS_ASSUME(N/2 > 0); + + p->plans[0]->transform(p->plans[0], input, buf); + + buf[N + 0] = buf[0]; + buf[N + 1] = buf[1]; + if (FFTS_UNLIKELY(N <= 8)) { __m128 t0 = _mm_load_ps(buf); __m128 t1 = _mm_load_ps(buf + N - 4); @@ -235,7 +259,32 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)))); } } -#elif HAVE_SSE + + out[N + 0] = buf[0] - buf[1]; + out[N + 1] = 0.0f; +} +#endif + +#ifdef HAVE_SSE +static void +ffts_execute_1d_real_sse(ffts_plan_t *p, const void *input, void *output) +{ + float *const FFTS_RESTRICT out = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; + + /* we know this */ + FFTS_ASSUME(N/2 > 0); + + p->plans[0]->transform(p->plans[0], input, buf); + if (FFTS_UNLIKELY(N <= 8)) { __m128 c0 = _mm_load_ps((const float*) sign_mask_even); __m128 t0 = _mm_load_ps(buf); @@ -327,7 +376,34 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) _MM_SHUFFLE(2,3,0,1))))); } } -#else + + out[N + 0] = buf[0] - buf[1]; + out[N + 1] = 0.0f; +} +#endif + +static void +ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) +{ + float *const FFTS_RESTRICT out = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; + + /* we know this */ + FFTS_ASSUME(N/2 > 0); + + p->plans[0]->transform(p->plans[0], input, buf); + + buf[N + 0] = buf[0]; + buf[N + 1] = buf[1]; + for (i = 0; i < N/2; i++) { out[2*i + 0] = buf[ 2*i + 0] * A[2*i + 0] - buf[ 2*i + 1] * A[2*i + 1] + @@ -336,14 +412,14 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) buf[ 2*i + 1] * A[2*i + 0] + buf[ 2*i + 0] * A[2*i + 1] + buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0]; } -#endif out[N + 0] = buf[0] - buf[1]; out[N + 1] = 0.0f; } +#ifdef __ARM_NEON__ static void -ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) +ffts_execute_1d_real_inv_neon(ffts_plan_t *p, const void *input, void *output) { float *const FFTS_RESTRICT in = (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input); @@ -354,18 +430,14 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) const float *const FFTS_RESTRICT B = (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); const int N = (const int) p->N; - int i; - -#ifdef __ARM_NEON__ float *p_buf0 = in; float *p_buf1 = in + N - 2; float *p_out = buf; -#endif + int i; /* we know this */ FFTS_ASSUME(N/2 > 0); -#ifdef __ARM_NEON__ for (i = 0; i < N/2; i += 2) { __asm__ __volatile__ ( "vld1.32 {q8}, [%[pa]]!\n\t" @@ -407,7 +479,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -#elif HAVE_SSE3 + + p->plans[0]->transform(p->plans[0], buf, output); +} +#endif + +#if HAVE_SSE3 +static void +ffts_execute_1d_real_inv_sse3(ffts_plan_t *p, const void *input, void *output) +{ + float *const FFTS_RESTRICT in = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; + + /* we know this */ + FFTS_ASSUME(N/2 > 0); + if (FFTS_UNLIKELY(N <= 8)) { __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); __m128 t1 = _mm_load_ps(in); @@ -492,7 +586,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)))); } } -#elif HAVE_SSE + + p->plans[0]->transform(p->plans[0], buf, output); +} +#endif + +#if HAVE_SSE +static void +ffts_execute_1d_real_inv_sse(ffts_plan_t *p, const void *input, void *output) +{ + float *const FFTS_RESTRICT in = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; + + /* we know this */ + FFTS_ASSUME(N/2 > 0); + if (FFTS_UNLIKELY(N <= 8)) { __m128 c0 = _mm_load_ps((const float*) sign_mask_odd); __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); @@ -585,7 +701,28 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) _mm_xor_ps(t4, c0)))); } } -#else + + p->plans[0]->transform(p->plans[0], buf, output); +} +#endif + +static void +ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) +{ + float *const FFTS_RESTRICT in = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; + + /* we know this */ + FFTS_ASSUME(N/2 > 0); + for (i = 0; i < N/2; i++) { buf[2*i + 0] = in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] + @@ -594,7 +731,6 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] - in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0]; } -#endif p->plans[0]->transform(p->plans[0], buf, output); } @@ -602,18 +738,35 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) FFTS_API ffts_plan_t* ffts_init_1d_real(size_t N, int sign) { +#ifndef __ARM_NEON__ + int cpu_flags = ffts_cpu_detect(NULL); +#endif ffts_plan_t *p; + int invert = 0; p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans)); if (!p) { return NULL; } - if (sign < 0) { - p->transform = &ffts_execute_1d_real; - } else { - p->transform = &ffts_execute_1d_real_inv; +#ifdef __ARM_NEON__ + p->transform = (sign < 0) ? &ffts_execute_1d_real_neon : &ffts_execute_1d_real_inv; +#else +#ifdef HAVE_SSE3 + if (cpu_flags & FFTS_CPU_X86_SSE3) { + p->transform = (sign < 0) ? &ffts_execute_1d_real_sse3 : &ffts_execute_1d_real_inv_sse3; + invert = 1; + } else +#endif +#ifdef HAVE_SSE + if (cpu_flags & FFTS_CPU_X86_SSE) { + p->transform = (sign < 0) ? &ffts_execute_1d_real_sse : &ffts_execute_1d_real_inv_sse; + } else +#endif + { + p->transform = (sign < 0) ? &ffts_execute_1d_real : &ffts_execute_1d_real_inv; } +#endif p->destroy = &ffts_free_1d_real; p->N = N; @@ -640,12 +793,7 @@ ffts_init_1d_real(size_t N, int sign) goto cleanup; } -#ifdef HAVE_SSE3 - ffts_generate_table_1d_real_32f(p, sign, 1); -#else - ffts_generate_table_1d_real_32f(p, sign, 0); -#endif - + ffts_generate_table_1d_real_32f(p, sign, invert); return p; cleanup: diff --git a/lib/ffts/src/ffts_static.c b/lib/ffts/src/ffts_static.c index 09de6d7..87d8b23 100644 --- a/lib/ffts/src/ffts_static.c +++ b/lib/ffts/src/ffts_static.c @@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South Copyright (c) 2012, Anthony M. Blake Copyright (c) 2012, The University of Waikato +Copyright (c) 2018, Jukka Ojanen All rights reserved. @@ -258,6 +259,29 @@ static const FFTS_ALIGN(16) double ffts_constants_inv_64f[16] = { -0.7071067811865475244008443621048490392848359376884740 }; +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_K_0(int inv, + V4DF *r0, + V4DF *r1, + V4DF *r2, + V4DF *r3) +{ + V4DF t0, t1, t2, t3; + + t0 = *r0; + t1 = *r1; + + t2 = V4DF_ADD(*r2, *r3); + t3 = V4DF_IMULI(inv, V4DF_SUB(*r2, *r3)); + + *r0 = V4DF_ADD(t0, t2); + *r2 = V4DF_SUB(t0, t2); + *r1 = V4DF_SUB(t1, t3); + *r3 = V4DF_ADD(t1, t3); +} +#endif + static FFTS_INLINE void V4SF_K_0(int inv, V4SF *r0, @@ -279,6 +303,31 @@ V4SF_K_0(int inv, *r3 = V4SF_ADD(t1, t3); } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_L_2(const double *FFTS_RESTRICT i0, + const double *FFTS_RESTRICT i1, + const double *FFTS_RESTRICT i2, + const double *FFTS_RESTRICT i3, + V4DF *r0, + V4DF *r1, + V4DF *r2, + V4DF *r3) +{ + V4DF t0, t1, t2, t3; + + t0 = V4DF_LD(i0); + t1 = V4DF_LD(i1); + t2 = V4DF_LD(i2); + t3 = V4DF_LD(i3); + + *r0 = V4DF_ADD(t0, t1); + *r1 = V4DF_SUB(t0, t1); + *r2 = V4DF_ADD(t2, t3); + *r3 = V4DF_SUB(t2, t3); +} +#endif + static FFTS_INLINE void V4SF_L_2(const float *FFTS_RESTRICT i0, const float *FFTS_RESTRICT i1, @@ -302,6 +351,37 @@ V4SF_L_2(const float *FFTS_RESTRICT i0, *r3 = V4SF_SUB(t2, t3); } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_L_4(int inv, + const double *FFTS_RESTRICT i0, + const double *FFTS_RESTRICT i1, + const double *FFTS_RESTRICT i2, + const double *FFTS_RESTRICT i3, + V4DF *r0, + V4DF *r1, + V4DF *r2, + V4DF *r3) +{ + V4DF t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = V4DF_LD(i0); + t1 = V4DF_LD(i1); + t2 = V4DF_LD(i2); + t3 = V4DF_LD(i3); + + t4 = V4DF_ADD(t0, t1); + t5 = V4DF_SUB(t0, t1); + t6 = V4DF_ADD(t2, t3); + t7 = V4DF_IMULI(inv, V4DF_SUB(t2, t3)); + + *r0 = V4DF_ADD(t4, t6); + *r2 = V4DF_SUB(t4, t6); + *r1 = V4DF_SUB(t5, t7); + *r3 = V4DF_ADD(t5, t7); +} +#endif + static FFTS_INLINE void V4SF_L_4(int inv, const float *FFTS_RESTRICT i0, @@ -331,6 +411,36 @@ V4SF_L_4(int inv, *r3 = V4SF_ADD(t5, t7); } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_LEAF_EE(double *const FFTS_RESTRICT out, + const ptrdiff_t *FFTS_RESTRICT os, + const double *FFTS_RESTRICT in, + const ptrdiff_t *FFTS_RESTRICT is, + int inv) +{ + const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f; + + V4DF r0, r1, r2, r3, r4, r5, r6, r7; + + double *out0 = out + os[0]; + double *out1 = out + os[1]; + + V4DF_L_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3); + V4DF_L_2(in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7); + + V4DF_K_0(inv, &r0, &r2, &r4, &r6); + V4DF_K_N(inv, V4DF_LD(LUT + 0), V4DF_LD(LUT + 4), &r1, &r3, &r5, &r7); + V4DF_TX2(&r0, &r1); + V4DF_TX2(&r2, &r3); + V4DF_TX2(&r4, &r5); + V4DF_TX2(&r6, &r7); + + V4DF_S_4(r0, r2, r4, r6, out0 + 0, out0 + 4, out0 + 8, out0 + 12); + V4DF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12); +} +#endif + static FFTS_INLINE void V4SF_LEAF_EE(float *const FFTS_RESTRICT out, const ptrdiff_t *FFTS_RESTRICT os, @@ -359,6 +469,36 @@ V4SF_LEAF_EE(float *const FFTS_RESTRICT out, V4SF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12); } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_LEAF_EE2(double *const FFTS_RESTRICT out, + const ptrdiff_t *FFTS_RESTRICT os, + const double *FFTS_RESTRICT in, + const ptrdiff_t *FFTS_RESTRICT is, + int inv) +{ + const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f; + + V4DF r0, r1, r2, r3, r4, r5, r6, r7; + + double *out0 = out + os[0]; + double *out1 = out + os[1]; + + V4DF_L_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r0, &r1, &r2, &r3); + V4DF_L_2(in + is[0], in + is[1], in + is[3], in + is[2], &r4, &r5, &r6, &r7); + + V4DF_K_0(inv, &r0, &r2, &r4, &r6); + V4DF_K_N(inv, V4DF_LD(LUT + 0), V4DF_LD(LUT + 4), &r1, &r3, &r5, &r7); + V4DF_TX2(&r0, &r1); + V4DF_TX2(&r2, &r3); + V4DF_TX2(&r4, &r5); + V4DF_TX2(&r6, &r7); + + V4DF_S_4(r0, r2, r4, r6, out0 + 0, out0 + 4, out0 + 8, out0 + 12); + V4DF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12); +} +#endif + static FFTS_INLINE void V4SF_LEAF_EE2(float *const FFTS_RESTRICT out, const ptrdiff_t *FFTS_RESTRICT os, @@ -387,6 +527,30 @@ V4SF_LEAF_EE2(float *const FFTS_RESTRICT out, V4SF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12); } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_LEAF_EO(double *const FFTS_RESTRICT out, + const ptrdiff_t *FFTS_RESTRICT os, + const double *FFTS_RESTRICT in, + const ptrdiff_t *FFTS_RESTRICT is, + int inv) +{ + const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f; + + V4DF r0, r1, r2, r3, r4, r5, r6, r7; + + double *out0 = out + os[0]; + double *out1 = out + os[1]; + + V4DF_L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3); + V4DF_L_2_4(inv, in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7); + + V4DF_S_4(r2, r3, r7, r6, out1 + 0, out1 + 4, out1 + 8, out1 + 12); + V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r0, &r1, &r4, &r5); + V4DF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12); +} +#endif + static FFTS_INLINE void V4SF_LEAF_EO(float *const FFTS_RESTRICT out, const ptrdiff_t *FFTS_RESTRICT os, @@ -409,6 +573,30 @@ V4SF_LEAF_EO(float *const FFTS_RESTRICT out, V4SF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12); } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_LEAF_OE(double *const FFTS_RESTRICT out, + const ptrdiff_t *FFTS_RESTRICT os, + const double *FFTS_RESTRICT in, + const ptrdiff_t *FFTS_RESTRICT is, + int inv) +{ + const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f; + + V4DF r0, r1, r2, r3, r4, r5, r6, r7; + + double *out0 = out + os[0]; + double *out1 = out + os[1]; + + V4DF_L_4_2(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3); + V4DF_L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7); + + V4DF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12); + V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r6, &r7, &r2, &r3); + V4DF_S_4(r6, r7, r2, r3, out1 + 0, out1 + 4, out1 + 8, out1 + 12); +} +#endif + static FFTS_INLINE void V4SF_LEAF_OE(float *const FFTS_RESTRICT out, const ptrdiff_t *FFTS_RESTRICT os, @@ -431,6 +619,27 @@ V4SF_LEAF_OE(float *const FFTS_RESTRICT out, V4SF_S_4(r6, r7, r2, r3, out1 + 0, out1 + 4, out1 + 8, out1 + 12); } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_LEAF_OO(double *const FFTS_RESTRICT out, + const ptrdiff_t *FFTS_RESTRICT os, + const double *FFTS_RESTRICT in, + const ptrdiff_t *FFTS_RESTRICT is, + int inv) +{ + V4DF r0, r1, r2, r3, r4, r5, r6, r7; + + double *out0 = out + os[0]; + double *out1 = out + os[1]; + + V4DF_L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3); + V4DF_L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7); + + V4DF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12); + V4DF_S_4(r2, r3, r6, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12); +} +#endif + static FFTS_INLINE void V4SF_LEAF_OO(float *const FFTS_RESTRICT out, const ptrdiff_t *FFTS_RESTRICT os, @@ -450,6 +659,34 @@ V4SF_LEAF_OO(float *const FFTS_RESTRICT out, V4SF_S_4(r2, r3, r6, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12); } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_X_4(int inv, + double *FFTS_RESTRICT data, + size_t N, + const double *FFTS_RESTRICT LUT) +{ + size_t i; + + for (i = 0; i < N/8; i++) { + V4DF r0 = V4DF_LD(data); + V4DF r1 = V4DF_LD(data + 2*N/4); + V4DF r2 = V4DF_LD(data + 4*N/4); + V4DF r3 = V4DF_LD(data + 6*N/4); + + V4DF_K_N(inv, V4DF_LD(LUT), V4DF_LD(LUT + 4), &r0, &r1, &r2, &r3); + + V4DF_ST(data , r0); + V4DF_ST(data + 2*N/4, r1); + V4DF_ST(data + 4*N/4, r2); + V4DF_ST(data + 6*N/4, r3); + + LUT += 8; + data += 4; + } +} +#endif + static FFTS_INLINE void V4SF_X_4(int inv, float *FFTS_RESTRICT data, @@ -536,6 +773,68 @@ V4SF_X_8(int inv, } } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_X_8(int inv, + double *FFTS_RESTRICT data0, + size_t N, + const double *FFTS_RESTRICT LUT) +{ + double *data1 = data0 + 1*N/4; + double *data2 = data0 + 2*N/4; + double *data3 = data0 + 3*N/4; + double *data4 = data0 + 4*N/4; + double *data5 = data0 + 5*N/4; + double *data6 = data0 + 6*N/4; + double *data7 = data0 + 7*N/4; + size_t i; + + for (i = 0; i < N/16; i++) { + V4DF r0, r1, r2, r3, r4, r5, r6, r7; + + r0 = V4DF_LD(data0); + r1 = V4DF_LD(data1); + r2 = V4DF_LD(data2); + r3 = V4DF_LD(data3); + + V4DF_K_N(inv, V4DF_LD(LUT), V4DF_LD(LUT + 4), &r0, &r1, &r2, &r3); + r4 = V4DF_LD(data4); + r6 = V4DF_LD(data6); + + V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r0, &r2, &r4, &r6); + r5 = V4DF_LD(data5); + r7 = V4DF_LD(data7); + + V4DF_K_N(inv, V4DF_LD(LUT + 16), V4DF_LD(LUT + 20), &r1, &r3, &r5, &r7); + LUT += 24; + + V4DF_ST(data0, r0); + data0 += 4; + + V4DF_ST(data1, r1); + data1 += 4; + + V4DF_ST(data2, r2); + data2 += 4; + + V4DF_ST(data3, r3); + data3 += 4; + + V4DF_ST(data4, r4); + data4 += 4; + + V4DF_ST(data5, r5); + data5 += 4; + + V4DF_ST(data6, r6); + data6 += 4; + + V4DF_ST(data7, r7); + data7 += 4; + } +} +#endif + static FFTS_INLINE void ffts_static_firstpass_odd_32f(float *const FFTS_RESTRICT out, const float *FFTS_RESTRICT in, @@ -569,6 +868,41 @@ ffts_static_firstpass_odd_32f(float *const FFTS_RESTRICT out, } } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +ffts_static_firstpass_odd_64f(double *const FFTS_RESTRICT out, + const double *FFTS_RESTRICT in, + const ffts_plan_t *FFTS_RESTRICT p, + int inv) +{ + size_t i, i0 = p->i0, i1 = p->i1; + const ptrdiff_t *is = (const ptrdiff_t*) p->is; + const ptrdiff_t *os = (const ptrdiff_t*) p->offsets; + + for (i = i0; i > 0; --i) { + V4DF_LEAF_EE(out, os, in, is, inv); + in += 4; + os += 2; + } + + for (i = i1; i > 0; --i) { + V4DF_LEAF_OO(out, os, in, is, inv); + in += 4; + os += 2; + } + + V4DF_LEAF_OE(out, os, in, is, inv); + in += 4; + os += 2; + + for (i = i1; i > 0; --i) { + V4DF_LEAF_EE2(out, os, in, is, inv); + in += 4; + os += 2; + } +} +#endif + void ffts_small_2_32f(ffts_plan_t *p, const void *in, void *out) { @@ -789,23 +1123,23 @@ ffts_small_forward8_32f(ffts_plan_t *p, const void *in, void *out) V4SF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); } +#ifdef FFTS_DOUBLE void ffts_small_forward8_64f(ffts_plan_t *p, const void *in, void *out) { + const double *FFTS_RESTRICT lut = ffts_constants_small_64f; const double *din = (const double*) in; double *dout = (double*) out; -// V4SF r0_1, r2_3, r4_5, r6_7; -// double *LUT8 = (double*) p->ws + p->ws_is[0]; + V4DF r0_1, r2_3, r4_5, r6_7; + + /* unreferenced parameter */ (void) p; - (void) din; - (void) dout; -#if MACROS_READY - L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); - K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); - S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); -#endif + V4DF_L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); + V4DF_K_N(0, V4DF_LD(lut), V4DF_LD(lut + 4), &r0_1, &r2_3, &r4_5, &r6_7); + V4DF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); } +#endif void ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out) @@ -823,24 +1157,23 @@ ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out) V4SF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); } +#ifdef FFTS_DOUBLE void ffts_small_backward8_64f(ffts_plan_t *p, const void *in, void *out) { + const double *FFTS_RESTRICT lut = ffts_constants_small_inv_64f; const double *din = (const double*) in; double *dout = (double*) out; -// V4SF r0_1, r2_3, r4_5, r6_7; -// double *LUT8 = (double*) p->ws + p->ws_is[0]; - (void) p; - (void) din; - (void) dout; + V4DF r0_1, r2_3, r4_5, r6_7; + /* unreferenced parameter */ + (void) p; -#if MACROS_READY - L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); - K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); - S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); -#endif + V4DF_L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); + V4DF_K_N(1, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7); + V4DF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); } +#endif void ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out) @@ -862,27 +1195,27 @@ ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out) V4SF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); } +#ifdef FFTS_DOUBLE void ffts_small_forward16_64f(ffts_plan_t *p, const void *in, void *out) { + const double *FFTS_RESTRICT lut = ffts_constants_small_64f; const double *din = (const double*) in; double *dout = (double*) out; -// double *LUT8 = (double*) p->ws; -// V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; + V4DF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; + + /* unreferenced parameter */ (void) p; - (void) din; - (void) dout; - -#ifdef MACROS_READY - L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); - L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13); - K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); - K_N(0, VLD(LUT8+8), VLD(LUT8+12), &r0_1, &r4_5, &r8_9, &r12_13); - S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24); - K_N(0, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15); - S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); -#endif + + V4DF_L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); + V4DF_L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13); + V4DF_K_N(0, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7); + V4DF_K_N(0, V4DF_LD(lut+8), V4DF_LD(lut+12), &r0_1, &r4_5, &r8_9, &r12_13); + V4DF_S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24); + V4DF_K_N(0, V4DF_LD(lut+16), V4DF_LD(lut+20), &r2_3, &r6_7, &r10_11, &r14_15); + V4DF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); } +#endif void ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out) @@ -904,27 +1237,27 @@ ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out) V4SF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); } +#ifdef FFTS_DOUBLE void ffts_small_backward16_64f(ffts_plan_t *p, const void *in, void *out) { + const double *FFTS_RESTRICT lut = ffts_constants_small_inv_64f; const double *din = (const double*) in; double *dout = (double*) out; -// double *LUT8 = (double*) p->ws; -// V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; + V4DF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; + + /* unreferenced parameter */ (void) p; - (void) din; - (void) dout; - -#ifdef MACROS_READY - L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); - L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13); - K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); - K_N(1, VLD(LUT8+8), VLD(LUT8+12),&r0_1, &r4_5, &r8_9, &r12_13); - S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24); - K_N(1, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15); - S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); -#endif + + V4DF_L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); + V4DF_L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13); + V4DF_K_N(1, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7); + V4DF_K_N(1, V4DF_LD(lut+8), V4DF_LD(lut+12), &r0_1, &r4_5, &r8_9, &r12_13); + V4DF_S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24); + V4DF_K_N(1, V4DF_LD(lut+16), V4DF_LD(lut+20), &r2_3, &r6_7, &r10_11, &r14_15); + V4DF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); } +#endif static FFTS_INLINE void ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out, @@ -959,6 +1292,41 @@ ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out, } } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +ffts_static_firstpass_even_64f(double *FFTS_RESTRICT out, + const double *FFTS_RESTRICT in, + const ffts_plan_t *FFTS_RESTRICT p, + int inv) +{ + size_t i, i0 = p->i0, i1 = p->i1; + const ptrdiff_t *is = (const ptrdiff_t*) p->is; + const ptrdiff_t *os = (const ptrdiff_t*) p->offsets; + + for(i = i0; i > 0; --i) { + V4DF_LEAF_EE(out, os, in, is, inv); + in += 4; + os += 2; + } + + V4DF_LEAF_EO(out, os, in, is, inv); + in += 4; + os += 2; + + for (i = i1; i > 0; --i) { + V4DF_LEAF_OO(out, os, in, is, inv); + in += 4; + os += 2; + } + + for (i = i1; i > 0; --i) { + V4DF_LEAF_EE2(out, os, in, is, inv); + in += 4; + os += 2; + } +} +#endif + static void ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N) { @@ -1035,6 +1403,47 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N) #endif } +#ifdef FFTS_DOUBLE +static void +ffts_static_rec_f_64f(const ffts_plan_t *p, double *data, size_t N) +{ + const double *ws = (const double*) p->ws; + + if (N > 128) { + const size_t N1 = N >> 1; + const size_t N2 = N >> 2; + const size_t N3 = N >> 3; + + ffts_static_rec_f_64f(p, data , N2); + ffts_static_rec_f_64f(p, data + N1 , N3); + ffts_static_rec_f_64f(p, data + N1 + N2, N3); + ffts_static_rec_f_64f(p, data + N , N2); + ffts_static_rec_f_64f(p, data + N + N1 , N2); + + V4DF_X_8(0, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1)); + } else if (N == 128) { + const double *ws1 = ws + (p->ws_is[1] << 1); + + V4DF_X_8(0, data + 0, 32, ws1); + V4DF_X_4(0, data + 64, 16, ws); + V4DF_X_4(0, data + 96, 16, ws); + V4DF_X_8(0, data + 128, 32, ws1); + V4DF_X_8(0, data + 192, 32, ws1); + + V4DF_X_8(0, data, 128, ws + (p->ws_is[3] << 1)); + } else if (N == 64) { + V4DF_X_4(0, data + 0, 16, ws); + V4DF_X_4(0, data + 64, 16, ws); + V4DF_X_4(0, data + 96, 16, ws); + + V4DF_X_8(0, data, 64, ws + (p->ws_is[2] << 1)); + } else { + assert(N == 32); + V4DF_X_8(0, data, 32, ws + (p->ws_is[1] << 1)); + } +} +#endif + static void ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N) { @@ -1111,6 +1520,47 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N) #endif } +#ifdef FFTS_DOUBLE +static void +ffts_static_rec_i_64f(const ffts_plan_t *p, double *data, size_t N) +{ + const double *ws = (const double*) p->ws; + + if (N > 128) { + const size_t N1 = N >> 1; + const size_t N2 = N >> 2; + const size_t N3 = N >> 3; + + ffts_static_rec_i_64f(p, data , N2); + ffts_static_rec_i_64f(p, data + N1 , N3); + ffts_static_rec_i_64f(p, data + N1 + N2, N3); + ffts_static_rec_i_64f(p, data + N , N2); + ffts_static_rec_i_64f(p, data + N + N1 , N2); + + V4DF_X_8(1, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1)); + } else if (N == 128) { + const double *ws1 = ws + (p->ws_is[1] << 1); + + V4DF_X_8(1, data + 0, 32, ws1); + V4DF_X_4(1, data + 64, 16, ws); + V4DF_X_4(1, data + 96, 16, ws); + V4DF_X_8(1, data + 128, 32, ws1); + V4DF_X_8(1, data + 192, 32, ws1); + + V4DF_X_8(1, data, 128, ws + (p->ws_is[3] << 1)); + } else if (N == 64) { + V4DF_X_4(1, data + 0, 16, ws); + V4DF_X_4(1, data + 64, 16, ws); + V4DF_X_4(1, data + 96, 16, ws); + + V4DF_X_8(1, data, 64, ws + (p->ws_is[2] << 1)); + } else { + assert(N == 32); + V4DF_X_8(1, data, 32, ws + (p->ws_is[1] << 1)); + } +} +#endif + void ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out) { @@ -1172,6 +1622,26 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out) #endif } +#ifdef FFTS_DOUBLE +void +ffts_static_transform_f_64f(ffts_plan_t *p, const void *in, void *out) +{ + const double *din = (const double*) in; + double *dout = (double*) out; + + const size_t N = p->N; + const int N_log_2 = ffts_ctzl(N); + + if (N_log_2 & 1) { + ffts_static_firstpass_odd_64f(dout, din, p, 0); + } else { + ffts_static_firstpass_even_64f(dout, din, p, 0); + } + + ffts_static_rec_f_64f(p, dout, N); +} +#endif + void ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out) { @@ -1231,4 +1701,24 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out) ffts_static_rec_i_32f(p, dout, N); #endif -} \ No newline at end of file +} + +#ifdef FFTS_DOUBLE +void +ffts_static_transform_i_64f(ffts_plan_t *p, const void *in, void *out) +{ + const double *din = (const double*) in; + double *dout = (double*) out; + + const size_t N = p->N; + const int N_log_2 = ffts_ctzl(N); + + if (N_log_2 & 1) { + ffts_static_firstpass_odd_64f(dout, din, p, 1); + } else { + ffts_static_firstpass_even_64f(dout, din, p, 1); + } + + ffts_static_rec_i_64f(p, dout, N); +} +#endif \ No newline at end of file diff --git a/lib/ffts/src/ffts_static.h b/lib/ffts/src/ffts_static.h index 5a42fc2..5de0059 100644 --- a/lib/ffts/src/ffts_static.h +++ b/lib/ffts/src/ffts_static.h @@ -43,49 +43,73 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. void ffts_small_2_32f(ffts_plan_t *p, const void *in, void *out); +#ifdef FFTS_DOUBLE void ffts_small_2_64f(ffts_plan_t *p, const void *in, void *out); +#endif void ffts_small_forward4_32f(ffts_plan_t *p, const void *in, void *out); +#ifdef FFTS_DOUBLE void ffts_small_forward4_64f(ffts_plan_t *p, const void *in, void *out); +#endif void ffts_small_backward4_32f(ffts_plan_t *p, const void *in, void *out); +#ifdef FFTS_DOUBLE void ffts_small_backward4_64f(ffts_plan_t *p, const void *in, void *out); +#endif void ffts_small_forward8_32f(ffts_plan_t *p, const void *in, void *out); +#ifdef FFTS_DOUBLE void ffts_small_forward8_64f(ffts_plan_t *p, const void *in, void *out); +#endif void ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out); +#ifdef FFTS_DOUBLE void ffts_small_backward8_64f(ffts_plan_t *p, const void *in, void *out); +#endif void ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out); +#ifdef FFTS_DOUBLE void ffts_small_forward16_64f(ffts_plan_t *p, const void *in, void *out); +#endif void ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out); +#ifdef FFTS_DOUBLE void ffts_small_backward16_64f(ffts_plan_t *p, const void *in, void *out); +#endif void ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out); +#ifdef FFTS_DOUBLE +void +ffts_static_transform_f_64f(ffts_plan_t *p, const void *in, void *out); +#endif + void ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out); +#ifdef FFTS_DOUBLE +void +ffts_static_transform_i_64f(ffts_plan_t *p, const void *in, void *out); +#endif + #endif /* FFTS_STATIC_H */ diff --git a/lib/ffts/src/ffts_trig.c b/lib/ffts/src/ffts_trig.c index 74ebfd2..65efa86 100644 --- a/lib/ffts/src/ffts_trig.c +++ b/lib/ffts/src/ffts_trig.c @@ -2,7 +2,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South -Copyright (c) 2015, Jukka Ojanen +Copyright (c) 2015-2016, Jukka Ojanen All rights reserved. @@ -33,193 +33,707 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ffts_trig.h" #include "ffts_dd.h" +/* +* For more information on algorithms: +* +* D. Potts, G. Steidl, M. Tasche, Numerical stability of fast +* trigonometric transforms — a worst case study, +* J. Concrete Appl. Math. 1 (2003) 1–36 +* +* O. Buneman, Stable on–line creation of sines and cosines of +* successive angles, Proc. IEEE 75, 1434 – 1435 (1987). +*/ + +/* An union to initialize doubles using byte presentation, +* and to avoid breaking strict-aliasing rules +*/ + +/* TODO: we need macros to take care endianess */ +typedef union ffts_double { + int32_t i[2]; + double d; +} ffts_double_t; + /* 1/(2*cos(pow(2,-p)*pi)) */ -static const FFTS_ALIGN(16) unsigned int half_secant[132] = { - 0x00000000, 0x3fe00000, 0xc9be45de, 0x3be3bd3c, - 0x00000000, 0x3fe00000, 0xc9be45de, 0x3c03bd3c, - 0x00000000, 0x3fe00000, 0xc9be45de, 0x3c23bd3c, - 0x00000000, 0x3fe00000, 0xc9be45de, 0x3c43bd3c, - 0x00000000, 0x3fe00000, 0xc9be45de, 0x3c63bd3c, - 0x00000000, 0x3fe00000, 0xc9be45df, 0x3c83bd3c, - 0x00000001, 0x3fe00000, 0x4df22efd, 0x3c7de9e6, - 0x00000005, 0x3fe00000, 0x906e8725, 0xbc60b0cd, - 0x00000014, 0x3fe00000, 0x906e8357, 0xbc80b0cd, - 0x0000004f, 0x3fe00000, 0x0dce83c9, 0xbc5619b2, - 0x0000013c, 0x3fe00000, 0x0dc6e79a, 0xbc7619b2, - 0x000004ef, 0x3fe00000, 0xe4af1240, 0x3c83cc9b, - 0x000013bd, 0x3fe00000, 0x2d14c08a, 0x3c7e64df, - 0x00004ef5, 0x3fe00000, 0x47a85465, 0xbc59b20b, - 0x00013bd4, 0x3fe00000, 0xab79c897, 0xbc79b203, - 0x0004ef4f, 0x3fe00000, 0x15019a96, 0x3c79386b, - 0x0013bd3d, 0x3fe00000, 0x7d6dbf4b, 0xbc7b16b7, - 0x004ef4f3, 0x3fe00000, 0xf30832e0, 0x3c741ee4, - 0x013bd3cd, 0x3fe00000, 0xd3bcd4bb, 0xbc83f41e, - 0x04ef4f34, 0x3fe00000, 0xdd75aebb, 0xbc82ef06, - 0x13bd3cde, 0x3fe00000, 0xb2b41b3d, 0x3c52d979, - 0x4ef4f46c, 0x3fe00000, 0x4f0fb458, 0xbc851db3, - 0x3bd3e0e7, 0x3fe00001, 0x8a0ce3f0, 0x3c58dbab, - 0xef507722, 0x3fe00004, 0x2a8ec295, 0x3c83e351, - 0xbd5114f9, 0x3fe00013, 0xc4c0d92d, 0x3c8b3ca4, - 0xf637de7d, 0x3fe0004e, 0xb74de729, 0x3c45974e, - 0xe8190891, 0x3fe0013b, 0x26edf4da, 0xbc814c20, - 0x9436640e, 0x3fe004f0, 0xe2b34b50, 0x3c8091ab, - 0x9c61d971, 0x3fe013d1, 0x6ce01b8e, 0x3c7f7df7, - 0xd17cba53, 0x3fe0503e, 0x74ad7633, 0xbc697609, - 0x7bdb3895, 0x3fe1517a, 0x82f9091b, 0xbc8008d1, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 +static const FFTS_ALIGN(16) ffts_double_t half_secant[66] = { + { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3be3bd3c } }, + { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3c03bd3c } }, + { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3c23bd3c } }, + { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3c43bd3c } }, + { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3c63bd3c } }, + { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45df, 0x3c83bd3c } }, + { { 0x00000001, 0x3fe00000 } }, { { 0x4df22efd, 0x3c7de9e6 } }, + { { 0x00000005, 0x3fe00000 } }, { { 0x906e8725, 0xbc60b0cd } }, + { { 0x00000014, 0x3fe00000 } }, { { 0x906e8357, 0xbc80b0cd } }, + { { 0x0000004f, 0x3fe00000 } }, { { 0x0dce83c9, 0xbc5619b2 } }, + { { 0x0000013c, 0x3fe00000 } }, { { 0x0dc6e79a, 0xbc7619b2 } }, + { { 0x000004ef, 0x3fe00000 } }, { { 0xe4af1240, 0x3c83cc9b } }, + { { 0x000013bd, 0x3fe00000 } }, { { 0x2d14c08a, 0x3c7e64df } }, + { { 0x00004ef5, 0x3fe00000 } }, { { 0x47a85465, 0xbc59b20b } }, + { { 0x00013bd4, 0x3fe00000 } }, { { 0xab79c897, 0xbc79b203 } }, + { { 0x0004ef4f, 0x3fe00000 } }, { { 0x15019a96, 0x3c79386b } }, + { { 0x0013bd3d, 0x3fe00000 } }, { { 0x7d6dbf4b, 0xbc7b16b7 } }, + { { 0x004ef4f3, 0x3fe00000 } }, { { 0xf30832e0, 0x3c741ee4 } }, + { { 0x013bd3cd, 0x3fe00000 } }, { { 0xd3bcd4bb, 0xbc83f41e } }, + { { 0x04ef4f34, 0x3fe00000 } }, { { 0xdd75aebb, 0xbc82ef06 } }, + { { 0x13bd3cde, 0x3fe00000 } }, { { 0xb2b41b3d, 0x3c52d979 } }, + { { 0x4ef4f46c, 0x3fe00000 } }, { { 0x4f0fb458, 0xbc851db3 } }, + { { 0x3bd3e0e7, 0x3fe00001 } }, { { 0x8a0ce3f0, 0x3c58dbab } }, + { { 0xef507722, 0x3fe00004 } }, { { 0x2a8ec295, 0x3c83e351 } }, + { { 0xbd5114f9, 0x3fe00013 } }, { { 0xc4c0d92d, 0x3c8b3ca4 } }, + { { 0xf637de7d, 0x3fe0004e } }, { { 0xb74de729, 0x3c45974e } }, + { { 0xe8190891, 0x3fe0013b } }, { { 0x26edf4da, 0xbc814c20 } }, + { { 0x9436640e, 0x3fe004f0 } }, { { 0xe2b34b50, 0x3c8091ab } }, + { { 0x9c61d971, 0x3fe013d1 } }, { { 0x6ce01b8e, 0x3c7f7df7 } }, + { { 0xd17cba53, 0x3fe0503e } }, { { 0x74ad7633, 0xbc697609 } }, + { { 0x7bdb3895, 0x3fe1517a } }, { { 0x82f9091b, 0xbc8008d1 } }, + { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000 } }, + { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000 } } }; /* cos(pow(2,-p)*pi), sin(pow(2,-p)*pi) */ -static const FFTS_ALIGN(16) unsigned int cos_sin_pi_table[264] = { - 0x00000000, 0x3ff00000, 0x54442d18, 0x3df921fb, - 0xc9be45de, 0xbbf3bd3c, 0xbb77974f, 0x3a91a390, - 0x00000000, 0x3ff00000, 0x54442d18, 0x3e0921fb, - 0xc9be45de, 0xbc13bd3c, 0x54a14928, 0x3aa19bd0, - 0x00000000, 0x3ff00000, 0x54442d18, 0x3e1921fb, - 0xc9be45de, 0xbc33bd3c, 0xb948108a, 0x3ab17cce, - 0x00000000, 0x3ff00000, 0x54442d18, 0x3e2921fb, - 0xc9be45de, 0xbc53bd3c, 0x4be32e14, 0x3ac100c8, - 0x00000000, 0x3ff00000, 0x54442d18, 0x3e3921fb, - 0xc9be45de, 0xbc73bd3c, 0x2c9f4879, 0x3ace215d, - 0xffffffff, 0x3fefffff, 0x54442d18, 0x3e4921fb, - 0x6c837443, 0x3c888586, 0x0005f376, 0x3acd411f, - 0xfffffffe, 0x3fefffff, 0x54442d18, 0x3e5921fb, - 0x4df22ef1, 0xbc8de9e6, 0x9937209e, 0xbaf7b153, - 0xfffffff6, 0x3fefffff, 0x54442d16, 0x3e6921fb, - 0x906e88aa, 0x3c70b0cd, 0xfe19968a, 0xbb03b7c0, - 0xffffffd9, 0x3fefffff, 0x54442d0e, 0x3e7921fb, - 0xdf22ed26, 0xbc8e9e64, 0x8d1b6ffb, 0xbaee8bb4, - 0xffffff62, 0x3fefffff, 0x54442cef, 0x3e8921fb, - 0x0dd18f0f, 0x3c6619b2, 0x7f2b20fb, 0xbb00e133, - 0xfffffd88, 0x3fefffff, 0x54442c73, 0x3e9921fb, - 0x0dd314b2, 0x3c8619b2, 0x619fdf6e, 0xbb174e98, - 0xfffff621, 0x3fefffff, 0x54442a83, 0x3ea921fb, - 0x3764acf5, 0x3c8866c8, 0xf5b2407f, 0xbb388215, - 0xffffd886, 0x3fefffff, 0x544422c2, 0x3eb921fb, - 0x20e7a944, 0xbc8e64df, 0x7b9b9f23, 0x3b5a0961, - 0xffff6216, 0x3fefffff, 0x544403c1, 0x3ec921fb, - 0x52ee25ea, 0x3c69b20e, 0x4df6a86a, 0xbb5999d9, - 0xfffd8858, 0x3fefffff, 0x544387ba, 0x3ed921fb, - 0xd8910ead, 0x3c89b20f, 0x0809d04d, 0x3b77d9db, - 0xfff62162, 0x3fefffff, 0x544197a1, 0x3ee921fb, - 0x438d3925, 0xbc8937a8, 0xa5d27f7a, 0xbb858b02, - 0xffd88586, 0x3fefffff, 0x5439d73a, 0x3ef921fb, - 0x94b3ddd2, 0x3c8b22e4, 0xf8a3b73d, 0xbb863c7f, - 0xff62161a, 0x3fefffff, 0x541ad59e, 0x3f0921fb, - 0x7ea469b2, 0xbc835c13, 0xb8cee262, 0x3bae9860, - 0xfd885867, 0x3fefffff, 0x539ecf31, 0x3f1921fb, - 0x23a32e63, 0xbc77d556, 0xfcd23a30, 0x3b96b111, - 0xf621619c, 0x3fefffff, 0x51aeb57c, 0x3f2921fb, - 0xbbbd8fe6, 0xbc87507d, 0x4916c435, 0xbbca6e1d, - 0xd8858675, 0x3fefffff, 0x49ee4ea6, 0x3f3921fb, - 0x54748eab, 0xbc879f0e, 0x744a453e, 0x3bde894d, - 0x62161a34, 0x3fefffff, 0x2aecb360, 0x3f4921fb, - 0xb1f9b9c4, 0xbc6136dc, 0x7e566b4c, 0x3be87615, - 0x88586ee6, 0x3feffffd, 0xaee6472e, 0x3f5921fa, - 0xf173ae5b, 0x3c81af64, 0x284a9df8, 0xbbfee52e, - 0x21621d02, 0x3feffff6, 0xbecca4ba, 0x3f6921f8, - 0xebc82813, 0xbc76acfc, 0x7bcab5b2, 0x3c02ba40, - 0x858e8a92, 0x3fefffd8, 0xfe670071, 0x3f7921f0, - 0x1883bcf7, 0x3c8359c7, 0xfe6b7a9b, 0x3bfab967, - 0x169b92db, 0x3fefff62, 0xfcdec784, 0x3f8921d1, - 0xc81fbd0d, 0x3c85dda3, 0xbe836d9d, 0x3c29878e, - 0x6084cd0d, 0x3feffd88, 0xf7a3667e, 0x3f992155, - 0x4556e4cb, 0xbc81354d, 0x091a0130, 0xbbfb1d63, - 0xe3796d7e, 0x3feff621, 0xf10dd814, 0x3fa91f65, - 0x2e24aa15, 0xbc6c57bc, 0x0d569a90, 0xbc2912bd, - 0xa3d12526, 0x3fefd88d, 0xbc29b42c, 0x3fb917a6, - 0x378811c7, 0xbc887df6, 0xd26ed688, 0xbc3e2718, - 0xcff75cb0, 0x3fef6297, 0x3c69a60b, 0x3fc8f8b8, - 0x2a361fd3, 0x3c756217, 0xb9ff8d82, 0xbc626d19, - 0xcf328d46, 0x3fed906b, 0xa6aea963, 0x3fd87de2, - 0x10231ac2, 0x3c7457e6, 0xd3d5a610, 0xbc672ced, - 0x667f3bcd, 0x3fe6a09e, 0x667f3bcd, 0x3fe6a09e, - 0x13b26456, 0xbc8bdd34, 0x13b26456, 0xbc8bdd34, - 0x00000000, 0x00000000, 0x00000000, 0x3ff00000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 +static const FFTS_ALIGN(32) ffts_double_t cos_sin_pi_table[132] = { + { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3df921fb } }, + { { 0xc9be45de, 0xbbf3bd3c } }, { { 0xbb77974f, 0x3a91a390 } }, + { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3e0921fb } }, + { { 0xc9be45de, 0xbc13bd3c } }, { { 0x54a14928, 0x3aa19bd0 } }, + { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3e1921fb } }, + { { 0xc9be45de, 0xbc33bd3c } }, { { 0xb948108a, 0x3ab17cce } }, + { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3e2921fb } }, + { { 0xc9be45de, 0xbc53bd3c } }, { { 0x4be32e14, 0x3ac100c8 } }, + { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3e3921fb } }, + { { 0xc9be45de, 0xbc73bd3c } }, { { 0x2c9f4879, 0x3ace215d } }, + { { 0xffffffff, 0x3fefffff } }, { { 0x54442d18, 0x3e4921fb } }, + { { 0x6c837443, 0x3c888586 } }, { { 0x0005f376, 0x3acd411f } }, + { { 0xfffffffe, 0x3fefffff } }, { { 0x54442d18, 0x3e5921fb } }, + { { 0x4df22ef1, 0xbc8de9e6 } }, { { 0x9937209e, 0xbaf7b153 } }, + { { 0xfffffff6, 0x3fefffff } }, { { 0x54442d16, 0x3e6921fb } }, + { { 0x906e88aa, 0x3c70b0cd } }, { { 0xfe19968a, 0xbb03b7c0 } }, + { { 0xffffffd9, 0x3fefffff } }, { { 0x54442d0e, 0x3e7921fb } }, + { { 0xdf22ed26, 0xbc8e9e64 } }, { { 0x8d1b6ffb, 0xbaee8bb4 } }, + { { 0xffffff62, 0x3fefffff } }, { { 0x54442cef, 0x3e8921fb } }, + { { 0x0dd18f0f, 0x3c6619b2 } }, { { 0x7f2b20fb, 0xbb00e133 } }, + { { 0xfffffd88, 0x3fefffff } }, { { 0x54442c73, 0x3e9921fb } }, + { { 0x0dd314b2, 0x3c8619b2 } }, { { 0x619fdf6e, 0xbb174e98 } }, + { { 0xfffff621, 0x3fefffff } }, { { 0x54442a83, 0x3ea921fb } }, + { { 0x3764acf5, 0x3c8866c8 } }, { { 0xf5b2407f, 0xbb388215 } }, + { { 0xffffd886, 0x3fefffff } }, { { 0x544422c2, 0x3eb921fb } }, + { { 0x20e7a944, 0xbc8e64df } }, { { 0x7b9b9f23, 0x3b5a0961 } }, + { { 0xffff6216, 0x3fefffff } }, { { 0x544403c1, 0x3ec921fb } }, + { { 0x52ee25ea, 0x3c69b20e } }, { { 0x4df6a86a, 0xbb5999d9 } }, + { { 0xfffd8858, 0x3fefffff } }, { { 0x544387ba, 0x3ed921fb } }, + { { 0xd8910ead, 0x3c89b20f } }, { { 0x0809d04d, 0x3b77d9db } }, + { { 0xfff62162, 0x3fefffff } }, { { 0x544197a1, 0x3ee921fb } }, + { { 0x438d3925, 0xbc8937a8 } }, { { 0xa5d27f7a, 0xbb858b02 } }, + { { 0xffd88586, 0x3fefffff } }, { { 0x5439d73a, 0x3ef921fb } }, + { { 0x94b3ddd2, 0x3c8b22e4 } }, { { 0xf8a3b73d, 0xbb863c7f } }, + { { 0xff62161a, 0x3fefffff } }, { { 0x541ad59e, 0x3f0921fb } }, + { { 0x7ea469b2, 0xbc835c13 } }, { { 0xb8cee262, 0x3bae9860 } }, + { { 0xfd885867, 0x3fefffff } }, { { 0x539ecf31, 0x3f1921fb } }, + { { 0x23a32e63, 0xbc77d556 } }, { { 0xfcd23a30, 0x3b96b111 } }, + { { 0xf621619c, 0x3fefffff } }, { { 0x51aeb57c, 0x3f2921fb } }, + { { 0xbbbd8fe6, 0xbc87507d } }, { { 0x4916c435, 0xbbca6e1d } }, + { { 0xd8858675, 0x3fefffff } }, { { 0x49ee4ea6, 0x3f3921fb } }, + { { 0x54748eab, 0xbc879f0e } }, { { 0x744a453e, 0x3bde894d } }, + { { 0x62161a34, 0x3fefffff } }, { { 0x2aecb360, 0x3f4921fb } }, + { { 0xb1f9b9c4, 0xbc6136dc } }, { { 0x7e566b4c, 0x3be87615 } }, + { { 0x88586ee6, 0x3feffffd } }, { { 0xaee6472e, 0x3f5921fa } }, + { { 0xf173ae5b, 0x3c81af64 } }, { { 0x284a9df8, 0xbbfee52e } }, + { { 0x21621d02, 0x3feffff6 } }, { { 0xbecca4ba, 0x3f6921f8 } }, + { { 0xebc82813, 0xbc76acfc } }, { { 0x7bcab5b2, 0x3c02ba40 } }, + { { 0x858e8a92, 0x3fefffd8 } }, { { 0xfe670071, 0x3f7921f0 } }, + { { 0x1883bcf7, 0x3c8359c7 } }, { { 0xfe6b7a9b, 0x3bfab967 } }, + { { 0x169b92db, 0x3fefff62 } }, { { 0xfcdec784, 0x3f8921d1 } }, + { { 0xc81fbd0d, 0x3c85dda3 } }, { { 0xbe836d9d, 0x3c29878e } }, + { { 0x6084cd0d, 0x3feffd88 } }, { { 0xf7a3667e, 0x3f992155 } }, + { { 0x4556e4cb, 0xbc81354d } }, { { 0x091a0130, 0xbbfb1d63 } }, + { { 0xe3796d7e, 0x3feff621 } }, { { 0xf10dd814, 0x3fa91f65 } }, + { { 0x2e24aa15, 0xbc6c57bc } }, { { 0x0d569a90, 0xbc2912bd } }, + { { 0xa3d12526, 0x3fefd88d } }, { { 0xbc29b42c, 0x3fb917a6 } }, + { { 0x378811c7, 0xbc887df6 } }, { { 0xd26ed688, 0xbc3e2718 } }, + { { 0xcff75cb0, 0x3fef6297 } }, { { 0x3c69a60b, 0x3fc8f8b8 } }, + { { 0x2a361fd3, 0x3c756217 } }, { { 0xb9ff8d82, 0xbc626d19 } }, + { { 0xcf328d46, 0x3fed906b } }, { { 0xa6aea963, 0x3fd87de2 } }, + { { 0x10231ac2, 0x3c7457e6 } }, { { 0xd3d5a610, 0xbc672ced } }, + { { 0x667f3bcd, 0x3fe6a09e } }, { { 0x667f3bcd, 0x3fe6a09e } }, + { { 0x13b26456, 0xbc8bdd34 } }, { { 0x13b26456, 0xbc8bdd34 } }, + { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x3ff00000 } }, + { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000 } } +}; + +#define COS_SIN_TABLE_SIZE 260 + +/* cos(pi*k/256), sin(pi*k/256) */ +static const FFTS_ALIGN(32) ffts_double_t cos_sin_table[COS_SIN_TABLE_SIZE] = { + { { 0x00000000, 0x3FF00000 } }, { { 0x00000000, 0x00000000 } }, + { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000 } }, + { { 0x169B92DB, 0x3FEFFF62 } }, { { 0xFCDEC784, 0x3F8921D1 } }, + { { 0xC81FBD0D, 0x3C85DDA3 } }, { { 0xBE836D9D, 0x3C29878E } }, + { { 0x6084CD0D, 0x3FEFFD88 } }, { { 0xF7A3667E, 0x3F992155 } }, + { { 0x4556E4CB, 0xBC81354D } }, { { 0x091A0130, 0xBBFB1D63 } }, + { { 0xEFFEF75D, 0x3FEFFA72 } }, { { 0x759455CD, 0x3FA2D865 } }, + { { 0xCDB25956, 0xBC88B4CD } }, { { 0x5BA93AC0, 0x3C2686F6 } }, + { { 0xE3796D7E, 0x3FEFF621 } }, { { 0xF10DD814, 0x3FA91F65 } }, + { { 0x2E24AA15, 0xBC6C57BC } }, { { 0x0D569A90, 0xBC2912BD } }, + { { 0x658E71AD, 0x3FEFF095 } }, { { 0x79F820E0, 0x3FAF656E } }, + { { 0xE18A4B9E, 0x3C801A8C } }, { { 0xE392BFFE, 0xBC22E1EB } }, + { { 0xAD01883A, 0x3FEFE9CD } }, { { 0x92CE19F6, 0x3FB2D520 } }, + { { 0xD0C67E35, 0x3C6521EC } }, { { 0xA8BF6B2C, 0xBC49A088 } }, + { { 0xFCBD5B09, 0x3FEFE1CA } }, { { 0x0A9AA419, 0x3FB5F6D0 } }, + { { 0x202A884E, 0x3C6A23E3 } }, { { 0xD03F6C9A, 0xBC4F4022 } }, + { { 0xA3D12526, 0x3FEFD88D } }, { { 0xBC29B42C, 0x3FB917A6 } }, + { { 0x378811C7, 0xBC887DF6 } }, { { 0xD26ED688, 0xBC3E2718 } }, + { { 0xFD6DA67B, 0x3FEFCE15 } }, { { 0xC79EC2D5, 0x3FBC3785 } }, + { { 0x830D4C09, 0xBC75DD6F } }, { { 0xF133FB21, 0xBC24F39D } }, + { { 0x70E19FD3, 0x3FEFC264 } }, { { 0x56A9730E, 0x3FBF564E } }, + { { 0x68ECACEE, 0x3C81EC86 } }, { { 0x729AE56D, 0x3C4A2704 } }, + { { 0x7195D741, 0x3FEFB579 } }, { { 0xCEDAF577, 0x3FC139F0 } }, + { { 0x7397CC08, 0x3C71BFAC } }, { { 0x4D1B3CFA, 0xBC652343 } }, + { { 0x7F08A517, 0x3FEFA755 } }, { { 0x6E8E613A, 0x3FC2C810 } }, + { { 0xCA13571F, 0xBC87A0A8 } }, { { 0xA89A11E0, 0x3C513000 } }, + { { 0x24C9099B, 0x3FEF97F9 } }, { { 0xB1293E5A, 0x3FC45576 } }, + { { 0xEEA5963B, 0xBC8E2AE0 } }, { { 0x4119F7B1, 0xBC5285A2 } }, + { { 0xFA714BA9, 0x3FEF8764 } }, { { 0x448B3FC6, 0x3FC5E214 } }, + { { 0x778FFCB6, 0x3C7AB256 } }, { { 0x779DDAC6, 0x3C6531FF } }, + { { 0xA3A12077, 0x3FEF7599 } }, { { 0xDE50BF31, 0x3FC76DD9 } }, + { { 0xD743195C, 0x3C884F31 } }, { { 0xEC501B2F, 0x3C61D5EE } }, + { { 0xCFF75CB0, 0x3FEF6297 } }, { { 0x3C69A60B, 0x3FC8F8B8 } }, + { { 0x2A361FD3, 0x3C756217 } }, { { 0xB9FF8D82, 0xBC626D19 } }, + { { 0x3B0B2F2D, 0x3FEF4E60 } }, { { 0x25B00451, 0x3FCA82A0 } }, + { { 0xE695AC05, 0xBC78EE01 } }, { { 0xFFD084AD, 0xBC687905 } }, + { { 0xAC64E589, 0x3FEF38F3 } }, { { 0x6A7E4F63, 0x3FCC0B82 } }, + { { 0xB51F72E6, 0xBC7D7BAF } }, { { 0x9E521935, 0xBC1AF143 } }, + { { 0xF7763ADA, 0x3FEF2252 } }, { { 0xE5454311, 0x3FCD934F } }, + { { 0x1C8D94AB, 0xBC820CB8 } }, { { 0x277107AD, 0x3C675B92 } }, + { { 0xFB9230D7, 0x3FEF0A7E } }, { { 0x7B215F1B, 0x3FCF19F9 } }, + { { 0xDC6B4989, 0x3C752C7A } }, { { 0xF11DA2C4, 0xBC642DEE } }, + { { 0xA3E473C2, 0x3FEEF178 } }, { { 0x0E37FDAE, 0x3FD04FB8 } }, + { { 0x67FE774F, 0x3C86310A } }, { { 0xB72583CC, 0xBC0412CD } }, + { { 0xE7684963, 0x3FEED740 } }, { { 0x62B1F677, 0x3FD111D2 } }, + { { 0x91F59CC2, 0x3C7E82C7 } }, { { 0x0AB7AA9A, 0x3C7824C2 } }, + { { 0xC8DF0B74, 0x3FEEBBD8 } }, { { 0x3F4CDB3E, 0x3FD1D344 } }, + { { 0x615E7277, 0x3C7C6C8C } }, { { 0x1C13519E, 0xBC6720D4 } }, + { { 0x56C62DDA, 0x3FEE9F41 } }, { { 0x2ED59F06, 0x3FD29406 } }, + { { 0xE2E3F81E, 0x3C8760B1 } }, { { 0xA2C4612D, 0xBC75D28D } }, + { { 0xAB4CD10D, 0x3FEE817B } }, { { 0xC2E18152, 0x3FD35410 } }, + { { 0x686B5E0A, 0xBC7D0AFE } }, { { 0x2F96E062, 0xBC73CB00 } }, + { { 0xEC48E112, 0x3FEE6288 } }, { { 0x94176601, 0x3FD4135C } }, + { { 0xF2847754, 0xBC616B56 } }, { { 0x4AFA2518, 0x3C70C97C } }, + { { 0x4B2BC17E, 0x3FEE426A } }, { { 0x4278E76A, 0x3FD4D1E2 } }, + { { 0x89744882, 0x3C8A8738 } }, { { 0x18792858, 0x3C624172 } }, + { { 0x04F686E5, 0x3FEE2121 } }, { { 0x75AB1FDD, 0x3FD58F9A } }, + { { 0x6C126527, 0xBC8014C7 } }, { { 0xD58CF620, 0xBC1EFDC0 } }, + { { 0x622DBE2B, 0x3FEDFEAE } }, { { 0xDD3F27C6, 0x3FD64C7D } }, + { { 0x88425567, 0xBC8514EA } }, { { 0x4A664121, 0x3C510D2B } }, + { { 0xB6CCC23C, 0x3FEDDB13 } }, { { 0x30FA459F, 0x3FD70885 } }, + { { 0xC6107DB3, 0x3C883C37 } }, { { 0xE0864C5D, 0xBC744B19 } }, + { { 0x6238A09B, 0x3FEDB652 } }, { { 0x311DCCE7, 0x3FD7C3A9 } }, + { { 0xEAE69460, 0xBC7ADEE7 } }, { { 0x1EF3E8D9, 0x3C19A3F2 } }, + { { 0xCF328D46, 0x3FED906B } }, { { 0xA6AEA963, 0x3FD87DE2 } }, + { { 0x10231AC2, 0x3C7457E6 } }, { { 0xD3D5A610, 0xBC672CED } }, + { { 0x73C9E68B, 0x3FED6961 } }, { { 0x63BC93D7, 0x3FD9372A } }, + { { 0xC6393D55, 0xBC7E8C61 } }, { { 0x9E5AD5B1, 0x3C668431 } }, + { { 0xD14DC93A, 0x3FED4134 } }, { { 0x43A8ED8A, 0x3FD9EF79 } }, + { { 0x95D25AF2, 0xBC84EF52 } }, { { 0x290BDBAB, 0x3C66DA81 } }, + { { 0x743E35DC, 0x3FED17E7 } }, { { 0x2B6D3FCA, 0x3FDAA6C8 } }, + { { 0x3540130A, 0xBC5101DA } }, { { 0x6EE5CCF7, 0xBC7D5F10 } }, + { { 0xF43CC773, 0x3FECED7A } }, { { 0x09E15CC0, 0x3FDB5D10 } }, + { { 0xB5AB58AE, 0xBC5E7B6B } }, { { 0xCB974183, 0x3C65B362 } }, + { { 0xF3FCFC5C, 0x3FECC1F0 } }, { { 0xD8011EE7, 0x3FDC1249 } }, + { { 0x3B68F6AB, 0x3C7E5761 } }, { { 0xBB515206, 0xBC7813AA } }, + { { 0x213411F5, 0x3FEC954B } }, { { 0x9931C45E, 0x3FDCC66E } }, + { { 0x1E946603, 0xBC52FB76 } }, { { 0x59C37F8F, 0x3C56850E } }, + { { 0x3488739B, 0x3FEC678B } }, { { 0x5B86E389, 0x3FDD7977 } }, + { { 0xC7C5FF5B, 0x3C6D86CA } }, { { 0x87BC0575, 0x3C7550EC } }, + { { 0xF180BDB1, 0x3FEC38B2 } }, { { 0x3806F63B, 0x3FDE2B5D } }, + { { 0x757C8D07, 0xBC76E0B1 } }, { { 0x1D3C6841, 0x3C5E0D89 } }, + { { 0x26725549, 0x3FEC08C4 } }, { { 0x52EF78D6, 0x3FDEDC19 } }, + { { 0xD80E2946, 0x3C5B157F } }, { { 0xC33EDEE6, 0xBC7DD0F7 } }, + { { 0xAC6F952A, 0x3FEBD7C0 } }, { { 0xDBF89ABA, 0x3FDF8BA4 } }, + { { 0x32AC700A, 0xBC8825A7 } }, { { 0xC1B776B8, 0xBC32EC1F } }, + { { 0x673590D2, 0x3FEBA5AA } }, { { 0x874C3EB7, 0x3FE01CFC } }, + { { 0x370753B6, 0x3C87EA4E } }, { { 0xE7C2368C, 0xBC734A35 } }, + { { 0x45196E3E, 0x3FEB7283 } }, { { 0x9922FFEE, 0x3FE07387 } }, + { { 0x324E6D61, 0xBC8BC69F } }, { { 0x4347406C, 0xBC8A5A01 } }, + { { 0x3EF55712, 0x3FEB3E4D } }, { { 0x4D5D898F, 0x3FE0C970 } }, + { { 0xBF11A493, 0xBC8EB6B8 } }, { { 0xDE6EE9B2, 0xBC88D3D7 } }, + { { 0x58150200, 0x3FEB090A } }, { { 0x541B4B23, 0x3FE11EB3 } }, + { { 0x300FFCCE, 0xBC8926DA } }, { { 0x69ABE4F1, 0xBC8EF23B } }, + { { 0x9E21D511, 0x3FEAD2BC } }, { { 0x63DEDB49, 0x3FE1734D } }, + { { 0x07BEA548, 0xBC847FBE } }, { { 0xCCC50575, 0xBC87EEF2 } }, + { { 0x290EA1A3, 0x3FEA9B66 } }, { { 0x39AE68C8, 0x3FE1C73B } }, + { { 0xE8B6DAC8, 0x3C39F630 } }, { { 0x267F6600, 0x3C8B25DD } }, + { { 0x1B02FAE2, 0x3FEA6309 } }, { { 0x9933EB59, 0x3FE21A79 } }, + { { 0x52248D10, 0xBC7E9111 } }, { { 0x77C68FB2, 0xBC83A7B1 } }, + { { 0xA0462782, 0x3FEA29A7 } }, { { 0x4CDD12DF, 0x3FE26D05 } }, + { { 0x015DF175, 0xBC7128BB } }, { { 0x3EF3770C, 0xBC85DA74 } }, + { { 0xEF29AF94, 0x3FE9EF43 } }, { { 0x25FAF3EA, 0x3FE2BEDB } }, + { { 0xB60445C2, 0x3C7B1DFC } }, { { 0xC796EE46, 0xBC514981 } }, + { { 0x47F38741, 0x3FE9B3E0 } }, { { 0xFCE17035, 0x3FE30FF7 } }, + { { 0x86712474, 0xBC830EE2 } }, { { 0x26F74A6F, 0xBC6EFCC6 } }, + { { 0xF4C7D742, 0x3FE9777E } }, { { 0xB10659F3, 0x3FE36058 } }, + { { 0xA240665E, 0xBC815479 } }, { { 0xA35857E7, 0xBC81FCB3 } }, + { { 0x499263FB, 0x3FE93A22 } }, { { 0x292050B9, 0x3FE3AFFA } }, + { { 0xA920DF0B, 0x3C83D419 } }, { { 0xE3954964, 0x3C7E3E25 } }, + { { 0xA3EF940D, 0x3FE8FBCC } }, { { 0x534556D4, 0x3FE3FED9 } }, + { { 0x9C86F2F1, 0xBC66DFA9 } }, { { 0x608C5061, 0x3C836916 } }, + { { 0x6B151741, 0x3FE8BC80 } }, { { 0x25091DD6, 0x3FE44CF3 } }, + { { 0x2ED1336D, 0xBC82C5E1 } }, { { 0x2CFDC6B3, 0x3C68076A } }, + { { 0x0FBA2EBF, 0x3FE87C40 } }, { { 0x9B9B0939, 0x3FE49A44 } }, + { { 0x0C3F64CD, 0xBC82DABC } }, { { 0x6D719B94, 0xBC827EE1 } }, + { { 0x0BFF976E, 0x3FE83B0E } }, { { 0xBBE3E5E9, 0x3FE4E6CA } }, + { { 0xF8EA3475, 0xBC76F420 } }, { { 0xEDCEB327, 0x3C63C293 } }, + { { 0xE3571771, 0x3FE7F8EC } }, { { 0x92A35596, 0x3FE53282 } }, + { { 0xCE93C917, 0xBC89C8D8 } }, { { 0x89DA0257, 0xBC7A12EB } }, + { { 0x226AAFAF, 0x3FE7B5DF } }, { { 0x348CECA0, 0x3FE57D69 } }, + { { 0xACDF0AD7, 0xBC70F537 } }, { { 0x992BFBB2, 0xBC875720 } }, + { { 0x5F037261, 0x3FE771E7 } }, { { 0xBE65018C, 0x3FE5C77B } }, + { { 0x8D84068F, 0x3C75CFCE } }, { { 0x9C0BC32A, 0x3C8069EA } }, + { { 0x37EFFF96, 0x3FE72D08 } }, { { 0x551D2CDF, 0x3FE610B7 } }, + { { 0x0F1D915C, 0x3C80D4EF } }, { { 0x52FF2A37, 0xBC7251B3 } }, + { { 0x54EAA8AF, 0x3FE6E744 } }, { { 0x25F0783D, 0x3FE65919 } }, + { { 0xC84E226E, 0xBC8DBC03 } }, { { 0xFBF5DE23, 0x3C8C3D64 } }, + { { 0x667F3BCD, 0x3FE6A09E } }, { { 0x667F3BCD, 0x3FE6A09E } }, + { { 0x13B26456, 0xBC8BDD34 } }, { { 0x13B26456, 0xBC8BDD34 } } }; +/* cos(pi * x), x=[0;1/512] */ +static const FFTS_ALIGN(16) ffts_double_t cos_coeff[3] = { + { { 0xC9BE45DE, 0xC013BD3C } }, + { { 0x081749FA, 0x40103C1F } }, + { { 0x047EE98B, 0xBFF55D10 } } +}; + +/* sin(pi * x), x=[0;1/512] */ +static const FFTS_ALIGN(16) ffts_double_t sin_coeff[4] = { + { { 0x54442D18, 0x400921FB } }, + { { 0xE62154CA, 0xC014ABBC } }, + { { 0xCEF16BFE, 0x40046675 } }, + { { 0xADE54A87, 0x40339228 } } +}; + +#ifndef M_1_256 +#define M_1_256 3.90625e-3 +#endif + +static int +ffts_cexp_32f64f(size_t n, size_t d, double *out); + +/* calculate cos(pi * n / d) and sin(pi * n / d) with maximum error less than 1 ULP, average ~0.5 ULP */ int -ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, int table_size) +ffts_cexp_32f(size_t n, size_t d, float *output) { - double alpha, beta; - double c[2], s[2]; - double x, z; - int i; + double FFTS_ALIGN(16) z[2]; - if (!table || !table_size) { + if (!d || !output) return -1; + + /* reduction */ + if (FFTS_UNLIKELY(n >= d)) + n %= d; + + ffts_cexp_32f64f(n, d, z); + + output[0] = (float) z[0]; + output[1] = (float) z[1]; + return 0; +} + +/* used as intermediate result for single precision calculations */ +static int +ffts_cexp_32f64f(size_t n, size_t d, double *output) +{ + const ffts_double_t *ct = (const ffts_double_t*) FFTS_ASSUME_ALIGNED_32(cos_sin_table); + const ffts_double_t *cc = (const ffts_double_t*) FFTS_ASSUME_ALIGNED_16(cos_coeff); + const ffts_double_t *sc = (const ffts_double_t*) FFTS_ASSUME_ALIGNED_16(sin_coeff); + double *out = FFTS_ASSUME_ALIGNED_16(output); + double c, s, cos_a, cos_b, sin_a, sin_b; + double cos_sign, sin_sign, sign, x, z; + int i, j, swap; + + /* we know this */ + FFTS_ASSUME(d > 0); + FFTS_ASSUME(n < d); + + /* determinate octant */ + if (n > d - n) { + sin_sign = -1.0; + n = d - n; + } else { + sin_sign = 1.0; } - /* the first */ - table[0][0] = 1.0f; - table[0][1] = -0.0f; + n <<= 1; + if (n > d - n) { + cos_sign = -1.0; + swap = 1; + n += n - d; + } else { + cos_sign = 1.0; + swap = 0; + n <<= 1; + } - if (FFTS_UNLIKELY(table_size == 1)) { - goto exit; + if (n > d - n) { + swap ^= 1; + n = d - n; } - if (FFTS_UNLIKELY(table_size == 2)) { - /* skip over */ - i = 1; - goto mid_point; + /* "binary long division" */ + for (i = 0, j = (1 << 5), n <<= 1; j && n; j >>= 1) { + if (n > d - n) { + n += n - d; + i += j; + } else { + n <<= 1; + } + } + + /* decide between two table values */ + if (n > d - n) { + i++; + n = d - n; + sign = -1.0; + } else { + sign = 1.0; } - /* polynomial approximations calculated using Sollya */ - x = 1.0 / table_size; + /* divide by 256 is exact (as is the multiply with its reciprocal) */ + x = ((double) n / d) * M_1_256; + + /* 0 <= x <= 1/512 */ z = x * x; - /* alpha = 2 * sin(M_PI_4 / m) * sin(M_PI_4 / m) */ - alpha = x * (1.1107207345394952717884501203293686870741139540138 + - z * (-0.114191397993514079911985272577099412137126013186879 + - z * 3.52164670852685621720746817665316575239342815885835e-3)); - alpha = alpha * alpha; + /* table lookup */ + cos_a = ct[4 * i + 0].d; + sin_a = ct[4 * i + 2].d; - /* beta = sin(M_PI_2 / m) */ - beta = x * (1.57079632679489455959753740899031981825828552246094 + - z * (-0.64596409735041482313988581154262647032737731933593 + - z * 7.9690915468332887416913479228242067620158195495605e-2)); + /* evaluate polynomials */ + cos_b = 1.0 + z * (cc[0].d + z * (cc[1].d + z * cc[2].d)); + sin_b = x * (sc[0].d + z * (sc[1].d + z * (sc[2].d + z * sc[3].d))); - /* cos(0) = 1.0, sin(0) = 0.0 */ - c[0] = 1.0; - s[0] = 0.0; + /* sum or difference of angles */ + c = cos_a * cos_b - sign * sin_a * sin_b; + s = sin_a * cos_b + sign * cos_a * sin_b; + + if (swap) { + double tmp = c; + c = s; + s = tmp; + } + + out[0] = cos_sign * c; + out[1] = sin_sign * s; + return 0; +} + +int +ffts_generate_chirp_32f(ffts_cpx_32f *const table, size_t table_size) +{ + ffts_cpx_32f *lut; + size_t i, j, n; - /* generate sine and cosine tables with maximum error less than 1 ULP */ - for (i = 1; i < (table_size + 1)/2; i++) { - c[1] = c[0] - ((alpha * c[0]) + (beta * s[0])); - s[1] = s[0] - ((alpha * s[0]) - (beta * c[0])); + if (!table || !table_size) + return -1; + + n = 2 * table_size; + lut = ffts_aligned_malloc(n * sizeof(*lut)); + if (!lut) + return -1; - table[i + 0][0] = (float) c[1]; - table[i + 0][1] = (float) -s[1]; - table[table_size - i][0] = (float) s[1]; - table[table_size - i][1] = (float) -c[1]; + /* initialize LUT */ + ffts_generate_cosine_sine_32f(lut, n); - c[0] = c[1]; - s[0] = s[1]; + /* generate CZT sequence */ + for (i = 0, j = 0; i < table_size; ++i) { + table[i][0] = lut[j][0]; + table[i][1] = lut[j][1]; + + j += 2 * i + 1; + if (j >= n) + j -= n; } - if (FFTS_UNLIKELY(table_size & 1)) { + ffts_aligned_free(lut); + return 0; +} + +/* generate cosine and sine tables with maximum error less than 1 ULP, average ~0.5 ULP +* using repeated subvector scaling algorithm, 16 - 20 times faster than +* direct library calling algorithm. +*/ +int +ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, size_t table_size) +{ + ffts_cpx_64f *const tmp = (ffts_cpx_64f *const) table; + double FFTS_ALIGN(16) z[2], zz[2], x[2], xx[2]; + size_t i, j, k, len; + + if (!table || !table_size) + return -1; + + if (FFTS_UNLIKELY(table_size == 1)) goto exit; - } -mid_point: - table[i][0] = 0.70710677f; - table[i][1] = -0.70710677f; + /* check if table size is a power of two */ + if (!(table_size & (table_size - 1))) + return ffts_generate_cosine_sine_pow2_32f(table, table_size); + + if (!(table_size & 1)) { + /* even table size -- check if multiply of four */ + if (!(table_size & 3)) { + /* multiply of four */ + len = table_size >> 2; + for (j = 1; 4 * j <= len; j <<= 1) { + ffts_cexp_32f64f(j, table_size, z); + + tmp[j][0] = z[0]; + tmp[j][1] = z[1]; + + tmp[len - j][0] = z[1]; + tmp[len - j][1] = z[0]; + + for (i = 1; i < j; i++) { + zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1]; + zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1]; + + tmp[j + i][0] = zz[0]; + tmp[j + i][1] = zz[1]; + + tmp[len - j - i][0] = zz[1]; + tmp[len - j - i][1] = zz[0]; + } + } + + /* this loops zero or one times */ + for (k = j << 1; k <= len; j <<= 1) { + ffts_cexp_32f64f(j, table_size, z); + + tmp[j][0] = z[0]; + tmp[j][1] = z[1]; + if (k++ == len) + break; + + tmp[len - j][0] = z[1]; + tmp[len - j][1] = z[0]; + if (k++ == len) + break; + + for (i = 1; i < j; i++) { + zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1]; + zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1]; + + tmp[j + i][0] = zz[0]; + tmp[j + i][1] = zz[1]; + if (k++ == len) + break; + + tmp[len - j - i][0] = zz[1]; + tmp[len - j - i][1] = zz[0]; + if (k++ == len) + break; + } + } + + /* convert doubles to floats */ + for (i = 1; i < len; i++) { + table[i][0] = (float) tmp[i][0]; + table[i][1] = (float) tmp[i][1]; + } + + table[len][0] = 0.0f; + table[len][1] = 1.0f; + + for (i = 1; i <= len; i++) { + table[len + i][0] = -table[i][1]; + table[len + i][1] = table[i][0]; + } + } else { + /* multiply of two */ + len = table_size >> 1; + for (j = 1; 4 * j <= len; j <<= 1) { + ffts_cexp_32f64f(j, table_size, z); + + tmp[j][0] = z[0]; + tmp[j][1] = z[1]; + + tmp[len - j][0] = -z[0]; + tmp[len - j][1] = z[1]; + + for (i = 1; i < j; i++) { + zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1]; + zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1]; + + tmp[j + i][0] = zz[0]; + tmp[j + i][1] = zz[1]; + + tmp[len - j - i][0] = -zz[0]; + tmp[len - j - i][1] = zz[1]; + } + } + + /* this loops zero or one times */ + for (k = j << 1; k <= len; j <<= 1) { + ffts_cexp_32f64f(j, table_size, z); + + tmp[j][0] = z[0]; + tmp[j][1] = z[1]; + if (k++ == len) + break; + + tmp[len - j][0] = -z[0]; + tmp[len - j][1] = z[1]; + if (k++ == len) + break; + + for (i = 1; i < j; i++) { + zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1]; + zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1]; + + tmp[j + i][0] = zz[0]; + tmp[j + i][1] = zz[1]; + if (k++ == len) + break; + + tmp[len - j - i][0] = -zz[0]; + tmp[len - j - i][1] = zz[1]; + if (k++ == len) + break; + } + } + + /* convert doubles to floats */ + for (i = 1; i < len; i++) { + table[i][0] = (float) tmp[i][0]; + table[i][1] = (float) tmp[i][1]; + } + + table[len][0] = -1.0f; + table[len][1] = 0.0f; + } + + /* duplicate lower half to higher */ + len = table_size >> 1; + for (i = 1; i < len; i++) { + table[table_size - i][0] = table[i][0]; + table[table_size - i][1] = -table[i][1]; + } + } else { + /* odd table size */ + + /* to avoid using temporary tables, generate the first 1/8 of table in + * double precision on lower half (and using the symmetry store + * the last 1/8 of table in single precision on higher half) + */ + for (j = 1; 8 * j < table_size; j <<= 1) { + ffts_cexp_32f64f(j, table_size, z); + + /* store double precision to lower half */ + tmp[j][0] = z[0]; + tmp[j][1] = z[1]; + + /* store single precision to higher half */ + table[table_size - j][0] = (float) z[0]; + table[table_size - j][1] = (float) -z[1]; + + for (i = 1; i < j; i++) { + /* use double precision for intermediate calculations */ + zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1]; + zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1]; + + tmp[i + j][0] = zz[0]; + tmp[i + j][1] = zz[1]; + + table[table_size - i - j][0] = (float) zz[0]; + table[table_size - i - j][1] = (float) -zz[1]; + } + } + + /* now generate 1/2 of table in single precision on higher half */ + k = j << 1; + ffts_cexp_32f64f(j, table_size, z); + ffts_cexp_32f64f(k, table_size, x); + + /* store single precision to higher half */ + table[table_size - j][0] = (float) z[0]; + table[table_size - j][1] = (float) -z[1]; + + table[table_size - k][0] = (float) x[0]; + table[table_size - k][1] = (float) -x[1]; + + i = 1; + len = ((table_size + 1) >> 1) - k; + if (len > j) { + len -= j; + + xx[0] = x[0] * z[0] - x[1] * z[1]; + xx[1] = x[1] * z[0] + x[0] * z[1]; + + table[table_size - k - j][0] = (float) xx[0]; + table[table_size - k - j][1] = (float) -xx[1]; + + for (; i < len; i++) { + zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1]; + zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1]; + + table[table_size - i - j][0] = (float) zz[0]; + table[table_size - i - j][1] = (float) -zz[1]; + + xx[0] = x[0] * tmp[i][0] - x[1] * tmp[i][1]; + xx[1] = x[1] * tmp[i][0] + x[0] * tmp[i][1]; + + table[table_size - i - k][0] = (float) xx[0]; + table[table_size - i - k][1] = (float) -xx[1]; + + xx[0] = x[0] * zz[0] - x[1] * zz[1]; + xx[1] = x[1] * zz[0] + x[0] * zz[1]; + + table[table_size - i - k - j][0] = (float) xx[0]; + table[table_size - i - k - j][1] = (float) -xx[1]; + } + + len = j; + } + + for (; i < len; i++) { + zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1]; + zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1]; + + table[table_size - i - j][0] = (float) zz[0]; + table[table_size - i - j][1] = (float) -zz[1]; + + xx[0] = x[0] * tmp[i][0] - x[1] * tmp[i][1]; + xx[1] = x[1] * tmp[i][0] + x[0] * tmp[i][1]; + + table[table_size - i - k][0] = (float) xx[0]; + table[table_size - i - k][1] = (float) -xx[1]; + } + + for (; i < j; i++) { + zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1]; + zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1]; + + table[table_size - i - j][0] = (float) zz[0]; + table[table_size - i - j][1] = (float) -zz[1]; + } + + /* duplicate higher half to lower */ + len = table_size >> 1; + for (i = 1; i <= len; i++) { + table[i][0] = table[table_size - i][0]; + table[i][1] = -table[table_size - i][1]; + } + } exit: + /* cos(0) = 1.0, sin(0) = 0.0 */ + table[0][0] = 1.0f; + table[0][1] = 0.0f; return 0; } /* Oscar Buneman's method for generating a sequence of sines and cosines. * Expired US Patent 4,878,187 A -* -* D. Potts, G. Steidl, M. Tasche, Numerical stability of fast -* trigonometric transforms — a worst case study, -* J. Concrete Appl. Math. 1 (2003) 1–36 -* -* O. Buneman, Stable on–line creation of sines and cosines of -* successive angles, Proc. IEEE 75, 1434 – 1435 (1987). */ #if HAVE_SSE2 int @@ -227,10 +741,11 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size) { static const __m128d sign_swap = { 0.0, -0.0 }; const __m128d *FFTS_RESTRICT ct; - const double *FFTS_RESTRICT hs; + const ffts_double_t *FFTS_RESTRICT cst; + const ffts_double_t *FFTS_RESTRICT hs; __m128d FFTS_ALIGN(16) w[32]; __m128d FFTS_ALIGN(16) h[32]; - int i, log_2, offset; + int i, log_2, offset, step; /* size must be a power of two */ if (!table || !table_size || (table_size & (table_size - 1))) { @@ -251,21 +766,42 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size) goto mid_point; } + cst = (const ffts_double_t*) + FFTS_ASSUME_ALIGNED_32(&cos_sin_table); + + /* generate small tables from lookup table */ + if (table_size <= 128) { + step = 128 / table_size; + + for (i = 1; i < table_size/2; i++) { + float cosine = (float) cst[4 * i * step + 0].d; + float sine = (float) cst[4 * i * step + 1].d; + + table[i + 0][0] = cosine; + table[i + 0][1] = -sine; + table[table_size - i][0] = sine; + table[table_size - i][1] = -cosine; + } + + goto mid_point; + } + /* calculate table offset */ - FFTS_ASSUME(table_size/2 > 1); + FFTS_ASSUME(table_size/2 > 64); log_2 = ffts_ctzl(table_size); FFTS_ASSUME(log_2 > 1); offset = 32 - log_2; + step = log_2 - 8; ct = (const __m128d*) - FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]); - hs = (const double*) &half_secant[4 * offset]; + FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]); + hs = FFTS_ASSUME_ALIGNED_16(&half_secant[2 * offset]); /* initialize from lookup table */ for (i = 0; i <= log_2; i++) { w[i] = ct[2*i]; /* duplicate the high part */ - h[i] = _mm_set1_pd(hs[2*i]); + h[i] = _mm_set1_pd(hs[2*i].d); } /* generate sine and cosine tables with maximum error less than 0.5 ULP */ @@ -279,9 +815,20 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size) _mm_storel_pi((__m64*) &table[table_size - i], _mm_cvtpd_ps( _mm_or_pd(_mm_shuffle_pd(w[log_2], w[log_2], 1), sign_swap))); - /* skip and find next trailing zero */ - offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); - w[log_2] = _mm_mul_pd(h[log_2], _mm_add_pd(w[log_2 + 1], w[offset])); + /* use lookup table when possible */ + if (log_2 > step) { + offset = ((2 * i) >> step) + (4 << (log_2 - step)); + if (offset >= COS_SIN_TABLE_SIZE) { + offset = COS_SIN_TABLE_SIZE - (2 << (log_2 - step)) - 4; + w[log_2] = _mm_loadr_pd(&cst[offset].d); + } else { + w[log_2] = _mm_load_pd(&cst[offset].d); + } + } else { + /* skip and find next trailing zero */ + offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); + w[log_2] = _mm_mul_pd(h[log_2], _mm_add_pd(w[log_2 + 1], w[offset])); + } } mid_point: @@ -297,11 +844,12 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size) { static const __m128d sign_swap = { 0.0, -0.0 }; const struct ffts_dd2_t *FFTS_RESTRICT ct; - const double *FFTS_RESTRICT hs; + const ffts_double_t *FFTS_RESTRICT cst; + const ffts_double_t *FFTS_RESTRICT hs; struct ffts_dd2_t FFTS_ALIGN(16) w[32]; struct ffts_dd2_t FFTS_ALIGN(16) h[32]; struct ffts_dd2_t FFTS_ALIGN(16) sum; - int i, log_2, offset; + int i, log_2, offset, step; /* size must be a power of two */ if (!table || !table_size || (table_size & (table_size - 1))) { @@ -322,22 +870,43 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size) goto mid_point; } + cst = (const ffts_double_t*) + FFTS_ASSUME_ALIGNED_32(&cos_sin_table); + + /* generate small tables from lookup table */ + if (table_size <= 128) { + step = 128 / table_size; + + for (i = 1; i < table_size/2; i++) { + double cosine = cst[4 * i * step + 0].d; + double sine = cst[4 * i * step + 1].d; + + table[i + 0][0] = cosine; + table[i + 0][1] = -sine; + table[table_size - i][0] = sine; + table[table_size - i][1] = -cosine; + } + + goto mid_point; + } + /* calculate table offset */ - FFTS_ASSUME(table_size/2 > 1); + FFTS_ASSUME(table_size/2 > 64); log_2 = ffts_ctzl(table_size); FFTS_ASSUME(log_2 > 1); offset = 32 - log_2; + step = log_2 - 8; ct = (const struct ffts_dd2_t*) - FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]); - hs = (const double*) &half_secant[4 * offset]; + FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]); + hs = FFTS_ASSUME_ALIGNED_16(&half_secant[2 * offset]); /* initialize from lookup table */ for (i = 0; i <= log_2; i++) { w[i] = ct[i]; /* duplicate the high and low parts */ - h[i].hi = _mm_set1_pd(hs[2*i + 0]); - h[i].lo = _mm_set1_pd(hs[2*i + 1]); + h[i].hi = _mm_set1_pd(hs[2*i + 0].d); + h[i].lo = _mm_set1_pd(hs[2*i + 1].d); } /* generate sine and cosine tables with maximum error less than 0.5 ULP */ @@ -351,10 +920,23 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size) _mm_store_pd((double*) &table[table_size - i], _mm_or_pd(_mm_shuffle_pd(w[log_2].hi, w[log_2].hi, 1), sign_swap)); - /* skip and find next trailing zero */ - offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); - sum = ffts_dd2_add_dd2_unnormalized(&w[log_2 + 1], &w[offset]); - w[log_2] = ffts_dd2_mul_dd2(&h[log_2], &sum); + /* use lookup table when possible */ + if (log_2 > step) { + offset = ((2 * i) >> step) + (4 << (log_2 - step)); + if (offset >= COS_SIN_TABLE_SIZE) { + offset = COS_SIN_TABLE_SIZE - (2 << (log_2 - step)) - 4; + w[log_2].hi = _mm_loadr_pd(&cst[offset + 0].d); + w[log_2].lo = _mm_loadr_pd(&cst[offset + 2].d); + } else { + w[log_2].hi = _mm_load_pd(&cst[offset + 0].d); + w[log_2].lo = _mm_load_pd(&cst[offset + 2].d); + } + } else { + /* skip and find next trailing zero */ + offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); + sum = ffts_dd2_add_dd2_unnormalized(&w[log_2 + 1], &w[offset]); + w[log_2] = ffts_dd2_mul_dd2(&h[log_2], &sum); + } } mid_point: @@ -369,9 +951,10 @@ int ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size) { const ffts_cpx_64f *FFTS_RESTRICT ct; - const double *FFTS_RESTRICT hs; + const ffts_double_t *FFTS_RESTRICT cst; + const ffts_double_t *FFTS_RESTRICT hs; ffts_cpx_64f FFTS_ALIGN(16) w[32]; - int i, log_2, offset; + int i, log_2, offset, step; /* size must be a power of two */ if (!table || !table_size || (table_size & (table_size - 1))) { @@ -392,14 +975,35 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size) goto mid_point; } + cst = (const ffts_double_t*) + FFTS_ASSUME_ALIGNED_32(&cos_sin_table); + + /* generate small tables from lookup table */ + if (table_size <= 128) { + step = 128 / table_size; + + for (i = 1; i < table_size/2; i++) { + float cosine = (float) cst[4 * i * step + 0].d; + float sine = (float) cst[4 * i * step + 1].d; + + table[i + 0][0] = cosine; + table[i + 0][1] = -sine; + table[table_size - i][0] = sine; + table[table_size - i][1] = -cosine; + } + + goto mid_point; + } + /* calculate table offset */ - FFTS_ASSUME(table_size/2 > 1); + FFTS_ASSUME(table_size/2 > 64); log_2 = ffts_ctzl(table_size); FFTS_ASSUME(log_2 > 1); offset = 32 - log_2; + step = log_2 - 8; ct = (const ffts_cpx_64f*) - FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]); - hs = (const double*) &half_secant[4 * offset]; + FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]); + hs = FFTS_ASSUME_ALIGNED_16(&half_secant[2 * offset]); /* initialize from lookup table */ for (i = 0; i <= log_2; i++) { @@ -417,10 +1021,23 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size) table[table_size - i][0] = (float) w[log_2][1]; table[table_size - i][1] = (float) -w[log_2][0]; - /* skip and find next trailing zero */ - offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); - w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]); - w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]); + /* use lookup table when possible */ + if (log_2 > step) { + offset = ((2 * i) >> step) + (4 << (log_2 - step)); + if (offset >= 260) { + offset = 260 - (2 << (log_2 - step)) - 4; + w[log_2][0] = cst[offset + 0].d; + w[log_2][1] = cst[offset + 1].d; + } else { + w[log_2][0] = cst[offset + 0].d; + w[log_2][1] = cst[offset + 1].d; + } + } else { + /* skip and find next trailing zero */ + offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); + w[log_2][0] = hs[2 * log_2].d * (w[log_2 + 1][0] + w[offset][0]); + w[log_2][1] = hs[2 * log_2].d * (w[log_2 + 1][1] + w[offset][1]); + } } mid_point: @@ -435,9 +1052,10 @@ int ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size) { const struct ffts_dd_t *FFTS_RESTRICT ct; + const ffts_double_t *FFTS_RESTRICT cst; const struct ffts_dd_t *FFTS_RESTRICT hs; struct ffts_dd_t FFTS_ALIGN(16) w[32][2]; - int i, log_2, offset; + int i, log_2, offset, step; /* size must be a power of two */ if (!table || !table_size || (table_size & (table_size - 1))) { @@ -458,14 +1076,35 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size) goto mid_point; } + cst = (const ffts_double_t*) + FFTS_ASSUME_ALIGNED_32(&cos_sin_table); + + /* generate small tables from lookup table */ + if (table_size <= 128) { + step = 128 / table_size; + + for (i = 1; i < table_size/2; i++) { + double cosine = cst[4 * i * step + 0].d; + double sine = cst[4 * i * step + 1].d; + + table[i + 0][0] = cosine; + table[i + 0][1] = -sine; + table[table_size - i][0] = sine; + table[table_size - i][1] = -cosine; + } + + goto mid_point; + } + /* calculate table offset */ - FFTS_ASSUME(table_size/2 > 1); + FFTS_ASSUME(table_size/2 > 64); log_2 = ffts_ctzl(table_size); FFTS_ASSUME(log_2 > 1); offset = 32 - log_2; + step = log_2 - 8; ct = (const struct ffts_dd_t*) - FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]); - hs = (const struct ffts_dd_t*) &half_secant[4 * offset]; + FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]); + hs = (const struct ffts_dd_t*) &half_secant[2 * offset]; /* initialize from lookup table */ for (i = 0; i <= log_2; i++) { @@ -486,12 +1125,29 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size) table[table_size - i][0] = w[log_2][1].hi; table[table_size - i][1] = -w[log_2][0].hi; - /* skip and find next trailing zero */ - offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); - w[log_2][0] = ffts_dd_mul_dd(hs[log_2], - ffts_dd_add_dd_unnormalized(w[log_2 + 1][0], w[offset][0])); - w[log_2][1] = ffts_dd_mul_dd(hs[log_2], - ffts_dd_add_dd_unnormalized(w[log_2 + 1][1], w[offset][1])); + /* use lookup table when possible */ + if (log_2 > step) { + offset = ((2 * i) >> step) + (4 << (log_2 - step)); + if (offset >= 260) { + offset = 260 - (2 << (log_2 - step)) - 4; + w[log_2][0].hi = cst[offset + 1].d; + w[log_2][1].hi = cst[offset + 0].d; + w[log_2][0].lo = cst[offset + 3].d; + w[log_2][1].lo = cst[offset + 2].d; + } else { + w[log_2][0].hi = cst[offset + 0].d; + w[log_2][1].hi = cst[offset + 1].d; + w[log_2][0].lo = cst[offset + 2].d; + w[log_2][1].lo = cst[offset + 3].d; + } + } else { + /* skip and find next trailing zero */ + offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); + w[log_2][0] = ffts_dd_mul_dd(hs[log_2], + ffts_dd_add_dd_unnormalized(w[log_2 + 1][0], w[offset][0])); + w[log_2][1] = ffts_dd_mul_dd(hs[log_2], + ffts_dd_add_dd_unnormalized(w[log_2 + 1][1], w[offset][1])); + } } mid_point: @@ -509,7 +1165,7 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p, int invert) { const ffts_cpx_64f *FFTS_RESTRICT ct; - const double *FFTS_RESTRICT hs; + const ffts_double_t *FFTS_RESTRICT hs; ffts_cpx_64f FFTS_ALIGN(16) w[32]; int i, log_2, offset, N; float *A, *B; @@ -547,8 +1203,8 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p, FFTS_ASSUME(log_2 > 2); offset = 34 - log_2; ct = (const ffts_cpx_64f*) - FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]); - hs = (const double*) &half_secant[4 * offset]; + FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]); + hs = FFTS_ASSUME_ALIGNED_16(&half_secant[2 * offset]); /* initialize from lookup table */ for (i = 0; i <= log_2; i++) { @@ -556,7 +1212,6 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p, w[i][1] = ct[2*i][1]; } - /* generate sine and cosine tables with maximum error less than 0.5 ULP */ if (sign < 0) { for (i = 1; i < N/4; i++) { float t0, t1, t2; @@ -580,8 +1235,8 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p, /* skip and find next trailing zero */ offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); - w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]); - w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]); + w[log_2][0] = hs[2 * log_2].d * (w[log_2 + 1][0] + w[offset][0]); + w[log_2][1] = hs[2 * log_2].d * (w[log_2 + 1][1] + w[offset][1]); } } else { for (i = 1; i < N/4; i++) { @@ -606,8 +1261,8 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p, /* skip and find next trailing zero */ offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); - w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]); - w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]); + w[log_2][0] = hs[2 * log_2].d * (w[log_2 + 1][0] + w[offset][0]); + w[log_2][1] = hs[2 * log_2].d * (w[log_2 + 1][1] + w[offset][1]); } } @@ -625,4 +1280,4 @@ last: } return 0; -} \ No newline at end of file +} diff --git a/lib/ffts/src/ffts_trig.h b/lib/ffts/src/ffts_trig.h index 0b22738..f988340 100644 --- a/lib/ffts/src/ffts_trig.h +++ b/lib/ffts/src/ffts_trig.h @@ -2,7 +2,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South -Copyright (c) 2015, Jukka Ojanen +Copyright (c) 2015-2016, Jukka Ojanen All rights reserved. @@ -39,8 +39,16 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ffts_internal.h" +/* calculate cos(pi * n / d) and sin(pi * n / d) with maximum error less than 1 ULP, average ~0.5 ULP */ int -ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, int table_size); +ffts_cexp_32f(size_t n, size_t d, float *output); + +int +ffts_generate_chirp_32f(ffts_cpx_32f *const table, size_t table_size); + +/* generate cosine and sine tables with maximum error less than 1 ULP, average ~0.5 ULP */ +int +ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, size_t table_size); int ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size); diff --git a/lib/ffts/src/macros-alpha.h b/lib/ffts/src/macros-alpha.h index f7795d4..c32d1e9 100644 --- a/lib/ffts/src/macros-alpha.h +++ b/lib/ffts/src/macros-alpha.h @@ -58,9 +58,6 @@ typedef union { uint32_t u[4]; } V4SF; -#define FFTS_MALLOC(d,a) (malloc(d)) -#define FFTS_FREE(d) (free(d)) - static FFTS_ALWAYS_INLINE V4SF V4SF_LIT4(float f3, float f2, float f1, float f0) { diff --git a/lib/ffts/src/macros-altivec.h b/lib/ffts/src/macros-altivec.h index 28f552f..33f2346 100644 --- a/lib/ffts/src/macros-altivec.h +++ b/lib/ffts/src/macros-altivec.h @@ -4,6 +4,7 @@ Copyright (c) 2013, Michael J. Cree Copyright (c) 2012, 2013, Anthony M. Blake + Copyright (c) 2019, Timothy Pearson All rights reserved. @@ -39,99 +40,89 @@ #define restrict -typedef vector float V; +typedef vector float V4SF; typedef vector unsigned char VUC; -#ifdef __apple__ -#define FFTS_MALLOC(d,a) vec_malloc(d) -#define FFTS_FREE(d) vec_free(d) -#else -/* It appears vec_malloc() and friends are not implemented on Linux */ -#include -#define FFTS_MALLOC(d,a) memalign(16,d) -#define FFTS_FREE(d) free(d) -#endif - -#define VLIT4(f0,f1,f2,f3) ((V){f0, f1, f2, f3}) +#define V4SF_LIT4(f0,f1,f2,f3) ((V4SF){f0, f1, f2, f3}) -#define VADD(x,y) vec_add(x,y) -#define VSUB(x,y) vec_sub(x,y) -#define VMUL(x,y) vec_madd(x,y,(V){0}) -#define VMULADD(x,y,z) vec_madd(x,y,z) -#define VNMULSUB(x,y,z) vec_nmsub(x,y,z) -#define VXOR(x,y) vec_xor((x),(y)) -#define VSWAPPAIRS(x) \ +#define V4SF_ADD(x,y) vec_add(x,y) +#define V4SF_SUB(x,y) vec_sub(x,y) +#define V4SF_MUL(x,y) vec_madd(x,y,(V4SF){0}) +#define V4SF_MULADD(x,y,z) vec_madd(x,y,z) +#define V4SF_NMULSUB(x,y,z) vec_nmsub(x,y,z) +#define V4SF_XOR(x,y) vec_xor((x),(y)) +#define V4SF_SWAPPAIRS(x) \ vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03, \ 0x0c,0x0d,0x0e,0x0f,0x08,0x09,0x0a,0x0b}) -#define VBLEND(x,y) \ +#define V4SF_BLEND(x,y) \ vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \ 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f}) -#define VUNPACKHI(x,y) \ +#define V4SF_UNPACK_HI(x,y) \ vec_perm(x,y,(VUC){0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, \ 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f}) -#define VUNPACKLO(x,y) \ +#define V4SF_UNPACK_LO(x,y) \ vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \ 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}) -#define VDUPRE(x) \ +#define V4SF_DUPLICATE_RE(x) \ vec_perm(x,x,(VUC){0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03, \ 0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b}) -#define VDUPIM(x) \ +#define V4SF_DUPLICATE_IM(x) \ vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07, \ 0x1c,0x1d,0x1e,0x1f,0x1c,0x1d,0x1e,0x1f}) -static inline V IMUL(V d, V re, V im) +static inline V4SF V4SF_IMUL(V4SF d, V4SF re, V4SF im) { - im = VMUL(im, VSWAPPAIRS(d)); - re = VMUL(re, d); - return VSUB(re, im); + im = V4SF_MUL(im, V4SF_SWAPPAIRS(d)); + re = V4SF_MUL(re, d); + return V4SF_SUB(re, im); } -static inline V IMULJ(V d, V re, V im) +static inline V4SF V4SF_IMULJ(V4SF d, V4SF re, V4SF im) { - im = VMUL(im, VSWAPPAIRS(d)); - return VMULADD(re, d, im); + im = V4SF_MUL(im, V4SF_SWAPPAIRS(d)); + return V4SF_MULADD(re, d, im); } #ifndef __GNUC__ /* gcc (4.6 and 4.7) ICEs on this code! */ -static inline V MULI(int inv, V x) +static inline V4SF MULI(int inv, V4SF x) { - return VXOR(x, inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f)); + return V4SF_XOR(x, inv ? V4SF_LIT4(-0.0f,0.0f,-0.0f,0.0f) : V4SF_LIT4(0.0f,-0.0f,0.0f,-0.0f)); } #else /* but compiles this fine... */ -static inline V MULI(int inv, V x) +static inline V4SF MULI(int inv, V4SF x) { - V t; - t = inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f); - return VXOR(x, t); + V4SF t; + t = inv ? V4SF_LIT4(-0.0f,0.0f,-0.0f,0.0f) : V4SF_LIT4(0.0f,-0.0f,0.0f,-0.0f); + return V4SF_XOR(x, t); } #endif -static inline V IMULI(int inv, V x) +static inline V4SF V4SF_IMULI(int inv, V4SF x) { - return VSWAPPAIRS(MULI(inv, x)); + return V4SF_SWAPPAIRS(MULI(inv, x)); } -static inline V VLD(const void *s) +static inline V4SF V4SF_LD(const void *s) { - V *d = (V *)s; + V4SF *d = (V4SF *)s; return *d; } -static inline void VST(void *d, V s) +static inline void V4SF_ST(void *d, V4SF s) { - V *r = (V *)d; + V4SF *r = (V4SF *)d; *r = s; } #endif diff --git a/lib/ffts/src/macros-neon.h b/lib/ffts/src/macros-neon.h index 29aa49f..f0d1fff 100644 --- a/lib/ffts/src/macros-neon.h +++ b/lib/ffts/src/macros-neon.h @@ -39,9 +39,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -#define FFTS_MALLOC(d,a) (valloc(d)) -#define FFTS_FREE(d) (free(d)) - typedef float32x4_t V4SF; typedef float32x4x2_t V4SF2; diff --git a/lib/ffts/src/macros-sse.h b/lib/ffts/src/macros-sse.h index 827aa67..46e1f29 100644 --- a/lib/ffts/src/macros-sse.h +++ b/lib/ffts/src/macros-sse.h @@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South Copyright (c) 2012, Anthony M. Blake Copyright (c) 2012, The University of Waikato +Copyright (c) 2018, Jukka Ojanen All rights reserved. @@ -40,9 +41,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#define FFTS_MALLOC(d,a) (_mm_malloc(d,a)) -#define FFTS_FREE(d) (_mm_free(d)) - typedef __m128 V4SF; #define V4SF_ADD _mm_add_ps @@ -56,8 +54,9 @@ typedef __m128 V4SF; #define V4SF_SWAP_PAIRS(x) \ (_mm_shuffle_ps(x, x, _MM_SHUFFLE(2,3,0,1))) +/* note: order is swapped */ #define V4SF_UNPACK_HI(x,y) \ - (_mm_shuffle_ps(x, y, _MM_SHUFFLE(3,2,3,2))) + (_mm_movehl_ps(y, x)) #define V4SF_UNPACK_LO(x,y) \ (_mm_movelh_ps(x, y)) @@ -97,4 +96,220 @@ V4SF_IMULJ(V4SF d, V4SF re, V4SF im) return V4SF_ADD(re, im); } +#ifdef FFTS_DOUBLE +typedef union { + struct { + double r1; + double i1; + double r2; + double i2; + } r; + uint32_t u[8]; +} V4DF; + +static FFTS_ALWAYS_INLINE V4DF +V4DF_LIT4(double f3, double f2, double f1, double f0) +{ + V4DF z; + + z.r.r1 = f0; + z.r.i1 = f1; + z.r.r2 = f2; + z.r.i2 = f3; + + return z; +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_ADD(V4DF x, V4DF y) +{ + V4DF z; + + z.r.r1 = x.r.r1 + y.r.r1; + z.r.i1 = x.r.i1 + y.r.i1; + z.r.r2 = x.r.r2 + y.r.r2; + z.r.i2 = x.r.i2 + y.r.i2; + + return z; +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_SUB(V4DF x, V4DF y) +{ + V4DF z; + + z.r.r1 = x.r.r1 - y.r.r1; + z.r.i1 = x.r.i1 - y.r.i1; + z.r.r2 = x.r.r2 - y.r.r2; + z.r.i2 = x.r.i2 - y.r.i2; + + return z; +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_MUL(V4DF x, V4DF y) +{ + V4DF z; + + z.r.r1 = x.r.r1 * y.r.r1; + z.r.i1 = x.r.i1 * y.r.i1; + z.r.r2 = x.r.r2 * y.r.r2; + z.r.i2 = x.r.i2 * y.r.i2; + + return z; +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_XOR(V4DF x, V4DF y) +{ + V4DF z; + + z.u[0] = x.u[0] ^ y.u[0]; + z.u[1] = x.u[1] ^ y.u[1]; + z.u[2] = x.u[2] ^ y.u[2]; + z.u[3] = x.u[3] ^ y.u[3]; + z.u[4] = x.u[4] ^ y.u[4]; + z.u[5] = x.u[5] ^ y.u[5]; + z.u[6] = x.u[6] ^ y.u[6]; + z.u[7] = x.u[7] ^ y.u[7]; + + return z; +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_SWAP_PAIRS(V4DF x) +{ + V4DF z; + + z.r.r1 = x.r.i1; + z.r.i1 = x.r.r1; + z.r.r2 = x.r.i2; + z.r.i2 = x.r.r2; + + return z; +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_BLEND(V4DF x, V4DF y) +{ + V4DF z; + + z.r.r1 = x.r.r1; + z.r.i1 = x.r.i1; + z.r.r2 = y.r.r2; + z.r.i2 = y.r.i2; + + return z; +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_UNPACK_HI(V4DF x, V4DF y) +{ + V4DF z; + + z.r.r1 = x.r.r2; + z.r.i1 = x.r.i2; + z.r.r2 = y.r.r2; + z.r.i2 = y.r.i2; + + return z; +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_UNPACK_LO(V4DF x, V4DF y) +{ + V4DF z; + + z.r.r1 = x.r.r1; + z.r.i1 = x.r.i1; + z.r.r2 = y.r.r1; + z.r.i2 = y.r.i1; + + return z; +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_DUPLICATE_RE(V4DF x) +{ + V4DF z; + + z.r.r1 = x.r.r1; + z.r.i1 = x.r.r1; + z.r.r2 = x.r.r2; + z.r.i2 = x.r.r2; + + return z; +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_DUPLICATE_IM(V4DF x) +{ + V4DF z; + + z.r.r1 = x.r.i1; + z.r.i1 = x.r.i1; + z.r.r2 = x.r.i2; + z.r.i2 = x.r.i2; + + return z; +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_IMUL(V4DF d, V4DF re, V4DF im) +{ + re = V4DF_MUL(re, d); + im = V4DF_MUL(im, V4DF_SWAP_PAIRS(d)); + return V4DF_SUB(re, im); +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_IMULJ(V4DF d, V4DF re, V4DF im) +{ + re = V4DF_MUL(re, d); + im = V4DF_MUL(im, V4DF_SWAP_PAIRS(d)); + return V4DF_ADD(re, im); +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_MULI(int inv, V4DF x) +{ + V4DF z; + + if (inv) { + z.r.r1 = -x.r.r1; + z.r.i1 = x.r.i1; + z.r.r2 = -x.r.r2; + z.r.i2 = x.r.i2; + } else { + z.r.r1 = x.r.r1; + z.r.i1 = -x.r.i1; + z.r.r2 = x.r.r2; + z.r.i2 = -x.r.i2; + } + + return z; +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_IMULI(int inv, V4DF x) +{ + return V4DF_SWAP_PAIRS(V4DF_MULI(inv, x)); +} + +static FFTS_ALWAYS_INLINE V4DF +V4DF_LD(const void *s) +{ + V4DF z; + memcpy(&z, s, sizeof(z)); + return z; +} + +static FFTS_ALWAYS_INLINE void +V4DF_ST(void *d, V4DF s) +{ + V4DF *r = (V4DF*) d; + *r = s; +} +#endif + #endif /* FFTS_MACROS_SSE_H */ diff --git a/lib/ffts/src/macros.h b/lib/ffts/src/macros.h index e7e349f..99b0c53 100644 --- a/lib/ffts/src/macros.h +++ b/lib/ffts/src/macros.h @@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South Copyright (c) 2013, Michael J. Cree Copyright (c) 2012, 2013, Anthony M. Blake +Copyright (c) 2018, Jukka Ojanen All rights reserved. @@ -41,14 +42,29 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef HAVE_NEON #include "macros-neon.h" #elif HAVE_SSE +#ifdef HAVE_AVX +#include "macros-avx.h" +#else #include "macros-sse.h" +#endif // NOTE: AltiVec support disabled until updated to provide new V4SF variable type -//#elif __powerpc__ -//#include "macros-altivec.h" +#elif __powerpc__ +#include "macros-altivec.h" #else #include "macros-alpha.h" #endif +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_TX2(V4DF *a, V4DF *b) +{ + V4DF t0 = V4DF_UNPACK_LO(*a, *b); + V4DF t1 = V4DF_UNPACK_HI(*a, *b); + *a = t0; + *b = t1; +} +#endif + static FFTS_INLINE void V4SF_TX2(V4SF *a, V4SF *b) { @@ -58,6 +74,34 @@ V4SF_TX2(V4SF *a, V4SF *b) *b = t1; } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_K_N(int inv, + V4DF re, + V4DF im, + V4DF *r0, + V4DF *r1, + V4DF *r2, + V4DF *r3) +{ + V4DF uk, uk2, zk_p, zk_n, zk, zk_d; + + uk = *r0; + uk2 = *r1; + + zk_p = V4DF_IMUL(*r2, re, im); + zk_n = V4DF_IMULJ(*r3, re, im); + + zk = V4DF_ADD(zk_p, zk_n); + zk_d = V4DF_IMULI(inv, V4DF_SUB(zk_p, zk_n)); + + *r2 = V4DF_SUB(uk, zk); + *r0 = V4DF_ADD(uk, zk); + *r3 = V4DF_ADD(uk2, zk_d); + *r1 = V4DF_SUB(uk2, zk_d); +} +#endif + static FFTS_INLINE void V4SF_K_N(int inv, V4SF re, @@ -84,6 +128,45 @@ V4SF_K_N(int inv, *r1 = V4SF_SUB(uk2, zk_d); } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_L_2_4(int inv, + const double *FFTS_RESTRICT i0, + const double *FFTS_RESTRICT i1, + const double *FFTS_RESTRICT i2, + const double *FFTS_RESTRICT i3, + V4DF *r0, + V4DF *r1, + V4DF *r2, + V4DF *r3) +{ + V4DF t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = V4DF_LD(i0); + t1 = V4DF_LD(i1); + t2 = V4DF_LD(i2); + t3 = V4DF_LD(i3); + + t4 = V4DF_ADD(t0, t1); + t5 = V4DF_SUB(t0, t1); + t6 = V4DF_ADD(t2, t3); + t7 = V4DF_SUB(t2, t3); + + *r0 = V4DF_UNPACK_LO(t4, t5); + *r1 = V4DF_UNPACK_LO(t6, t7); + + t5 = V4DF_IMULI(inv, t5); + + t0 = V4DF_ADD(t6, t4); + t2 = V4DF_SUB(t6, t4); + t1 = V4DF_SUB(t7, t5); + t3 = V4DF_ADD(t7, t5); + + *r3 = V4DF_UNPACK_HI(t0, t1); + *r2 = V4DF_UNPACK_HI(t2, t3); +} +#endif + static FFTS_INLINE void V4SF_L_2_4(int inv, const float *FFTS_RESTRICT i0, @@ -121,6 +204,46 @@ V4SF_L_2_4(int inv, *r2 = V4SF_UNPACK_HI(t2, t3); } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_L_4_4(int inv, + const double *FFTS_RESTRICT i0, + const double *FFTS_RESTRICT i1, + const double *FFTS_RESTRICT i2, + const double *FFTS_RESTRICT i3, + V4DF *r0, + V4DF *r1, + V4DF *r2, + V4DF *r3) +{ + V4DF t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = V4DF_LD(i0); + t1 = V4DF_LD(i1); + t2 = V4DF_LD(i2); + t3 = V4DF_LD(i3); + + t4 = V4DF_ADD(t0, t1); + t5 = V4DF_SUB(t0, t1); + t6 = V4DF_ADD(t2, t3); + + t7 = V4DF_IMULI(inv, V4DF_SUB(t2, t3)); + + t0 = V4DF_ADD(t4, t6); + t2 = V4DF_SUB(t4, t6); + t1 = V4DF_SUB(t5, t7); + t3 = V4DF_ADD(t5, t7); + + V4DF_TX2(&t0, &t1); + V4DF_TX2(&t2, &t3); + + *r0 = t0; + *r2 = t1; + *r1 = t2; + *r3 = t3; +} +#endif + static FFTS_INLINE void V4SF_L_4_4(int inv, const float *FFTS_RESTRICT i0, @@ -159,6 +282,48 @@ V4SF_L_4_4(int inv, *r3 = t3; } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_L_4_2(int inv, + const double *FFTS_RESTRICT i0, + const double *FFTS_RESTRICT i1, + const double *FFTS_RESTRICT i2, + const double *FFTS_RESTRICT i3, + V4DF *r0, + V4DF *r1, + V4DF *r2, + V4DF *r3) +{ + V4DF t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = V4DF_LD(i0); + t1 = V4DF_LD(i1); + t6 = V4DF_LD(i2); + t7 = V4DF_LD(i3); + + t2 = V4DF_BLEND(t6, t7); + t3 = V4DF_BLEND(t7, t6); + + t4 = V4DF_ADD(t0, t1); + t5 = V4DF_SUB(t0, t1); + t6 = V4DF_ADD(t2, t3); + t7 = V4DF_SUB(t2, t3); + + *r2 = V4DF_UNPACK_HI(t4, t5); + *r3 = V4DF_UNPACK_HI(t6, t7); + + t7 = V4DF_IMULI(inv, t7); + + t0 = V4DF_ADD(t4, t6); + t2 = V4DF_SUB(t4, t6); + t1 = V4DF_SUB(t5, t7); + t3 = V4DF_ADD(t5, t7); + + *r0 = V4DF_UNPACK_LO(t0, t1); + *r1 = V4DF_UNPACK_LO(t2, t3); +} +#endif + static FFTS_INLINE void V4SF_L_4_2(int inv, const float *FFTS_RESTRICT i0, @@ -199,6 +364,9 @@ V4SF_L_4_2(int inv, *r1 = V4SF_UNPACK_LO(t2, t3); } +#define V4DF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \ + V4DF_ST(o0, r0); V4DF_ST(o1, r1); V4DF_ST(o2, r2); V4DF_ST(o3, r3); + #define V4SF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \ V4SF_ST(o0, r0); V4SF_ST(o1, r1); V4SF_ST(o2, r2); V4SF_ST(o3, r3); -- cgit v1.2.1