Last active
September 2, 2019 10:07
-
-
Save iccir/b92d60822d2259c6720c6a8fc5fdbf52 to your computer and use it in GitHub Desktop.
Corrected example of http://hamiltonkibbe.com/finite-impulse-response-filters-using-apples-accelerate-framework-part-iii/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <math.h> | |
#include <Accelerate/Accelerate.h> | |
/* filter signal x with filter h and store the result in output. | |
* output must be at least as long as x | |
*/ | |
void | |
fft_fir_filter( const float *x, | |
unsigned x_length, | |
const float *h, | |
unsigned h_length, | |
float *output) | |
{ | |
// The length of the result from linear convolution is one less than the | |
// sum of the lengths of the two inputs. | |
unsigned result_length = x_length + h_length - 1; | |
unsigned overlap_length = result_length - x_length; | |
// Create buffer to store overflow across calls. | |
// | |
// iccir: This assumes that h_length doesn't change across calls. | |
// The overflow buffer would usually be passed in rather | |
// than being a static. | |
static float *overflow = NULL; | |
if (!overflow) { | |
overflow = malloc(sizeof(float) * (h_length - 1)); | |
float zero = 0.0; | |
vDSP_vfill(&zero, overflow, 1, h_length - 1); | |
} | |
// You need to implement next_pow2, it should return the first power of 2 | |
// greater than the argument passed to it. | |
unsigned fft_length = next_pow2(result_length); | |
// Create buffers to hold the zero-padded signal, filter kernel, and result. | |
float h_padded[fft_length]; | |
float x_padded[fft_length]; | |
float temp_result[fft_length]; | |
// fill padded buffers with zeros | |
float zero = 0.0; | |
vDSP_vfill(&zero, h_padded, 1, fft_length); | |
vDSP_vfill(&zero, x_padded, 1, fft_length); | |
// Copy inputs into padded buffers | |
cblas_scopy(h_length, h, 1, h_padded, 1); | |
cblas_scopy(x_length, x, 1, x_padded, 1); | |
// The Accelerate FFT needs an initialized setup structure. This, like much | |
// of the above setup routine should be done outside of this function. I am | |
// putting it here for ease of demonstration. This only needs to happen once. | |
// | |
// iccir: A balancing call to vDSP_destroy_fftsetup() is needed to prevent a | |
// memory leak. | |
FFTSetup setup = vDSP_create_fftsetup(log2f((float)fft_length), FFT_RADIX2); | |
// Create complex buffers for holding the Fourier Transforms of h and x | |
// DSPSplitComplex holds pointers to two arrays of values, real, and imaginary. | |
// Each array should hold at least fft_length/2 samples. | |
DSPSplitComplex h_DFT; | |
DSPSplitComplex x_DFT; | |
// Create and assign the backing storage structures for each of these buffers. | |
// In your actual implementation, these should be allocated elsewhere and | |
// passed to this function along with the FFTSetup from above. | |
float h_real[fft_length/2]; | |
float h_imag[fft_length/2]; | |
h_DFT.realp = h_real; | |
h_DFT.imagp = h_imag; | |
float x_real[fft_length/2]; | |
float x_imag[fft_length/2]; | |
x_DFT.realp = x_real; | |
x_DFT.imagp = x_imag; | |
// Here we calculate the FFT of the filter kernel. This should be done once | |
// when you initialize your filter. As I mentioned previously, much of | |
// this setup routine should be done outside of this function and saved. | |
// Convert the real-valued filter kernel to split-complex form | |
// and store it in our DFT array. | |
vDSP_ctoz((DSPComplex*)h_padded, 2, &h_DFT, 1, (fft_length/2)); | |
// Do in-place FFT of filter kernel | |
vDSP_fft_zrip(setup, &h_DFT, 1, log2f(fft_length), FFT_FORWARD); | |
// Calculate FFT of input signal... | |
// Convert the real-valued signal to split-complex form | |
// and store it in our DFT array. | |
vDSP_ctoz((DSPComplex*)x_padded, 2, &x_DFT, 1, (fft_length/2)); | |
// Do in-place FFT of the input signal | |
vDSP_fft_zrip(setup, &x_DFT, 1, log2f(fft_length), FFT_FORWARD); | |
// This gets a bit strange. The vDSP FFT stores the real value at nyquist in the | |
// first element in the imaginary array. The first imaginary element is always | |
// zero, so no information is lost by doing this. The only issue is that we are | |
// going to use a complex vector multiply function from vDSP and it doesn't | |
// handle this format very well. We calculate this multiplication ourselves and | |
// add it into our result later. | |
// We'll need this later | |
float nyquist_multiplied = h_DFT.imagp[0] * x_DFT.imagp[0]; | |
// Set the values in the arrays to 0 so they don't affect the multiplication | |
h_DFT.imagp[0] = 0.0; | |
x_DFT.imagp[0] = 0.0; | |
// Complex multiply x_DFT by h_DFT and store the result in x_DFT | |
vDSP_zvmul(&x_DFT, 1, &h_DFT, 1, &x_DFT,1, fft_length/2, 1); | |
// Now we put our hand-calculated nyquist value back | |
x_DFT.imagp[0] = nyquist_multiplied; | |
// Do the inverse FFT of our result | |
vDSP_fft_zrip(setup, &x_DFT, 1, log2f(fft_length), FFT_INVERSE); | |
// And convert split-complex format to real-valued | |
vDSP_ztoc(&x_DFT, 1, (DSPComplex*)temp_result, 2, fft_length/2); | |
// Now we have to scale our result by 1/(4*fft_length) | |
// (This is Apple's convention) to get our correct result. | |
float scale = (1.0 / (4.0 * fft_length) ); | |
vDSP_vsmul(temp_result, 1, &scale, temp_result, 1, fft_length); | |
// The rest of our overlap handling is the same as in our previous | |
// implementation | |
// Add the overlap from the previous run | |
// use vDSP_vadd instead of loop | |
vDSP_vadd(temp_result, 1, overflow, 1, temp_result, 1, overlap_length); | |
// Copy overlap into overlap buffer | |
cblas_scopy(overlap_length, temp_result + x_length, 1, overflow, 1); | |
// write the final result to the output. use BLAS copy instead of loop | |
cblas_scopy(x_length, temp_result, 1, output, 1); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment