I'm trying to use the complex multiply intrinsic to see whether it will improve the performance of a simple FFT.
Here is the base code without any intrinsics:
/*
 * base_fft - in-place radix-2 butterfly passes, plain C (no intrinsics).
 *
 * data  : interleaved float pairs; the butterflies treat data[2k] and
 *         data[2k+1] as the two components of one complex value. Elements
 *         0 .. 2*nn-1 are read and written.
 * nn    : number of complex points (the loops run over 2*nn floats).
 * isign : integer factor applied to the table value read at offset nn/4.
 *
 * No bit-reversal reordering is done here; only the butterfly stages run.
 * Twiddle values come from the global fSintable, which this function
 * re-points at SINTAB on every call.
 *
 * NOTE(review): the table presumably holds one sine period over nn entries,
 * so the nn/4 offset would yield the cosine - confirm against std_rtaf.h.
 * NOTE(review): isign scales the nn/4-offset (cosine-like) term rather than
 * the sine term; confirm this is the intended direction convention.
 */
void base_fft(float* data, int nn, int isign)
{
    int total, span, stride, k, lo, hi, tw;
    float cos_w, sin_w, prod_re, prod_im;

    total = nn << 1; /* floats in the buffer: two per complex point */
    fSintable = (float*)&SINTAB; /* SINTAB defined in std_rtaf.h as volatile int* */

    /* span = distance (in floats) between the two legs of a butterfly */
    for (span = 2; total > span; span = stride) {
        stride = 2 * span;

        for (k = 0; k < span; k += 2) {
            /* table index of this twiddle; same expression feeds both reads */
            tw = k / 2 * nn / span;
            sin_w = fSintable[tw];
            cos_w = isign * fSintable[nn / 4 + tw];

            for (lo = k; lo < total; lo += stride) {
                hi = lo + span;

                /* complex product (cos_w + i*sin_w) * (data[hi] + i*data[hi+1]) */
                prod_re = cos_w * data[hi] - sin_w * data[hi + 1];
                prod_im = cos_w * data[hi + 1] + sin_w * data[hi];

                /* butterfly: upper leg gets the difference, lower leg the sum */
                data[hi] = data[lo] - prod_re;
                data[hi + 1] = data[lo + 1] - prod_im;
                data[lo] += prod_re;
                data[lo + 1] += prod_im;
            }
        }
    }
}
For a 1024-point FFT, it takes 221 µs.
Here is the code using intrinsics:
/*
 * intrin_fft - same butterfly passes as the plain-C version, but the
 * twiddle multiplication goes through the _complex_mpysp intrinsic.
 *
 * data  : interleaved float pairs, updated in place; elements 0 .. 2*nn-1
 *         are read and written.
 * nn    : number of complex points (the loops run over 2*nn floats).
 * isign : integer factor applied to the table value read at offset nn/4.
 *
 * No bit-reversal reordering is done here. Twiddle values come from the
 * global fSintable, which this function re-points at SINTAB on every call.
 *
 * NOTE(review): _amemd8 is presumably the compiler's aligned 8-byte access;
 * if so, both `data` and the local twiddle pair must be 8-byte aligned -
 * confirm for the target/compiler in use.
 * NOTE(review): isign scales the nn/4-offset (cosine-like) term rather than
 * the sine term; confirm this is the intended direction convention.
 */
void intrin_fft(float* data, int nn, int isign)
{
    int total, span, stride, k, lo, hi, tw;
    float twiddle[2];
    float re, im;
    double w_pair, x_pair, prod;

    total = nn << 1; /* floats in the buffer: two per complex point */
    fSintable = (float*)&SINTAB; /* SINTAB defined in std_rtaf.h as volatile int* */

    /* span = distance (in floats) between the two legs of a butterfly */
    for (span = 2; total > span; span = stride) {
        stride = 2 * span;

        for (k = 0; k < span; k += 2) {
            /* table index of this twiddle; same expression feeds both reads */
            tw = k / 2 * nn / span;
            twiddle[0] = isign * fSintable[nn / 4 + tw];
            twiddle[1] = fSintable[tw];

            /* pack the two twiddle floats into one 64-bit operand */
            w_pair = _amemd8((void*)&twiddle[0]);

            for (lo = k; lo < total; lo += stride) {
                hi = lo + span;

                /* fetch data[hi], data[hi+1] as one 64-bit operand and multiply */
                x_pair = _amemd8((void*)&data[hi]);
                prod = _complex_mpysp(w_pair, x_pair);

                /*
                 * The negated high half is used where the plain-C version
                 * uses the real product term, the low half where it uses
                 * the imaginary term.
                 */
                re = -_hif(prod);
                im = _lof(prod);

                /* butterfly: upper leg gets the difference, lower leg the sum */
                data[hi] = data[lo] - re;
                data[hi + 1] = data[lo + 1] - im;
                data[lo] += re;
                data[lo + 1] += im;
            }
        }
    }
}
This version takes 223 µs.
I would like to know where I'm going wrong. I expected the intrinsics to give a boost.
Thanks,
Arun