I'm trying to use the complex-multiply intrinsic to see whether it improves the performance of a simple FFT.
Here is the base code without any intrinsics:
/*
 * Plain-C butterfly passes of the FFT, performed in place on `data`.
 *
 * data  - float buffer read and written in adjacent pairs
 *         (data[k], data[k+1]); treated as 2*nn floats.
 * nn    - number of float pairs; also scales the lookup-table index.
 *         NOTE(review): presumably a power of two -- confirm with callers.
 * isign - integer factor applied to the table entry used as `c` below.
 *         NOTE(review): it scales the nn/4-offset entry, not `s`; confirm
 *         this matches the intended direction convention.
 *
 * Side effect: points the global fSintable at SINTAB.
 * No reordering of the input is performed inside this function.
 */
void base_fft(float* data, int nn, int isign)
{
    int total = nn << 1;   /* number of floats addressed in data */
    int half;              /* float offset between the two legs of a butterfly */
    int step;              /* float stride between butterflies sharing one factor */
    int k;
    int lo;
    int hi;
    float c;
    float s;
    float re;
    float im;

    fSintable = (float*)&SINTAB; /* SINTAB defined in std_rtaf.h as volatile int* */

    /* `half` doubles after every pass until it reaches `total`. */
    for (half = 2; total > half; half = step)
    {
        step = 2*half;
        for (k = 0; k < half; k += 2)
        {
            /* Both factors for this k are read once and reused by the inner loop. */
            s = fSintable[((k)/2 * nn/half)];
            c = isign * fSintable[nn/4 + ((k)/2 * nn/half)];

            for (lo = k; lo < total; lo += step)
            {
                hi = lo + half;

                /* (c, s) times (data[hi], data[hi+1]) as a complex product. */
                re = c*data[hi] - s*data[hi+1];
                im = c*data[hi+1] + s*data[hi];

                /* Upper leg gets the difference, lower leg the sum. */
                data[hi] = data[lo] - re;
                data[hi+1] = data[lo+1] - im;
                data[lo] += re;
                data[lo+1] += im;
            }
        }
    }
}
For a 1024-point FFT, it takes 221 us.
Here is the code using intrinsics:
/*
 * Butterfly passes of the FFT, performed in place on `data`, with the
 * per-butterfly complex product formed by the _complex_mpysp intrinsic.
 *
 * data  - float buffer read and written in adjacent pairs
 *         (data[k], data[k+1]); treated as 2*nn floats.
 * nn    - number of float pairs; also scales the lookup-table index.
 *         NOTE(review): presumably a power of two -- confirm with callers.
 * isign - integer factor applied to the table entry stored in tw[0].
 *
 * Side effect: points the global fSintable at SINTAB.
 *
 * NOTE(review): _amemd8 is presumably TI's aligned 8-byte access; this
 * assumes both `tw` and every &data[hi] are 8-byte aligned -- confirm.
 */
void intrin_fft(float* data, int nn, int isign)
{
    int total = nn << 1;   /* number of floats addressed in data */
    int half;              /* float offset between the two legs of a butterfly */
    int step;              /* float stride between butterflies sharing one factor */
    int k;
    int lo;
    int hi;
    float tw[2];           /* factor pair staged in memory for the 8-byte load */
    double tw_pair;
    double in_pair;
    double prod;
    float add_re;
    float add_im;

    fSintable = (float*)&SINTAB;
    /* SINTAB defined in std_rtaf.h as volatile int* */

    /* `half` doubles after every pass until it reaches `total`. */
    for (half = 2; total > half; half = step)
    {
        step = 2*half;
        for (k = 0; k < half; k += 2)
        {
            /* Pack the two table values once per k; the inner loop reuses them. */
            tw[0] = isign * fSintable[nn/4 + ((k)/2 * nn/half)];
            tw[1] = fSintable[((k)/2 * nn/half)];
            tw_pair = _amemd8((void*)&tw[0]);

            for (lo = k; lo < total; lo += step)
            {
                hi = lo + half;

                /* Fetch data[hi] and data[hi+1] as one packed pair. */
                in_pair = _amemd8((void*)&data[hi]);
                prod = _complex_mpysp(tw_pair, in_pair);

                /*
                 * The negated high half is applied to the even slots and the
                 * low half to the odd slots.
                 * NOTE(review): the sign flip presumably compensates for the
                 * lane order the intrinsic expects -- verify against TI docs.
                 */
                add_re = -_hif(prod);
                add_im = _lof(prod);

                /* Upper leg gets the difference, lower leg the sum. */
                data[hi] = data[lo] - add_re;
                data[hi+1] = data[lo+1] - add_im;
                data[lo] += add_re;
                data[lo+1] += add_im;
            }
        }
    }
}
This takes 223 us.
I would like to know where I'm going wrong. I expected the intrinsics to give a boost.
Thanks,
Arun