This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

FFT implementation using C6678 intrinsics

Other Parts Discussed in Thread: BIOSLINUXMCSDK

I'm trying to use the complex multiply intrinsic to see whether it will improve the performance of a simple FFT.

Here is the base code without any intrinsics:

/*
 * Run the radix-2 butterfly passes of an FFT in place, using plain scalar
 * float arithmetic and a precomputed lookup table for the twiddle factors.
 *
 * data  : 2*nn floats, updated in place. Each (even, odd) index pair is
 *         treated as the (real, imaginary) parts of one complex sample.
 * nn    : number of complex samples. Not validated here; the index
 *         arithmetic below only stays inside 2*nn floats when nn is a
 *         power of two.
 * isign : integer factor applied to the table entry read at offset nn/4.
 *
 * Side effect: the file-scope pointer fSintable is pointed at SINTAB.
 *
 * NOTE(review): no bit-reversal reordering happens in this routine;
 * presumably the caller takes care of it -- confirm.
 * NOTE(review): isign scales the entry at offset nn/4 (used as the real
 * part of the twiddle), not the entry used as the imaginary part --
 * confirm this is the intended direction switch.
 */
void base_fft(float* data, int nn, int isign)
{
    const int total = nn << 1;   /* number of floats in data */
    int half;                    /* distance (in floats) between butterfly partners */
    int stride;                  /* distance (in floats) between successive butterflies */
    int k;                       /* float offset of the butterfly inside one group */
    int lo;                      /* index of the upper butterfly input */

    fSintable = (float*)&SINTAB; /* SINTAB defined in std_rtaf.h as volatile int* */

    for (half = 2; total > half; half = stride) {
        stride = 2 * half;

        for (k = 0; k < half; k += 2) {
            /* One twiddle factor is shared by every butterfly at offset k. */
            const int tw = (k / 2) * nn / half;
            const float w_im = fSintable[tw];
            const float w_re = isign * fSintable[nn / 4 + tw];

            for (lo = k; lo < total; lo += stride) {
                const int hi = lo + half;
                /* Complex product: twiddle * data[hi .. hi+1]. */
                const float prod_re = w_re * data[hi] - w_im * data[hi + 1];
                const float prod_im = w_re * data[hi + 1] + w_im * data[hi];

                /* Lower output first: it still needs the old upper input. */
                data[hi] = data[lo] - prod_re;
                data[hi + 1] = data[lo + 1] - prod_im;
                data[lo] += prod_re;
                data[lo + 1] += prod_im;
            }
        }
    }
}

For a 1024-point FFT, it takes 221 µs.

Here is the code using intrinsics:

/*
 * Run the radix-2 butterfly passes of an FFT in place, forming the
 * twiddle-by-sample product with the _complex_mpysp intrinsic instead of
 * four scalar multiplies.
 *
 * data  : 2*nn floats, updated in place; consecutive (even, odd) index
 *         pairs are loaded together as one 8-byte operand.
 * nn    : number of complex samples. Not validated here; the index
 *         arithmetic below only stays inside 2*nn floats when nn is a
 *         power of two.
 * isign : integer factor applied to the table entry read at offset nn/4.
 *
 * Side effect: the file-scope pointer fSintable is pointed at SINTAB.
 *
 * NOTE(review): _amemd8 is presumably the aligned 8-byte load intrinsic of
 * the TI C6000 compiler -- confirm that both `data` and the local twiddle
 * array are 8-byte aligned.
 * NOTE(review): no bit-reversal reordering happens in this routine;
 * presumably the caller takes care of it -- confirm.
 */
void intrin_fft(float* data, int nn, int isign) 
{
    const int total = nn << 1;   /* number of floats in data */
    int half;                    /* distance (in floats) between butterfly partners */
    int stride;                  /* distance (in floats) between successive butterflies */
    int k;                       /* float offset of the butterfly inside one group */
    int lo;                      /* index of the upper butterfly input */
    float twiddle[2];            /* table values staged for the packed load */
    double w_pair;               /* packed twiddle operand */
    double d_pair;               /* packed data operand */
    double prod;                 /* packed result of the complex multiply */

    fSintable = (float*)&SINTAB; /* SINTAB defined in std_rtaf.h as volatile int* */

    for (half = 2; total > half; half = stride) {
        stride = 2 * half;

        for (k = 0; k < half; k += 2) {
            const int tw = (k / 2) * nn / half;

            /* Stage both table values, then fetch them as one packed operand. */
            twiddle[0] = isign * fSintable[nn / 4 + tw];
            twiddle[1] = fSintable[tw];
            w_pair = _amemd8((void*)&twiddle[0]);

            for (lo = k; lo < total; lo += stride) {
                const int hi = lo + half;
                float t_re;
                float t_im;

                d_pair = _amemd8((void*)&data[hi]);
                prod = _complex_mpysp(w_pair, d_pair);

                /*
                 * The negated high half and the low half of the packed
                 * result are used as the real and imaginary parts of the
                 * product, respectively.
                 */
                t_re = -_hif(prod);
                t_im = _lof(prod);

                /* Lower output first: it still needs the old upper input. */
                data[hi] = data[lo] - t_re;
                data[hi + 1] = data[lo + 1] - t_im;
                data[lo] += t_re;
                data[lo + 1] += t_im;
            }
        }
    }
}
This takes 223us. 
I would like to know where I'm going wrong. I expected the intrinsics to give a boost.
Thanks,
Arun