Hello everyone !
I want to calculate FFT 8 complex points. I think it can achieve more effectively than DSPF_sp_fftSPxSP in DSPLIB.
Because W(1,8) and W(3,8) have same factor 0.707107. W(0,8) = 1, W(2, 8) = -j. Stage 1 and stage 2 don't need any multiplications.
Here is my C code.
#define R2D2 -0.707107 void FFT_8_Point(float * restrict x, float * restrict y) { float s00, s01, s10, s11, s20, s21, s30, s31, s40, s41, s50, s51, s60, s61, s70, s71; float sn00, sn01, sn10, sn11, sn20, sn21, sn30, sn31, sn40, sn41, sn50, sn51, sn60, sn61, sn70, sn71; float t50, t51, t70, t71; // state 1 //s0 = x[0] + x4; s00 = x[0] + x[8]; s10 = x[0] - x[8]; //s1 = x[0] - x4; s01 = x[1] + x[9]; s11 = x[1] - x[9]; //s4 = x[1] + x5; s40 = x[2] + x[10]; s50 = x[2] - x[10]; //s5 = x[1] - x5; s41 = x[3] + x[11]; s51 = x[3] - x[11]; //s2 = x[2] + x6; s20 = x[4] + x[12]; s30 = x[4] - x[12]; //s3 = x[2] - x6; s21 = x[5] + x[13]; s31 = x[5] - x[13]; //s6 = x[3] + x[7]; s60 = x[6] + x[14]; s70 = x[6] - x[14]; //s7 = x[3] - x7; s61 = x[7] + x[15]; s71 = x[7] - x[15]; // stage 2 sn00 = s00 + s20; sn20 = s00 - s20; sn01 = s01 + s21; sn21 = s01 - s21; sn10 = s10 + s31; sn30 = s10 - s31; sn11 = s11 - s30; sn31 = s11 + s30; sn40 = s40 + s60; sn60 = s40 - s60; sn41 = s41 + s61; sn61 = s41 - s61; sn50 = s50 + s71; sn70 = s50 - s71; sn51 = s51 - s70; sn71 = s51 + s70; // state 3 //t5 = w[1] * s5; t50 = -R2D2 *(sn50 + sn51); t51 = R2D2*(sn50 - sn51); //t7 = w[3] * s7; t70 = R2D2 *(sn70 - sn71); t71 = R2D2*(sn70 + sn71); //y[0] = s0 + t4; y[0] = sn00 + sn40; y[1] = sn01 + sn41; //y[1] = s1 + t5; y[2] = sn10 + t50; y[3] = sn11 + t51; //y[2] = s2 + t6; y[4] = sn20 + sn61; y[5] = sn21 - sn60; //y[3] = s3 + t7; y[6] = sn30 + t70; y[7] = sn31 + t71; //y[4] = s0 - t4; y[8] = sn00 - sn40; y[9] = sn01 - sn41; //y[5] = s1 - t5; y[10] = sn10 - t50; y[11] = sn11 - t51; //y[6] = s2 - t6; y[12] = sn20 - sn61;y[13] = sn21 + sn60; //y[7] = s3 - t7; y[14] = sn30 - t70; y[15] = sn31 - t71; }
Stage 1: 16 add. Stage 2 : 16 add. Stage 3: 20 add, 4 mult
Performance : my code is 135 cycles, DSPF_sp_fftSPxSP is 146.
I am only familiar with loop unroll, my FFT_8_Point is linear code. Can anyone suggest me some ideal to optimize my code ?
Thanks !