Hello everyone
I plan to do the dot production for multiple vectors, so I modify the example function DSPF_sp_dotp_cplx. The following is the modified function. I want know that if I use MUST_ITERATE(48,,48) correctly, or not. I mean will the Outside FOR loop be unrolled for 48n times; Will the inside FOR loop be unrolled at the same time? If I want to make the inside FOR loop unrolled, how can I do? I am using Compiler Version 8.0.4, and Opt Level is set at 3.
Thanks
Xining Yu
void DSPF_sp_dotp_cplx_new(const float * x, const float * y, unsigned int nx, unsigned int ny,
float * restrict re, float * restrict im)
{
unsigned int i, j;
__float2_t x0_im_re, y0_im_re, result0 = 0;
__float2_t x1_im_re, y1_im_re, result1 = 0;
__float2_t x2_im_re, y2_im_re, result2 = 0;
__float2_t x3_im_re, y3_im_re, result3 = 0;
__float2_t result;
_nassert(nx % 4 == 0);
_nassert(nx > 0);
_nassert((int)x % 8 == 0);
_nassert((int)y % 8 == 0);
#pragma MUST_ITERATE(48,,48);
for (j = 0; j < nx; j += 48)
{
for(i = 0; i < 2 * ny; i += 8)
{
/* load 4 sets of input data */
x0_im_re = _amem8_f2((void*)&x[i+j]);
y0_im_re = _amem8_f2((void*)&y[i]);
x1_im_re = _amem8_f2((void*)&x[i+2+j]);
y1_im_re = _amem8_f2((void*)&y[i+2]);
x2_im_re = _amem8_f2((void*)&x[i+4+j]);
y2_im_re = _amem8_f2((void*)&y[i+4]);
x3_im_re = _amem8_f2((void*)&x[i+6+j]);
y3_im_re = _amem8_f2((void*)&y[i+6]);
/* calculate 4 running sums */
result0 = _daddsp(_complex_mpysp(x0_im_re, y0_im_re), result0);
result1 = _daddsp(_complex_mpysp(x1_im_re, y1_im_re), result1);
result2 = _daddsp(_complex_mpysp(x2_im_re, y2_im_re), result2);
result3 = _daddsp(_complex_mpysp(x3_im_re, y3_im_re), result3);
}
result = _daddsp(_daddsp(result0,result1),_daddsp(result2,result3));
result0 = 0; result1 = 0; result2 = 0; result3 = 0;
*re = -_hif2(result);
*im = _lof2(result);
re += 2;
im += 2;
}
}