Hi,
This is the computationally intensive part of my code. Does anyone have any suggestions on how to optimize it?
delay_idx = (start_pulse_idx-BUFSIZE-delay[0]+max_shift-N_taps)%LONG_BUFSIZE;
// Reset delay_idx
for( idx_samp = 0; idx_samp < BUFSIZE ; idx_samp++) ///BUFSIZE-N_taps
{
y_ll[0] = _dinthsp(p_data[0][idx_samp]);
// Set tap_delay_idx
tap_delay_idx = delay_idx;
// Clear y
y_est_ll[0] = 0;
for(idx_tap = 0; idx_tap < 12; idx_tap++) //N_taps*2
{
// Calculate y_est
y_est_ll[0] = _daddsp(y_est_ll[0],_complex_mpysp(x_ll_shared_data[0][tap_delay_idx],h_est_ll[idx_tap]));
tap_delay_idx++;
tap_delay_idx = ( tap_delay_idx >= LONG_BUFSIZE ) ? LONG_BUFSIZE - tap_delay_idx : tap_delay_idx;
}
delay_idx++;
delay_idx = ( delay_idx >= LONG_BUFSIZE ) ? LONG_BUFSIZE - delay_idx : delay_idx;
// Write output values
x_ll_data[0][idx_samp] = _dsubsp(y_ll[0],y_est_ll[0]);
}
Here the *_ll variables are of type __float2_t, p_data is a volatile Uint32*, LONG_BUFSIZE = 10*BUFSIZE, BUFSIZE = 15360, and the iterators are global Uint32s.