I have the following loop working on restricted pointers, where source and dest are both in L1D SRAM and rows * cols is a multiple of 16. When I compile with optimization, the compiler attempts to unroll this loop 6x (I thought I had already unrolled it =} ), and upon simulation with the evmdm6437 simulator I get a huge number of write-buffer-full stalls. So I have a few questions:
1) Does L1D use the write buffer to write to itself?
2) If so, why is the compiler doing such a big unroll if the bandwidth is impossible to realize? If not, why is the simulator using it?
3) How do I get the compiler to do the "appropriate" level of optimization for this loop?
/*
 * Byte-wise threshold of a buffer: every source byte is compared (unsigned)
 * against the matching byte lane of thresh_reg, and the destination receives
 * a mask byte per source byte -- all ones where source > threshold, zero
 * otherwise (per TI's definitions of _cmpgtu4 / _xpnd4).
 *
 * Each iteration performs two 64-bit loads and two 64-bit stores, advancing
 * source and dest by 8 after each one.
 * NOTE(review): source/dest are presumably byte pointers, so one iteration
 * covers 16 bytes, which matches the ">> 4" trip count -- confirm against
 * their declarations (not visible here).
 * NOTE(review): thresh_reg presumably holds the threshold replicated into
 * all four byte lanes -- confirm where it is initialised.
 * NOTE(review): the hex digits in the variable names appear to label the 16
 * bytes handled per iteration; the first load is named for bytes 8..f and
 * the second for 0..7. Because the operation is lane-independent this
 * labelling does not affect the result.
 *
 * _amem8 is TI's aligned 8-byte access intrinsic, so both pointers must be
 * 8-byte aligned on every access.
 *
 * Precedence: ">>" binds tighter than "<", so the condition is
 * i < ((NUM_ROWS * NUM_COLS) >> 4).
 *
 * NOTE(review): if the compiler's additional unrolling is unwanted, TI
 * documents "#pragma UNROLL(1)" and "#pragma MUST_ITERATE(...)" (placed
 * immediately before the loop) as the controls -- verify against the
 * compiler version in use.
 */
for ( i = 0; i < (NUM_ROWS * NUM_COLS) >> 4; i++ )
{
/* ---- first 8 bytes of this iteration ---- */
/* Aligned 64-bit load from the current source position. */
src_f_e_d_c_b_a_9_8 = _amem8( source );
source += 8;
/* Upper 32 bits: 4-bit compare result expanded to 0xFF/0x00 per byte. */
src_f_e_d_c = _hill( src_f_e_d_c_b_a_9_8 );
dst_f_e_d_c = _xpnd4( _cmpgtu4( src_f_e_d_c, thresh_reg ) );
/* Lower 32 bits: same compare-and-expand. */
src_b_a_9_8 = _loll( src_f_e_d_c_b_a_9_8 );
dst_b_a_9_8 = _xpnd4( _cmpgtu4( src_b_a_9_8, thresh_reg ) );
/* Rejoin (high word, low word) in the same order they were split. */
dst_f_e_d_c_b_a_9_8 = _itoll( dst_f_e_d_c, dst_b_a_9_8 );
/* Aligned 64-bit store of the mask bytes. */
_amem8( dest ) = dst_f_e_d_c_b_a_9_8;
dest += 8;
/* ---- second 8 bytes of this iteration (same sequence) ---- */
src_7_6_5_4_3_2_1_0 = _amem8( source );
source += 8;
src_7_6_5_4 = _hill( src_7_6_5_4_3_2_1_0 );
dst_7_6_5_4 = _xpnd4( _cmpgtu4( src_7_6_5_4, thresh_reg ) );
src_3_2_1_0 = _loll( src_7_6_5_4_3_2_1_0 );
dst_3_2_1_0 = _xpnd4( _cmpgtu4( src_3_2_1_0, thresh_reg ) );
dst_7_6_5_4_3_2_1_0 = _itoll( dst_7_6_5_4, dst_3_2_1_0 );
_amem8( dest ) = dst_7_6_5_4_3_2_1_0;
dest += 8;
}