Hello
I have tried to write a simple function that adds two vectors with saturation and stores the result in a third vector (because the current C version wasn't fast enough). So far the attached code performs at ~0.5 cycles/element, but I have a feeling it could be faster. Could you suggest something? I have compiled it with -mh24 and -O3 and have no further ideas. The DSP is a c6400.
; Saturating unsigned 8-bit vector add, written in C6000 linear assembly.
; Arguments, in C argument order: first source pointer, second source
; pointer, element count, destination pointer.
; Every loop pass handles one 8-byte group: two double-word loads, two
; packed saturating adds (high and low word) and one double-word store.
        .sect   ".text:_func"
        .global _func
_func:  .cproc  src_a, src_b, remaining, dst
        .reg    a_hi:a_lo, b_hi:b_lo, sum_hi:sum_lo
        .no_mdep                                ; memory references in this procedure are declared independent
        shr     remaining, 3, remaining         ; element count -> number of 8-byte groups
sat_loop: .trip 2, 0x20000000, 2                ; trip count: min 2, max 0x20000000, multiple of 2
        lddw    *src_a++, a_hi:a_lo             ; next 8 bytes of the first input
        lddw    *src_b++, b_hi:b_lo             ; next 8 bytes of the second input
        saddu4  a_hi, b_hi, sum_hi              ; upper four bytes, saturating add
        saddu4  a_lo, b_lo, sum_lo              ; lower four bytes, saturating add
        stdw    sum_hi:sum_lo, *dst++           ; write the 8 result bytes
 [remaining] sub remaining, 1, remaining        ; count down only while non-zero
 [remaining] b   sat_loop                       ; repeat while groups remain
        .return
        .endproc
Here is the C++ equivalent. However, it runs at only ~4 cycles/element, which is really slow. Something is definitely wrong here. Could you suggest any modifications here as well?
inline void IntrinsicSADD
(
const uint8 * restrict inVector1,
const uint8 * restrict inVector2,
const int inLength,
uint8 * restrict outVector
)
{
for(int i = 0; i < inLength; ++i)
{
uint32 out_hi = _saddu4( _hi( _amemd8_const( &inVector1[i] ) ), _hi( _amemd8_const( &inVector2[i] ) ) );
uint32 out_lo = _saddu4( _lo( _amemd8_const( &inVector1[i] ) ), _lo( _amemd8_const( &inVector2[i] ) ) );
_amemd8( &outVector[i] ) = _itod( out_hi, out_lo ); \
}
}