Hello everybody,
I'm having trouble engaging the parallel (dual) MAC unit on my C5517 DSP. Here is the C code of my fast multiplication (it's used in an IIR filter):
**************************************************************************************************************************************************
/*
 * uber_mult - run two multiply-accumulate chains ("a" and "b") against the
 * same two-word input.
 *
 * Words read: ca[0], ca[1], cb[0], cb[1], x[0], x[1].
 * For each coefficient pair c (ca, then cb):
 *   1. accumulate the cross products c[0]*x[1] and x[0]*c[1] with the
 *      _smacsui / _llsmacsui intrinsics, starting from a zero accumulator;
 *   2. shift that sum with _llsshl(..., -15);
 *   3. use the shifted value as the accumulator of a final _smac(c[0], x[0]).
 *
 * ca : first coefficient pair; its result is stored to *r
 * cb : second coefficient pair; its result is stored to *s
 * x  : input pair shared by both chains
 * s  : receives the "b" result
 * r  : receives the "a" result
 *
 * NOTE(review): presumably index 0 is the high word and index 1 the unsigned
 * low word of a double-width fixed-point value, and the -15 count is a right
 * shift that realigns the cross terms - confirm against the compiler's
 * intrinsic reference and the callers.
 */
inline void uber_mult(onchip Q15 *ca, onchip Q15 *cb, onchip Q15 *x, t_Audio * restrict s, t_Audio * restrict r){
    /* First cross term of each chain: c[0] * x[1], accumulated onto 0. */
    int40_t cross_a = _smacsui(0, ca[0], x[1]);
    int40_t cross_b = _smacsui(0, cb[0], x[1]);

    /* Second cross term added on top: x[0] * c[1]. */
    int40_t sum_a = _llsmacsui(cross_a, x[0], ca[1]);
    int40_t sum_b = _llsmacsui(cross_b, x[0], cb[1]);

    /* Shift the cross-term sums by -15 before the final accumulate. */
    int40_t scaled_a = _llsshl(sum_a, -15);
    int40_t scaled_b = _llsshl(sum_b, -15);

    /* Final term c[0] * x[0]; "a" goes to *r first, then "b" to *s. */
    *r = _smac(scaled_a, ca[0], x[0]);
    *s = _smac(scaled_b, cb[0], x[0]);
}
********************************************************************************************************************************
And here is the assembly generated by the optimizer (from the corresponding .asm file):
********************************************************************************************************************************
$C$DW$L$_alg_fpIIR_perform$4$B:
;** 560 ----------------------- p1a = _smacsui(0L, *(onchip int *)ca, (unsigned)*((int *)data+1)); // [7]
;** 561 ----------------------- p1b = _smacsui(0L, *(onchip int *)cb, (unsigned)*((int *)data+1)); // [7]
;** 563 ----------------------- p2a = _llsmacsui(p1a, *(int *)data, (unsigned)*((onchip int *)ca+1)); // [7]
;** 564 ----------------------- p2b = _llsmacsui(p1b, *(int *)data, (unsigned)*((onchip int *)cb+1)); // [7]
;** 172 ----------------------- y = _lsadd(y, _smac((long)_llsshl(p2b, (-15)), *(onchip int *)cb, *(int *)data));
;** 173 ----------------------- y$60 = _lsadd(k, _smac((long)_llsshl(p2a, (-15)), *(onchip int *)ca, *(int *)data));
;** 173 ----------------------- k = y$60;
;** 174 ----------------------- if ( (--data) >= delay_base ) goto g6;
;** 174 ----------------------- data += 11;
;** -----------------------g6:
;** 170 ----------------------- ++cb;
;** 170 ----------------------- ++ca;
;** 165 ----------------------- if ( --L$2 != -1 ) goto g12;
MOV *AR4(short(#1)) << #16, AC0
BCLR ST1_FRCT
|| MOV *AR4(short(#1)) << #16, AC2
SFTL AC2, #0, AC2 ; |561|
|| MOV #0, AC1 ; |560|
AMAR *AR4, XCDP
|| SFTL AC0, #0, AC0 ; |560|
MOV #0, AC0 ; |561|
|| MACM *AR2, AC0, AC1 ; |560|
MACM *AR3, AC2, AC0 ; |561|
MOV *AR2(short(#1)) << #16, AC2
BSET ST1_M40
|| SFTL AC2, #0, AC2 ; |563|
MACM *AR4, AC2, AC1 ; |563|
MOV *AR3(short(#1)) << #16, AC2
|| BCLR ST1_M40
BSET ST1_M40
|| SFTL AC2, #0, AC2 ; |564|
MACM *AR4, AC2, AC0 ; |564|
CMPU AR4 >= T2, TC1 ; |174|
|| ASUB #2, AR4 ; |174|
SFTS AC0, #-15, AC0 ; |172|
|| BSET ST1_FRCT
XCC !TC1 ||
AADD #22, AR4 ; |174|
BCLR ST1_M40
|| SFTS AC1, #-15, AC2 ; |173|
MAC *AR3, *CDP, AC0 :: MAC *AR2, *CDP, AC2
ADD dbl(*SP(#4)), AC0, AC0 ; |172|
MOV AC0, dbl(*SP(#4)) ; |172|
ADD dbl(*SP(#0)), AC2, AC0 ; |173|
|| AADD #2, AR3 ; |170|
.dwpsn file "../src/DSP/decimation_filter.c",line 176,column 0,is_stmt
MOV AC0, dbl(*SP(#0)) ; |173|
|| AADD #2, AR2 ; |170|
**********************************************************************************************************************************************************************
As you can see, there is only one MAC :: MAC operation (which most likely corresponds to the two consecutive _smac calls in the C code).
The rest doesn't look very well optimized as far as MAC units are concerned. There are some MACM operations that are followed by a MOV, but there is no "::" in between, so they are not considered a MACM::MOV parallelized operation.
Even then, moving data back to memory (MOV) doesn't seem like the best thing to do for speed; I would have preferred the whole function to be completed using only registers.
Does anybody understand what I'm doing wrong?
Thank you very much for your attention.