Hi,
I am developing for the C6678 using CGT 4.7.8 and am currently seeing suboptimal results for a software-pipelined loop.
The code is part of a rather large function and looks like:
int16_t* restrict posPtr = ...
uint16_t* restrict dstPtr = ...
int i;
for (i = 0; i < ....; i++) {
dstPtr[posPtr[i] + i] = 25;
}
The generated software-pipelined loop is unrolled two times; however, it includes two LDW instructions that load relative to the stack pointer. As soon as I remove the "+ i" part (shown in red in the original post), the LDWs go away.
As far as I can see, the values loaded aren't even used at all.
Any insight into what is going on here would be highly appreciated. Please find the resulting code at the end of the post.
I also wonder where the loop-carried dependency bound comes from, as I am only using restrict pointers.
Thank you in advance, Clemens
Listing of the generated code with the "+ i"; the mysterious LDWs are the two `LDW .D2T1 *+SP(...)` instructions in the kernel (highlighted in the original post):
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop found in file : C:/Users/EissererC/work_pfelib/imgStereo/PfeMinIdxOfColParab_16u_C1R/src_opt/PfeMinIdxOfColParab_16u_C1R_k_TMS320C6600.c
;* Loop source line : 232
;* Loop opening brace source line : 233
;* Loop closing brace source line : 243
;* Loop Unroll Multiple : 2x
;* Known Minimum Trip Count : 120
;* Known Maximum Trip Count : 120
;* Known Max Trip Count Factor : 120
;* Loop Carried Dependency Bound(^) : 2
;* Unpartitioned Resource Bound : 3
;* Partitioned Resource Bound(*) : 3
;* Resource Partition:
;* A-side B-side
;* .L units 0 0
;* .S units 0 0
;* .D units 3* 3*
;* .M units 0 0
;* .X cross paths 0 1
;* .T address paths 3* 1
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 0 0 (.L or .S unit)
;* Addition ops (.LSD) 2 2 (.L or .S or .D unit)
;* Bound(.L .S .LS) 0 0
;* Bound(.L .S .D .LS .LSD) 2 2
;*
;* Searching for software pipeline schedule at ...
;* ii = 3 Schedule found with 4 iterations in parallel
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* For further improvement on this loop, try option -mh28
;*
;* Minimum safe trip count : 1 (after unrolling)
;*----------------------------------------------------------------------------*
$C$L20: ; PIPED LOOP PROLOG
.dwpsn file "C:/Users/EissererC/work_pfelib/imgStereo/PfeMinIdxOfColParab_16u_C1R/src_opt/PfeMinIdxOfColParab_16u_C1R_k_TMS320C6600.c",line 232,column 0,is_stmt,isa 0
SPLOOPD 3 ;12 ; (P)
|| MV .L2 B4,B6
|| MVC .S2 B6,ILC
;** --------------------------------------------------------------------------*
$C$L21: ; PIPED LOOP KERNEL
$C$DW$L$PfeMinIdxOfColParab_16u_C1R_k$13$B:
.dwpsn file "C:/Users/EissererC/work_pfelib/imgStereo/PfeMinIdxOfColParab_16u_C1R/src_opt/PfeMinIdxOfColParab_16u_C1R_k_TMS320C6600.c",line 233,column 0,is_stmt,isa 0
LDH .D2T2 *B6++(4),B5 ; |238| (P) <0,0>
SPMASK L1,D2
|| LDW .D2T1 *+SP(576),A15
|| ADD .L1X 2,B4,A4
LDH .D1T1 *A4++(4),A3 ; |238| (P) <0,2>
NOP 1
SPMASK L1,L2,D2
|| ZERO .L2 B4
|| MV .L1 A26,A5 ; |225|
|| LDW .D2T1 *+SP(604),A8
ADDAH .D2 B4,B5,B5 ; |238| (P) <0,5>
SPMASK L1,S1
|| ZERO .L1 A6
|| MVK .S1 0x19,A7
|| ADD .L2X A5,B5,B5 ; |238| (P) <0,6>
ADDAH .D1 A6,A3,A3 ; |238| (P) <0,7>
|| ADD .L2 4,B4,B4 ; |232| (P) <0,7>
|| STH .D2T1 A7,*B5 ; |238| (P) <0,7> ^
ADD .L1 A5,A3,A3 ; |238| (P) <0,8>
.dwpsn file "C:/Users/EissererC/work_pfelib/imgStereo/PfeMinIdxOfColParab_16u_C1R/src_opt/PfeMinIdxOfColParab_16u_C1R_k_TMS320C6600.c",line 243,column 0,is_stmt,isa 0
SPKERNEL 0,0
|| ADD .L1 4,A6,A6 ; |232| <0,9> Define a twin register
|| STH .D1T1 A7,*+A3(2) ; |238| <0,9> ^
Listing of the generated code, without the "+ i":
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop found in file : C:/Users/EissererC/work_pfelib/imgStereo/PfeMinIdxOfColParab_16u_C1R/src_opt/PfeMinIdxOfColParab_16u_C1R_k_TMS320C6600.c
;* Loop source line : 232
;* Loop opening brace source line : 233
;* Loop closing brace source line : 243
;* Loop Unroll Multiple : 2x
;* Known Minimum Trip Count : 120
;* Known Maximum Trip Count : 120
;* Known Max Trip Count Factor : 120
;* Loop Carried Dependency Bound(^) : 2
;* Unpartitioned Resource Bound : 2
;* Partitioned Resource Bound(*) : 2
;* Resource Partition:
;* A-side B-side
;* .L units 0 0
;* .S units 0 0
;* .D units 2* 2*
;* .M units 0 0
;* .X cross paths 0 0
;* .T address paths 2* 2*
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 0 0 (.L or .S unit)
;* Addition ops (.LSD) 0 0 (.L or .S or .D unit)
;* Bound(.L .S .LS) 0 0
;* Bound(.L .S .D .LS .LSD) 1 1
;*
;* Searching for software pipeline schedule at ...
;* ii = 2 Schedule found with 4 iterations in parallel
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* For further improvement on this loop, try option -mh28
;*
;* Minimum safe trip count : 1 (after unrolling)
;*----------------------------------------------------------------------------*
$C$L20: ; PIPED LOOP PROLOG
.dwpsn file "C:/Users/EissererC/work_pfelib/imgStereo/PfeMinIdxOfColParab_16u_C1R/src_opt/PfeMinIdxOfColParab_16u_C1R_k_TMS320C6600.c",line 232,column 0,is_stmt,isa 0
SPLOOPD 2 ;8 ; (P)
|| LDW .D2T1 *+SP(600),A7
|| MV .L2 B4,B6
|| MVC .S2 B5,ILC
;** --------------------------------------------------------------------------*
$C$L21: ; PIPED LOOP KERNEL
$C$DW$L$PfeMinIdxOfColParab_16u_C1R_k$13$B:
.dwpsn file "C:/Users/EissererC/work_pfelib/imgStereo/PfeMinIdxOfColParab_16u_C1R/src_opt/PfeMinIdxOfColParab_16u_C1R_k_TMS320C6600.c",line 233,column 0,is_stmt,isa 0
SPMASK L1
|| ADD .L1X 2,B4,A4
|| LDH .D2T2 *B6++(4),B7 ; |238| (P) <0,0>
LDH .D1T1 *A4++(4),A3 ; |238| (P) <0,1>
NOP 1
SPMASK S2
|| MVK .S2 0x19,B5
SPMASK L2
|| MV .L2X A26,B4 ; |225|
SPMASK L1,S1
|| MV .L1 A26,A5 ; |225|
|| MV .S1X B5,A6
|| STH .D2T2 B5,*+B4[B7] ; |238| (P) <0,5> ^
.dwpsn file "C:/Users/EissererC/work_pfelib/imgStereo/PfeMinIdxOfColParab_16u_C1R/src_opt/PfeMinIdxOfColParab_16u_C1R_k_TMS320C6600.c",line 243,column 0,is_stmt,isa 0
SPKERNEL 0,0
|| STH .D1T1 A6,*+A5[A3] ; |238| <0,6> ^