Hi,
I'm using CCS v4.2.4, I wrote a short test code as linear assembly in a file test_pipe.sa
which simply implement : for (i=0; i<N; i++) p_u[i] = p_u[i] + p_v[i]
_test: .cproc p_u, N, p_v
.reg j, u, ref_u, break_flag
.no_mdep
MV N, j
loop: .trip 16
[j] SUB j, 1, j
LDH *+p_u[j], u
LDH *+p_v[j], ref_u
ADD u, ref_u, u
STH u, *+p_u[j]
[j] B loop
.endproc
After compiled with –debug_software_pipeline, the compiler tells me the Loop Carried Dependency Bound(^) : 0
Then after a slight modified of the code,
_test: .cproc p_u, N, p_v
.reg j, u, ref_u, break_flag
.no_mdep
MV N, j
loop: .trip 16
[j] SUB j, 1, j
LDH *+p_u[j], u
LDH *+p_v[j], ref_u
ADD u, ref_u, u
STH u, *+p_u[j]
ZERO break_flag
[!j] MVK 1, break_flag
[!break_flag] B loop
.endproc
The compiler tells me Loop Carried Dependency Bound(^) : 3
Which doesn't quite make any sense, can somebody tells me where the dependency is,
The attached file is the asm file produced by compiler with the second test code
;******************************************************************************
;* TMS320C6x C/C++ Codegen PC v7.2.3 *
;* Date/Time created: Sun Apr 15 15:16:40 2012 *
;******************************************************************************
.compiler_opts --abi=coffabi --c64p_l1d_workaround=default --endian=little --hll_source=linasm --long_precision_bits=40 --mem_model:code=near --mem_model:const=data --mem_model:data=far_aggregates --object_format=coff --silicon_version=6400 --symdebug:none
;******************************************************************************
;* GLOBAL FILE PARAMETERS *
;* *
;* Architecture : TMS320C64xx *
;* Optimization : Enabled at level 3 *
;* Optimizing for : Speed *
;* Based on options: -o3, no -ms *
;* Endian : Little *
;* Interrupt Thrshld : Disabled *
;* Data Access Model : Far Aggregate Data *
;* Pipelining : Enabled *
;* Speculate Loads : Disabled *
;* Memory Aliases : Presume not aliases (optimistic) *
;* Debug Info : No Debug Info *
;* *
;******************************************************************************
.asg A15, FP
.asg B14, DP
.asg B15, SP
.global $bss
.sect ".text"
.clink
;******************************************************************************
;* FUNCTION NAME: test *
;* *
;* Regs Modified : A0,A1,A2,A3,A4,A5,A6,B0,B1,B2,B4,B5,B6,B7,B8,B9,B16 *
;* Regs Used : A0,A1,A2,A3,A4,A5,A6,B0,B1,B2,B3,B4,B5,B6,B7,B8,B9, *
;* DP,SP,B16 *
;******************************************************************************
_test:
.map N/B4
.map N'/A0
.map ref_u/B6
.map break_flag/B0
.map j/A1
.map j$1/A0
.map j$2/B8
.map j$3/B9
.map j$4/B7
.map j$5/A2
.map j$6/A6
.map j$7/B4
.map u/A4
.map u'/A6
.map p_u/A3
.map p_u'/B5
.map p_u''/A4
.map p_v/A5
.map p_v'/A6
;** --------------------------------------------------------------------------*
; EXCLUSIVE CPU CYCLES: 7
;
; _test: .cproc p_u, N, p_v
; .reg j, u, ref_u, break_flag
; .no_mdep
; loop: .trip 16
MV .L1X N,N' ; |2|
MV .L1X N,j ; |2|
MV .L1X N,j$5 ; |2|
[ j$1] ADD .L1X 0xffffffff,N,j ; |7|
|| ZERO .L2 break_flag ; |13| (P) <0,2>
|| MVC .S2 CSR,B16
|| MV .S1 p_v',p_v ; |2|
MV .L1 j,j$1 ; |7|
|| LDH .D1T2 *+p_v[j],ref_u ; |10| (P) <0,4>
|| MVK .L2 0x1,B1
|| MV .S1 p_u'',p_u ; |2|
|| MV .S2X p_u'',p_u' ; |2|
|| MV .D2 N,j$2 ; |2|
[ j$1] ADD .L1 0xffffffff,j,j ; |7| (P) <1,1> ^
|| AND .L2 -2,B16,B4
|| LDH .D1T1 *+p_u[j],u ; |9| (P) <0,3>
|| MV .S1 j,j$5 ; |7|
|| [!j] MVK .S2 0x1,break_flag ; |15| (P) <0,3>
|| [ j$5] ADD .D2 0xffffffff,N,j$2 ; |7|
MV .L1 j,j$6 ; |7| (P) <1,2> ^ Split a long life(pre-sched)
|| ZERO .L2 break_flag ; |13| (P) <1,2>
|| MVC .S2 B4,CSR ; interrupts off
|| [ break_flag] ZERO .D2 B1 ; |15| (P) <0,5>
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop source line : 7
;* Loop closing brace source line : 16
;* Known Minimum Trip Count : 16
;* Known Max Trip Count Factor : 1
;* Loop Carried Dependency Bound(^) : 3
;* Unpartitioned Resource Bound : 3
;* Partitioned Resource Bound(*) : 3
;* Resource Partition:
;* A-side B-side
;* .L units 0 0
;* .S units 1 0
;* .D units 2 1
;* .M units 0 0
;* .X cross paths 1 0
;* .T address paths 2 1
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 0 0 (.L or .S unit)
;* Addition ops (.LSD) 6 8 (.L or .S or .D unit)
;* Bound(.L .S .LS) 1 0
;* Bound(.L .S .D .LS .LSD) 3* 3*
;*
;* Searching for software pipeline schedule at ...
;* ii = 3 Schedule found with 5 iterations in parallel
;*
;* Register Usage Table:
;* +-----------------------------------------------------------------+
;* |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;* |00000000001111111111222222222233|00000000001111111111222222222233|
;* |01234567890123456789012345678901|01234567890123456789012345678901|
;* |--------------------------------+--------------------------------|
;* 0: |** *** |*** **** |
;* 1: |****** |*** ***** |
;* 2: |** **** |*** ****** |
;* +-----------------------------------------------------------------+
;*
;* Done
;*
;* Collapsed epilog stages : 4
;* Prolog not removed
;* Collapsed prolog stages : 0
;*
;* Minimum required memory pad : 0 bytes
;*
;* For further improvement on this loop, try option -mh14
;*
;* Minimum safe trip count : 1
;* Min. prof. trip count (est.) : 3
;*
;* Mem bank conflicts/iter(est.) : { min 0.000, est 0.000, max 0.000 }
;* Mem bank perf. penalty (est.) : 0.0%
;*
;*
;* Total cycles (est.) : 11 + trip_cnt * 3
;*----------------------------------------------------------------------------*
;* SETUP CODE
;*
;* MVK 0x1,B1
;* MV A1,B8
;* MV A3,B5
;* MV B1,B2
;* MV A1,A0
;*
;* SINGLE SCHEDULED ITERATION
;*
;* $C$C29:
;* 0 NOP 1
;* 1 [ A0] ADD .S1 0xffffffff,A1,A1 ; |7| ^
;* 2 MV .L1 A0,A2 ; |7| Split a long life(pre-sched)
;* || MV .D1 A1,A0 ; |7| ^ Split a long life(pre-sched)
;* || ZERO .S2 B0 ; |13|
;* 3 [ A2] ADD .S2 0xffffffff,B8,B8 ; |7| Define a twin register
;* || [ B1] LDH .D1T1 *+A3[A1],A4 ; |9|
;* || [!A1] MVK .D2 0x1,B0 ; |15|
;* 4 [ B1] LDH .D1T2 *+A5[A1],B6 ; |10|
;* 5 [ B0] ZERO .L2 B1 ; |15|
;* 6 MV .L2 B8,B9 ; |7| Split a long life(pre-sched)
;* 7 MV .S2 B9,B7 ; |7| Split a long life(pre-sched)
;* 8 MV .D2 B1,B4 ; |15| Split a long life(pre-sched)
;* || [ B1] B .S1 $C$C29 ; |16|
;* 9 ADD .L1X A4,B6,A6 ; |11|
;* 10 [ B2] STH .D2T1 A6,*+B5[B7] ; |12|
;* || MV .L2 B4,B2 ; |15| Split a long life(pre-sched)
;* 11 NOP 3
;* 14 ; BRANCHCC OCCURS {$C$C29} ; |16|
;*----------------------------------------------------------------------------*
$C$L1: ; PIPED LOOP PROLOG
; EXCLUSIVE CPU CYCLES: 5
MV .L2 j$2,j$7 ; |7| (P) <0,6> Split a long life(pre-sched)
|| [ j$5] ADD .S2 0xffffffff,j$2,j$2 ; |7| (P) <1,3> Define a twin register
|| [ B1] LDH .D1T1 *+p_u[j],u ; |9| (P) <1,3>
|| [!j] MVK .D2 0x1,break_flag ; |15| (P) <1,3>
|| MV .L1 j$6,j$1 ; |7| (P) <2,0> ^ Copy to predicate register
MVK .L2 0x1,B2
|| MV .S2 j$7,j$4 ; |7| (P) <0,7> Split a long life(pre-sched)
|| [ B1] LDH .D1T2 *+p_v[j],ref_u ; |10| (P) <1,4>
|| MV .L1 j$1,j$6 ; |7| (P) <2,1> Split a long life(pre-sched)
|| [ j$1] ADD .S1 0xffffffff,j,j ; |7| (P) <2,1> ^
[ B1] B .S1 $C$L2 ; |16| (P) <0,8>
|| MV .L2 B1,B4 ; |15| (P) <0,8> Split a long life(pre-sched)
|| [ break_flag] ZERO .S2 B1 ; |15| (P) <1,5>
|| ZERO .D2 break_flag ; |13| (P) <2,2>
|| MV .L1 j$6,j$5 ; |7| (P) <2,2> Split a long life(pre-sched)
|| MV .D1 j,j$6 ; |7| (P) <2,2> ^ Split a long life(pre-sched)
ADD .L1X u,ref_u,u' ; |11| (P) <0,9>
|| MV .L2 j$2,j$3 ; |7| (P) <1,6> Split a long life(pre-sched)
|| [ j$5] ADD .S2 0xffffffff,j$2,j$2 ; |7| (P) <2,3> Define a twin register
|| [ B1] LDH .D1T1 *+p_u[j],u ; |9| (P) <2,3>
|| [!j] MVK .D2 0x1,break_flag ; |15| (P) <2,3>
|| MV .S1 j$6,j$1 ; |7| (P) <3,0> ^ Copy to predicate register
MV .L2 B4,B2 ; |15| (P) <0,10> Split a long life(pre-sched)
|| [ B2] STH .D2T1 u',*+p_u'[j$4] ; |12| (P) <0,10>
|| MV .S2 j$3,j$4 ; |7| (P) <1,7> Split a long life(pre-sched)
|| [ B1] LDH .D1T2 *+p_v[j],ref_u ; |10| (P) <2,4>
|| [ j$1] ADD .S1 0xffffffff,j,j ; |7| (P) <3,1> ^
;** --------------------------------------------------------------------------*
$C$L2: ; PIPED LOOP KERNEL
; EXCLUSIVE CPU CYCLES: 3
[ B1] B .S1 $C$L2 ; |16| <1,8>
|| MV .D2 B1,B4 ; |15| <1,8> Split a long life(pre-sched)
|| [ break_flag] ZERO .L2 B1 ; |15| <2,5>
|| ZERO .S2 break_flag ; |13| <3,2>
|| MV .L1 j$1,j$5 ; |7| <3,2> Split a long life(pre-sched)
|| MV .D1 j,j$1 ; |7| <3,2> ^ Split a long life(pre-sched)
ADD .L1X u,ref_u,u' ; |11| <1,9>
|| MV .L2 j$2,j$3 ; |7| <2,6> Split a long life(pre-sched)
|| [ j$5] ADD .S2 0xffffffff,j$2,j$2 ; |7| <3,3> Define a twin register
|| [ B1] LDH .D1T1 *+p_u[j],u ; |9| <3,3>
|| [!j] MVK .D2 0x1,break_flag ; |15| <3,3>
MV .L2 B4,B2 ; |15| <1,10> Split a long life(pre-sched)
|| [ B2] STH .D2T1 u',*+p_u'[j$4] ; |12| <1,10>
|| MV .S2 j$3,j$4 ; |7| <2,7> Split a long life(pre-sched)
|| [ B1] LDH .D1T2 *+p_v[j],ref_u ; |10| <3,4>
|| [ j$1] ADD .S1 0xffffffff,j,j ; |7| <4,1> ^
;** --------------------------------------------------------------------------*
$C$L3: ; PIPED LOOP EPILOG
;** --------------------------------------------------------------------------*
; EXCLUSIVE CPU CYCLES: 6
RETNOP .S2 B3,4 ; |18|
MVC .S2 B16,CSR ; interrupts on
; BRANCH OCCURS {B3} ; |18|
.clearmap
; .endproc
;******************************************************************************
;* BUILD ATTRIBUTES *
;******************************************************************************
.battr "TI", Tag_File, 1, Tag_ABI_stack_align_needed(0)
.battr "TI", Tag_File, 1, Tag_ABI_stack_align_preserved(0)
Thanks
Shi Tianqi