Hi all,
Just looking for some advice on writing good code to be pipelined.
I have the following function, which performs IIR filtering on 4 channels of data. The temp array holds the output for the 4 channels, the ten_HZ_high_pass array holds the filter kernel, and the last_hp_chN arrays (last_hp_ch1 to last_hp_ch4) each hold the current sample, the 2 previous samples and the 2 previous filtered samples for one channel.
#pragma MUST_ITERATE(6,6,6); #pragma UNROLL(2); for(i = 0; i < 6; i++) { temp[0] += (ten_HZ_high_pass[i] * last_hp_ch1[i]); temp[1] += (ten_HZ_high_pass[i] * last_hp_ch2[i]); temp[2] += (ten_HZ_high_pass[i] * last_hp_ch3[i]); temp[3] += (ten_HZ_high_pass[i] * last_hp_ch4[i]); }
At the moment this is taking 828 CPU cycles, which seems a bit high to me. Is it?
Here is my assembly output for the loop:
;*----------------------------------------------------------------------------* ;* SOFTWARE PIPELINE INFORMATION ;* ;* Loop found in file : ../filters.c ;* Loop source line : 351 ;* Loop opening brace source line : 352 ;* Loop closing brace source line : 361 ;* Loop Unroll Multiple : 2x ;* Known Minimum Trip Count : 3 ;* Known Maximum Trip Count : 3 ;* Known Max Trip Count Factor : 3 ;* Loop Carried Dependency Bound(^) : 9 ;* Unpartitioned Resource Bound : 5 ;* Partitioned Resource Bound(*) : 5 ;* Resource Partition: ;* A-side B-side ;* .L units 0 0 ;* .S units 0 0 ;* .D units 5* 4 ;* .M units 3 5* ;* .X cross paths 5* 3 ;* .T address paths 5* 4 ;* Long read paths 0 0 ;* Long write paths 0 0 ;* Logical ops (.LS) 4 4 (.L or .S unit) ;* Addition ops (.LSD) 3 2 (.L or .S or .D unit) ;* Bound(.L .S .LS) 2 2 ;* Bound(.L .S .D .LS .LSD) 4 4 ;* ;* Searching for software pipeline schedule at ... ;* ii = 9 Schedule found with 3 iterations in parallel ;* Done ;* ;* Loop will be splooped ;* Collapsed epilog stages : 0 ;* Collapsed prolog stages : 0 ;* Minimum required memory pad : 0 bytes ;* ;* Minimum safe trip count : 1 (after unrolling) ;*----------------------------------------------------------------------------* $C$L55: ; PIPED LOOP PROLOG .dwpsn file "../filters.c",line 351,column 0,is_stmt SPLOOPD 9 ;27 ; (P) || MVC .S2 B5,ILC || STW .D2T2 B20,*+DP(_start_time+4) ; |346| ;** --------------------------------------------------------------------------* $C$L56: ; PIPED LOOP KERNEL $C$DW$L$_FLT_High_pass_filter$3$B: .dwpsn file "../filters.c",line 352,column 0,is_stmt SPMASK L2 || MV .L2 B22,B16 || LDDW .D2T2 *B9++,B19:B18 ; |353| (P) <0,0> SPMASK L1 || MV .L1 A18,A5 || LDW .D2T2 *B16++(8),B17 ; |353| (P) <0,1> SPMASK L1 || MV .L1X B21,A8 || LDW .D1T1 *A5++(8),A16 ; |355| (P) <0,2> SPMASK L1 || MV .L1 A20,A9 || LDW .D1T1 *A8++(8),A16 ; |357| (P) <0,3> SPMASK L2 || ADD .L2 4,B22,B7 || LDW .D1T1 *A9++(8),A16 ; |359| (P) <0,4> SPMASK L2 || ADD .L2X 4,A18,B8 || LDW .D2T2 
*B7++(8),B4 ; |353| (P) <0,5> SPMASK L1 || ADD .L1X 4,B21,A6 || MPYSP .M2 B18,B17,B17 ; |353| (P) <0,6> || LDW .D2T2 *B8++(8),B4 ; |355| (P) <0,6> SPMASK L1 || ADD .L1 4,A20,A7 || MPYSP .M2X B18,A16,B18 ; |355| (P) <0,7> || LDW .D1T1 *A6++(8),A16 ; |357| (P) <0,7> SPMASK L1 || ZERO .L1 A4 ; |321| || MPYSP .M1X B18,A16,A17 ; |357| (P) <0,8> || LDW .D1T1 *A7++(8),A4 ; |359| (P) <0,8> MPYSP .M2X B18,A16,B4 ; |359| (P) <0,9> SPMASK L2 || ZERO .L2 B5 ; |321| || MV .S1X B4,A16 ; |353| (P) <0,10> Define a twin register ADDSP .L2 B18,B5,B4 ; |355| (P) <0,11> ^ || MPYSP .M1X B19,A16,A4 ; |353| (P) <0,11> || MPYSP .M2 B19,B4,B5 ; |355| (P) <0,11> SPMASK S1,L2 || ZERO .L2 B6 ; |321| || ZERO .S1 A3 ; |321| || ADDSP .L1 A17,A4,A17 ; |357| (P) <0,12> ^ || MPYSP .M1X B19,A16,A3 ; |357| (P) <0,12> ADDSP .L1X B17,A3,A3 ; |353| (P) <0,13> ^ || MPYSP .M2X B19,A4,B4 ; |359| (P) <0,13> || ADDSP .L2 B4,B6,B5 ; |359| (P) <0,13> ^ NOP 1 ADDSP .L2 B5,B4,B5 ; |355| (P) <0,15> ^ ADDSP .L1 A3,A17,A4 ; |357| (P) <0,16> ^ ADDSP .L1 A4,A3,A3 ; |353| (P) <0,17> ^ || ADDSP .L2 B4,B5,B6 ; |359| (P) <0,17> ^ .dwpsn file "../filters.c",line 361,column 0,is_stmt SPKERNEL 1,1 $C$DW$L$_FLT_High_pass_filter$3$E: ;** --------------------------------------------------------------------------* $C$L57: ; PIPED LOOP EPILOG NOP 2 MVC .S2 TSCL,B7 ; |363| || MV .L1X B5,A5 ;** --------------------------------------------------------------------------*
I can see from this that the channel arrays are only being loaded with LDW and not LDDW (the only LDDW is the one that loads the filter coefficients), but I cannot see why it can't use LDDW for them as well. Floats are 32 bits and the registers are 32 bits, so why can't it load two consecutive elements of an array into two consecutive registers?