optimising pipelined loop

Graham Long

Hi all,

Just looking for some advice on writing good code to be pipelined.

I have to following function which performs iir filtering on 4 channels of data, the temp array holds the output for the 4 channels, ten_HZ_high_pass array holds the filter kernel and last_hp_chn arrays hold the current sample, 2 previous samples and 2 previous filtered samples.

#pragma MUST_ITERATE(6,6,6);
#pragma UNROLL(2);
for(i = 0; i < 6; i++)
{
	temp[0] += (ten_HZ_high_pass[i] * last_hp_ch1[i]);
	temp[1] += (ten_HZ_high_pass[i] * last_hp_ch2[i]);
	temp[2] += (ten_HZ_high_pass[i] * last_hp_ch3[i]);
	temp[3] += (ten_HZ_high_pass[i] * last_hp_ch4[i]);
}

at the moment this is taking 828 cpu cycles, this seems a bit high to me? is it?

here is my assembly output for the loop

;*----------------------------------------------------------------------------*
;*   SOFTWARE PIPELINE INFORMATION
;*
;*      Loop found in file               : ../filters.c
;*      Loop source line                 : 351
;*      Loop opening brace source line   : 352
;*      Loop closing brace source line   : 361
;*      Loop Unroll Multiple             : 2x
;*      Known Minimum Trip Count         : 3                    
;*      Known Maximum Trip Count         : 3                    
;*      Known Max Trip Count Factor      : 3
;*      Loop Carried Dependency Bound(^) : 9
;*      Unpartitioned Resource Bound     : 5
;*      Partitioned Resource Bound(*)    : 5
;*      Resource Partition:
;*                                A-side   B-side
;*      .L units                     0        0     
;*      .S units                     0        0     
;*      .D units                     5*       4     
;*      .M units                     3        5*    
;*      .X cross paths               5*       3     
;*      .T address paths             5*       4     
;*      Long read paths              0        0     
;*      Long write paths             0        0     
;*      Logical  ops (.LS)           4        4     (.L or .S unit)
;*      Addition ops (.LSD)          3        2     (.L or .S or .D unit)
;*      Bound(.L .S .LS)             2        2     
;*      Bound(.L .S .D .LS .LSD)     4        4     
;*
;*      Searching for software pipeline schedule at ...
;*         ii = 9  Schedule found with 3 iterations in parallel
;*      Done
;*
;*      Loop will be splooped
;*      Collapsed epilog stages       : 0
;*      Collapsed prolog stages       : 0
;*      Minimum required memory pad   : 0 bytes
;*
;*      Minimum safe trip count       : 1 (after unrolling)
;*----------------------------------------------------------------------------*
$C$L55:    ; PIPED LOOP PROLOG
	.dwpsn	file "../filters.c",line 351,column 0,is_stmt

           SPLOOPD 9       ;27               ; (P) 
||         MVC     .S2     B5,ILC
||         STW     .D2T2   B20,*+DP(_start_time+4) ; |346| 

;** --------------------------------------------------------------------------*
$C$L56:    ; PIPED LOOP KERNEL
$C$DW$L$_FLT_High_pass_filter$3$B:
	.dwpsn	file "../filters.c",line 352,column 0,is_stmt

           SPMASK          L2
||         MV      .L2     B22,B16
||         LDDW    .D2T2   *B9++,B19:B18     ; |353| (P) <0,0> 

           SPMASK          L1
||         MV      .L1     A18,A5
||         LDW     .D2T2   *B16++(8),B17     ; |353| (P) <0,1> 

           SPMASK          L1
||         MV      .L1X    B21,A8
||         LDW     .D1T1   *A5++(8),A16      ; |355| (P) <0,2> 

           SPMASK          L1
||         MV      .L1     A20,A9
||         LDW     .D1T1   *A8++(8),A16      ; |357| (P) <0,3> 

           SPMASK          L2
||         ADD     .L2     4,B22,B7
||         LDW     .D1T1   *A9++(8),A16      ; |359| (P) <0,4> 

           SPMASK          L2
||         ADD     .L2X    4,A18,B8
||         LDW     .D2T2   *B7++(8),B4       ; |353| (P) <0,5> 

           SPMASK          L1
||         ADD     .L1X    4,B21,A6
||         MPYSP   .M2     B18,B17,B17       ; |353| (P) <0,6> 
||         LDW     .D2T2   *B8++(8),B4       ; |355| (P) <0,6> 

           SPMASK          L1
||         ADD     .L1     4,A20,A7
||         MPYSP   .M2X    B18,A16,B18       ; |355| (P) <0,7> 
||         LDW     .D1T1   *A6++(8),A16      ; |357| (P) <0,7> 

           SPMASK          L1
||         ZERO    .L1     A4                ; |321| 
||         MPYSP   .M1X    B18,A16,A17       ; |357| (P) <0,8> 
||         LDW     .D1T1   *A7++(8),A4       ; |359| (P) <0,8> 

           MPYSP   .M2X    B18,A16,B4        ; |359| (P) <0,9> 

           SPMASK          L2
||         ZERO    .L2     B5                ; |321| 
||         MV      .S1X    B4,A16            ; |353| (P) <0,10> Define a twin register

           ADDSP   .L2     B18,B5,B4         ; |355| (P) <0,11>  ^ 
||         MPYSP   .M1X    B19,A16,A4        ; |353| (P) <0,11> 
||         MPYSP   .M2     B19,B4,B5         ; |355| (P) <0,11> 

           SPMASK          S1,L2
||         ZERO    .L2     B6                ; |321| 
||         ZERO    .S1     A3                ; |321| 
||         ADDSP   .L1     A17,A4,A17        ; |357| (P) <0,12>  ^ 
||         MPYSP   .M1X    B19,A16,A3        ; |357| (P) <0,12> 

           ADDSP   .L1X    B17,A3,A3         ; |353| (P) <0,13>  ^ 
||         MPYSP   .M2X    B19,A4,B4         ; |359| (P) <0,13> 
||         ADDSP   .L2     B4,B6,B5          ; |359| (P) <0,13>  ^ 

           NOP             1
           ADDSP   .L2     B5,B4,B5          ; |355| (P) <0,15>  ^ 
           ADDSP   .L1     A3,A17,A4         ; |357| (P) <0,16>  ^ 

           ADDSP   .L1     A4,A3,A3          ; |353| (P) <0,17>  ^ 
||         ADDSP   .L2     B4,B5,B6          ; |359| (P) <0,17>  ^ 

	.dwpsn	file "../filters.c",line 361,column 0,is_stmt
           SPKERNEL 1,1
$C$DW$L$_FLT_High_pass_filter$3$E:
;** --------------------------------------------------------------------------*
$C$L57:    ; PIPED LOOP EPILOG
           NOP             2

           MVC     .S2     TSCL,B7           ; |363| 
||         MV      .L1X    B5,A5

;** --------------------------------------------------------------------------*

I can see in this that it is only using LDW and not LDDW, but I cannot see why it can't use LDDW, floats are 32 bit?? and the registers are 32 bits so why can't it load the two consecutive elements of an array into two consecutive registers.

over 10 years ago

0 Graham Long over 10 years ago

Intellectual 440 points

is this an issue because I am using floats? I changed to signed int's to see what happened and the loop carried dependency bound changed from 9 to 0 and it started using LDDW and STDW.

0 Sivaraj Kuppuraj over 10 years ago

TI__Mastermind 35645 points

Hi,

Thanks for your post.

In general, floating point arithmetic operations (divisions, multiplications etc). would require manual optimization by hand since the floating point division operation would disable the software pipe-lining option due to the "/" operator being considered a function call, there by, compiler will not optimize the code efficiently. I think, loops coded with SPLOOP mechanism would have several advantages as below:

1. SPLOOPs are interruptible

2. SPLOOPs can be coded to a smaller size

3. SPLOOPs normally operates from a small SPLOOP buffer and not from regular memory

4. SPLOOP code is more readable since epilog and prolog are not explicitly coded.

Using SPLOOP buffer, you can schedule multiple iterations of the loop in parallel thereby, it wouldn't require for the programmer to seperately code for

prolog, loop kernel, and epilog.

There are compiler optimization tutorial available in chapter 3 from spru198k doc as below:

http://www.ti.com/lit/ug/spru198k/spru198k.pdf

Also to know explicitly on SPLOOP buffer hardware and software mechanisms, please refer chapter 7 for the detailed description on the same from spru732j doc as below:

http://www.ti.com.cn/cn/lit/ug/spru732j/spru732j.pdf

Thanks & regards,

Sivaraj K

-------------------------------------------------------------------------------------------------------

Please click the Verify Answer button on this post if it answers your question.

-------------------------------------------------------------------------------------------------------

0 RandyP over 10 years ago in reply to Sivaraj Kuppuraj

TI__Guru* 84110 points

Graham,

It might help us to know which device this is targeted for.

SIvaraj picked up on the fact that your data types are floats. There may be other interesting information missing from your source snippet, such as array alignment.

As you should find from the documents Sivaraj recommended, you can use intrinsics like _amem8() to force wider reads when you know the alignment and locations will work out for you. The compiler might need more information, such as _nassert() to help it understand alignments.

Regards,
RandyP

0 Graham Long over 10 years ago in reply to RandyP

Intellectual 440 points

Randy,

the device is the C6748.

I have tried using _nassert(((int)&array_name % 8) == 0) with no change in the timings, I believe arrays are aligned to 64 bit addresses on this device.

I will look into _amem8() and read the documentation that Slvaraj pointed to, hopefully that will help.

Graham

0 Andy Polyakov over 10 years ago

Expert 1340 points

This is all confusing. For starters, why is equal sign drawn between LDDW and optimal performance? For example if algorithm works best by isolating data and A and B halves of register bank, then it might be more efficient to load data directly to A and B and avoid cross-references. I'm not saying that this is such case, only that lack of LDDW is not necessarily sign of bad performance. What is critical for performance is iteration interval observed after SPLOOP[D] instruction. It's 9 in presented snippet. And the question should be if this is optimal, for this case that is (there is no universal rule to calculate one, it all depends). Because if it's optimal, then there is nothing one can do, with or without LDDW. Well, this holds absolutely true for long loops, as for short ones there might be ways to cut corners... Again, not saying that this is such case. In other words it's hard to tell if 9 is right in this case, but *if* loop was long, then given amount of required resources 2 would be optimal if loop wasn't unrolled. Latter means that optimal coefficient would have to be multiplied by unroll factor if loop is unrolled, e.g. it would be 4 for loop unrolled twice.

Secondly, story doesn't seem to match. It's asserted that loop is executed in 828 cycles, but disassembler listing suggests that it takes ~40 (assuming that it's from actually i<6). Even if data is not in cache, it still can't possibly go up to 800. It sounds more like unoptimized code was measured, not disassembled one. Or something else was measured, not loop in question...

0 RandyP over 10 years ago in reply to Andy Polyakov

TI__Guru* 84110 points

Graham,

With small snippets and pseudo code, it is unlikely we will be able to give you any direct help. But the documentation that Siviraj pointed you to should help a lot if you follow the correct syntax. You can also look for the C6000 Optimization Workshop that is archived on the TI Wiki Pages; search for "c6000 optimization" (no quotes) and find it in the list. Some very helpful information is there.

Graham Long said:
_nassert(((int)&array_name % 8) == 0)

This is pseudo code since there is nothing named array_name in your code snippet. If the meaning is correct, then the '&' symbol is incorrect here and will not tell the compiler what you want it to know. As you suggest, the alignment may be 64-bit already, but it never hurts to tell the compiler true information.

Andy,

It always seems to me that you are not at all confused but have a very good understanding of what is happening and what is needed for optimization and other issues with these processors and tools. Thank you for your continued sharing to all our benefit.

Asking 'what is optimal?' is a great point to make.

It will be interesting to hear the explanation about the high cycle count. This is another example of how snippets can be disconnected from full performance. Your insights are always worded well.

Regards,
RandyP

0 Graham Long over 10 years ago in reply to Andy Polyakov

Intellectual 440 points

Hi Andy,

Thanks for the reply, I draw a link between LDDW and optimal performance from something that was said in an optimisation workshop video which I found somewhere on the ti wiki, not sure what the link was but it was done by Eric Wilbur and Scott Specker.

from the assembly code it looks like the A and B registers are not isolated as you have some cross adds and multiplies e.g.

           ADDSP   .L1X    B17,A3,A3         ; |353| (P) <0,13>  ^ 

||         MPYSP   .M2X    B19,A4,B4         ; |359| (P) <0,13>

for measuring the loop time I used the following code, if I build the code as un-optimised I get approximately 4860 CPU cycles :(

start_time = _itoll(TSCH, TSCL);


#pragma MUST_ITERATE(6,6,6);
#pragma UNROLL(2);
for(i = 0; i < 6; i++)
{
	temp[0] += (ten_HZ_high_pass[i] * last_hp_ch1[i]);
	temp[1] += (ten_HZ_high_pass[i] * last_hp_ch2[i]);
	temp[2] += (ten_HZ_high_pass[i] * last_hp_ch3[i]);
	temp[3] += (ten_HZ_high_pass[i] * last_hp_ch4[i]);
}

end_time = _itoll(TSCH, TSCL);

cycle_count[cycle_index] = end_time - start_time - overhead;

cycle_index++;

if(cycle_index == 100)
{
	cycle_index = 0;
}

0 Graham Long over 10 years ago in reply to RandyP

Intellectual 440 points

RandyP said:

_nassert(((int)&array_name % 8) == 0)

This is pseudo code since there is nothing named array_name in your code snippet. If the meaning is correct, then the '&' symbol is incorrect here and will not tell the compiler what you want it to know. As you suggest, the alignment may be 64-bit already, but it never hurts to tell the compiler true information.

Hi Randy, the following was just to avoid re-writing the same code again with all 6 of the arrays used in the loop, I should have made it clear that I was just being lazy and substituting array_name for temp, ten_HZ_high_pass, last_hp_ch1, last_hp_ch2, last_hp_ch3 and last_hp_ch4.

Regards,

Graham

0 Andy Polyakov over 10 years ago in reply to RandyP

Expert 1340 points

Randy,

You can't tell me how I feel :-) I mean if I say I'm confused, then you can't tell me that I'm not. Well, what exactly I am confused about can also be matter of confusion. Which seems to be the case. You seem to imply that I'm not at all confused about processor inner workings. But this is correct, I'm not, nor have I ever said that I am confused about that. It's other things, namely a) equality sign between LDDW and performance, and b) fact that pieces of the story don't seem to match each other. As for a). It's actually not original question that is confusing, but rather the fact that TI didn't draw OP's attentions to other vital factor, such as SPLOOP iteration interval. Note that I never said that LDDW doesn't matter for performance, only that it's not the only factor. And I implied that under circumstances maybe it's not most important one. Indeed, if interval chosen by compiler is 9, when optimal should be 2*unroll factor, then one should first ask what prevents compiler from using lower interval... [Or show that generated code delivers performance not much worse than 2*iterations+epilogue.]

Graham,

As for A and B not being isolated in your specific case. I did write "I'm not saying that this is such case." I mentioned isolation to make more general point, the above one, that one should probably refocus question on iteration interval. Not to mention that it might turn out that once you manage to convince compiler to use lower interval, it might start isolating data in A and B. Point is that there might be questions that needs to be answered first, before other questions become meaningful to pose, at least in my opinion.

As for cycles counter. Looking at cycles counter for non-optimized code makes virtually no sense. Question is If you can actually confirm that originally mentioned 800 cycles are in fact for optimized code presented in disassembler window, so that you can actually see that TSC is accessed right before loop, as well as after, we see only latter. You have to recognize that compiler doesn't have to keep exact order of expressions as they appear in C source code, and can reorder operations so that something else is measured. If you can actually confirm all this, then key question would be how come the count is so high. As already mentioned I personally have hard time imagining the reason, but TI might have an idea... Now you also say that you collect multiple data point in an array, but you mention single result. Which one is it? Maximum? Minimum? Average?

In either case keep in mind that you can't count on community member such as myself to actually solve your problems. I might be able to help, or might not...

0 user347219 over 10 years ago in reply to Graham Long

Expert 1240 points

Graham Long said:

RandyP

_nassert(((int)&array_name % 8) == 0)

This is pseudo code since there is nothing named array_name in your code snippet. If the meaning is correct, then the '&' symbol is incorrect here and will not tell the compiler what you want it to know. As you suggest, the alignment may be 64-bit already, but it never hurts to tell the compiler true information.

Hi Randy, the following was just to avoid re-writing the same code again with all 6 of the arrays used in the loop, I should have made it clear that I was just being lazy and substituting array_name for temp, ten_HZ_high_pass, last_hp_ch1, last_hp_ch2, last_hp_ch3 and last_hp_ch4.

In case you missed it, Randy also pointed out that you have an extraneous & in that _nassert.

Also judging from your output you don't appear to have an aliasing problem, but restrict is another important piece of information for the compiler to optimize.

So your code might look like:

#include <assert.h>

void func(float const * restrict ten_HZ_high_pass, float const * restrict last_hp_ch1, float const * restrict last_hp_ch2, float const * restrict last_hp_ch3, float const * restrict last_hp_ch4, float * restrict temp)
{
    _nassert((int) ten_HZ_high_pass % 8 == 0);
    _nassert((int) last_hp_ch1 % 8 == 0);
    _nassert((int) last_hp_ch2 % 8 == 0);
    _nassert((int) last_hp_ch3 % 8 == 0);
    _nassert((int) last_hp_ch4 % 8 == 0);
    _nassert((int) temp % 8 == 0);
    
    int i;
    
    #pragma MUST_ITERATE(6, 6, 6)
    for(i = 0; i < 6; i++)
    {
        temp[0] += (ten_HZ_high_pass[i] * last_hp_ch1[i]);
        temp[1] += (ten_HZ_high_pass[i] * last_hp_ch2[i]);
        temp[2] += (ten_HZ_high_pass[i] * last_hp_ch3[i]);
        temp[3] += (ten_HZ_high_pass[i] * last_hp_ch4[i]);
    }  
}

Experiment with UNROLL as you like.

Processors

Processors forum

optimising pipelined loop