Hello,
I wrote a "Multiply all complex numbers in a vector" method for my C67x:
#define RE( x ) ( _lof( x ))
#define IM( x ) ( _hif( x ))
/** <!-------------------------------------------------------------------------------------------->
* Calculates the complex product of the numbers in the vector
*
* @param axI (i) input vector with alternating re,im,re,im,...
* has to be dword aligned.
* @param iSize (i) size of axI in <em>floats</rm>
* @return product
<!--------------------------------------------------------------------------------------------> */
A::Complex A::ProductCmplx( const float* restrict axI, const int iSize )
{
int i;
__float2_t cTemp;
Complex cResult = { 1, 0 }; // just a stuct of floats
float fTmpRe;
std::_nassert((int)axI % 8 == 0);
std::_nassert(iSize % 2 == 0);
std::_nassert(iSize > 0);
#pragma MUST_ITERATE(1,,)
for( i=0; i<iSize; i+=2 )
{
cTemp = _amemd8( (void*) (axI + i ) ); // load next complex number
fTmpRe = cResult.re * RE( cTemp ) - cResult.im * IM( cTemp ); // line 57
cResult.im = cResult.re * IM( cTemp ) + cResult.im * RE( cTemp ); // line 58
cResult.re = fTmpRe;
}
return cResult;
} // END ProductCmplx
Quite straight forward, I think. I would expect, that many of the multiplications (and the add and sub) can go parallel, but the assembler output is disappointing:
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop found in file : A.cpp
;* Loop source line : 53
;* Loop opening brace source line : 54
;* Loop closing brace source line : 60
;* Known Minimum Trip Count : 1
;* Known Max Trip Count Factor : 1
;* Loop Carried Dependency Bound(^) : 11
;* Unpartitioned Resource Bound : 2
;* Partitioned Resource Bound(*) : 4
;* Resource Partition:
;* A-side B-side
;* .L units 0 0
;* .S units 0 0
;* .D units 1 0
;* .M units 2 2
;* .X cross paths 2 4*
;* .T address paths 1 0
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 1 1 (.L or .S unit)
;* Addition ops (.LSD) 1 4 (.L or .S or .D unit)
;* Bound(.L .S .LS) 1 1
;* Bound(.L .S .D .LS .LSD) 1 2
;*
;* Searching for software pipeline schedule at ...
;* ii = 11 Schedule found with 2 iterations in parallel
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* Minimum safe trip count : 1
;*----------------------------------------------------------------------------*
$C$L1: ; PIPED LOOP PROLOG
.dwpsn file "A.cpp",line 53,column 0,is_stmt,isa 0
SPLOOPD 11 ;22 ; (P)
|| MV .L1 A4,A6
|| MVC .S2 B4,ILC
;** --------------------------------------------------------------------------*
$C$L2: ; PIPED LOOP KERNEL
$C$DW$L$_ZN12A12ProductCmplxEPKfi$3$B:
.dwpsn file "A.cpp",line 54,column 0,is_stmt,isa 0
LDDW .D1T1 *A6++,A5:A4 ; |A.cpp:57| (P) <0,0>
NOP 3
SPMASK L2
|| ZERO .L2 B4
SPMASK L1,S2
|| ZERO .L1 A3 ; |A.cpp:45|
|| SET .S2 B4,0x17,0x1d,B6
|| MV .L2X A4,B5 ; |A.cpp:57| (P) <0,5> Define a twin register
SPMASK L2
|| ZERO .L2 B4 ; |A.cpp:45|
|| MPYSP .M2X B6,A4,B4 ; |A.cpp:57| (P) <0,6>
|| MPYSP .M1 A3,A5,A4 ; |A.cpp:57| (P) <0,6>
MPYSP .M1X B6,A5,A3 ; |A.cpp:58| (P) <0,7>
MPYSP .M2 B4,B5,B4 ; |A.cpp:58| (P) <0,8> ^
NOP 2
SUBSP .L2X B4,A4,B4 ; |A.cpp:57| <0,11>
NOP 1
ADDSP .L1X B4,A3,A3 ; |A.cpp:58| <0,13> ^
NOP 1
MV .L2 B4,B5 ; |A.cpp:59| <0,15>
MV .S2 B4,B6 ; |A.cpp:59| <0,16>
MV .L1 A3,A4 ; |A.cpp:58| <0,17>
.dwpsn file "A.cpp",line 60,column 0,is_stmt,isa 0
SPKERNEL 0,0
|| MV .L2X A3,B4 ; |A.cpp:58| <0,18> ^ Define a twin register
I don't understand why the multiplications in source line "A.cpp:58" can't run in parellel. I read the optimizer manual up and down, and the "Loop Carried Dependency Bound = 11 " seems to indicate a pointer aliasing problem, but I can't see why. I have a similar "Multiply complex vector by complex vector" routine, where the multiplications look just fine:
MPYSP .M1X B7,A4,A3 ; |A.cpp:93| (P) <0,5>
|| MPYSP .M2X B7,A5,B7 ; |A.cpp:93| (P) <0,5>
MPYSP .M1X B6,A5,A16 ; A.cpp:93| (P) <0,6>
|| MPYSP .M2X B6,A4,B6 ; |A.cpp:93| (P) <0,6>
NOP 3
Oh, and by the way: what means "Define a twin register"?
Could you please help me?
Thanks,
Markus