Tool/software: TI C/C++ Compiler
Hi All
I am observing an issue with the optimization of an FEC decoding algorithm.
I have two versions of the optimized code. For convenience I refer to them here as version 1 (less optimized) and version 2 (more optimized).
Looking at the SOFTWARE PIPELINE INFORMATION for each of them, it is clear that version 2 is more optimized.
There is no change in memory map between two versions.
But strangely, version 2 consumes 40% more cycles than version 1, and I am at a loss to explain this behavior.
I would appreciate it if somebody could shed light on this issue and help me resolve it.
Here I am attaching the software pipeline reports and the for-loop source code of both versions, with appropriate file names.
Please let me know if you need any other information.
Best Regards
Rao
// Version 1: for each of NO_OF_STATES/2 iterations, add a dot-product term to
// two path values read from bm, keep the larger sum, and record which of the
// two candidates (m or m+1) produced it.
// Both bm reads and the result store advance by MAX_DECODER_OP_LENGTH_PLUS1
// elements per access, i.e. successive accesses are far apart in memory.
pi32bmAddrBuf1 = &bm[0][i]; // start at element i of row 0
// UNROLL(1): request that the compiler does not unroll this loop.
#pragma UNROLL(1)
for (k = 0, m = 0; k < (NO_OF_STATES/2); k++, m += 2)
{
// Aligned 32-bit load of the packed operand for this iteration.
ui32packedStates = _amem4(pi32softIdealOp++);
// Two separate aligned 32-bit loads, each followed by a strided advance.
path1 = _amem4(pi32bmAddrBuf1);
pi32bmAddrBuf1 += MAX_DECODER_OP_LENGTH_PLUS1;
path2 = _amem4(pi32bmAddrBuf1);
pi32bmAddrBuf1 += MAX_DECODER_OP_LENGTH_PLUS1;
// One _ddotp4 yields a 64-bit result; its low and high 32-bit halves are the
// increments for path1 and path2 respectively.
i64TmpVar = _ddotp4(i32_decoderIP01, ui32packedStates);
i32Tmp1 = _loll(i64TmpVar);
i32Tmp2 = _hill(i64TmpVar);
path1 += i32Tmp1;
path2 += i32Tmp2;
// Select the larger sum; on a tie the else branch (m+1, path2) is taken.
if (path1 > path2)
{
*pui8bestState1 = (m);
_amem4(pi32bmAddrBuf2) = path1;
}
else
{
*pui8bestState1 = (m+1);
_amem4(pi32bmAddrBuf2) = path2;
}
// Advance both output pointers by their (different) strides.
pui8bestState1 += MAX_DECODER_OP_LENGTH;
pi32bmAddrBuf2 += MAX_DECODER_OP_LENGTH_PLUS1;
}
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop found in file : ../ViterbiDecoding.c
;* Loop source line : 297
;* Loop opening brace source line : 298
;* Loop closing brace source line : 334
;* Known Minimum Trip Count : 32
;* Known Maximum Trip Count : 32
;* Known Max Trip Count Factor : 32
;* Loop Carried Dependency Bound(^) : 2
;* Unpartitioned Resource Bound : 4
;* Partitioned Resource Bound(*) : 4
;* Resource Partition:
;* A-side B-side
;* .L units 1 0
;* .S units 3 4*
;* .D units 4* 3
;* .M units 1 0
;* .X cross paths 1 1
;* .T address paths 3 4
;* Logical ops (.LS) 0 0 (.L or .S unit)
;* Addition ops (.LSD) 1 3 (.L or .S or .D unit)
;* Bound(.L .S .LS) 2 2
;* Bound(.L .S .D .LS .LSD) 3 4*
;*
;* Searching for software pipeline schedule at ...
;* ii = 4 Schedule found with 4 iterations in parallel
;*
;* Register Usage Table:
;* +-----------------------------------------------------------------+
;* |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;* |00000000001111111111222222222233|00000000001111111111222222222233|
;* |01234567890123456789012345678901|01234567890123456789012345678901|
;* |--------------------------------+--------------------------------|
;* 0: | ***** * | * * ** * |
;* 1: |* ******* ** | ****** * |
;* 2: |* ***** ** | ****** * |
;* 3: |* ****** * | *** ** * |
;* +-----------------------------------------------------------------+
;*
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* For further improvement on this loop, try option -mh4
;*
;* Minimum safe trip count : 1
;* Min. prof. trip count (est.) : 3
;*
;* Mem bank conflicts/iter(est.) : { min 0.000, est 0.375, max 3.000 }
;* Mem bank perf. penalty (est.) : 8.6%
;*
;* Effective ii : { min 4.00, est 4.38, max 7.00 }
;*
;*
;* Total cycles (est.) : 12 + min_trip_cnt * 4 = 140
;*----------------------------------------------------------------------------*
;* SETUP CODE
;*
;* MV A4,B4 ; []
;* MV B6,A3 ; []
;*
;* SINGLE SCHEDULED ITERATION
;*
;* $C$C539:
;* 0 LDW .D1T1 *A5++(4),A8 ; [A_D64P] |313|
;* 1 NOP 3 ; [A_L674]
;* 4 LDW .D2T2 *B9(0),B7 ; [B_D64P] |313|
;* || ADDK .S2 14408,B9 ; [B_Sb674] |313|
;* 5 DDOTP4 .M1 A7,A8,A17:A16 ; [A_M674] |313|
;* 6 LDW .D1T1 *A6(0),A8 ; [A_D64P] |314|
;* 7 NOP 1 ; [A_L674]
;* 8 ADDK .S1 14408,A6 ; [A_S674] |314|
;* 9 NOP 1 ; [A_L674]
;* 10 ADD .L2X B7,A16,B8 ; [B_L674] |313|
;* 11 ADD .L1 A8,A17,A9 ; [A_L674] |314|
;* 12 CMPGT .L1X B8,A9,A0 ; [A_L674] |316|
;* || ADD .L2 1,B16,B5 ; [B_L674] |326|
;* 13 [!A0] STB .D2T2 B5,*B6(0) ; [B_D64P] |326|
;* || ADDK .S2 1800,B6 ; [B_Sb674] |326|
;* || [!A0] STW .D1T1 A9,*A4(0) ; [A_D64P] |327|
;* || ADDK .S1 7204,A4 ; [A_S674] |327|
;* || ADD .L2 2,B16,B5 ; [B_L674] |297| ^
;* 14 [ A0] STW .D2T2 B8,*B4(0) ; [B_D64P] |320|
;* || ADDK .S2 7204,B4 ; [B_Sb674] |320|
;* 15 [ A0] STB .D1T2 B16,*A3(0) ; [A_D64P] |319|
;* || ADDK .S1 1800,A3 ; [A_S674] |319|
;* || EXTU .S2 B5,24,24,B16 ; [B_Sb674] |297| ^
;* || SPBR $C$C539 ; []
;* 16 ; BRANCHCC OCCURS {$C$C539} ; [] |297|
;*----------------------------------------------------------------------------*
// Version 2: same add/compare/select computation as version 1, but the two
// path values are fetched with a single aligned 64-bit load from consecutive
// memory, and the selected sum is stored to consecutive 32-bit locations.
// Only the pui8bestState1 byte store keeps a MAX_DECODER_OP_LENGTH stride.
// NOTE(review): _amem8 is an aligned 8-byte access - confirm that &bm[i][0]
// is 8-byte aligned for every i that reaches this loop.
pi64bmAddrBuf1 = (Int64 *)&bm[i][0]; // was &bm[0][i] in version 1
// UNROLL(1): request that the compiler does not unroll this loop.
#pragma UNROLL(1)
for (k = 0, m = 0; k < (NO_OF_STATES>>1); k++, m += 2)
{
// Aligned 32-bit load of the packed operand for this iteration.
ui32packedStates = _amem4(pi32softIdealOp++);
// One aligned 64-bit load supplies both paths: low half -> path1,
// high half -> path2.
i64TmpVar = _amem8(pi64bmAddrBuf1++);
path1 = _loll(i64TmpVar);
path2 = _hill(i64TmpVar);
// i64TmpVar is reused for the 64-bit _ddotp4 result; its low and high halves
// are the increments for path1 and path2 respectively.
i64TmpVar = _ddotp4(i32_decoderIP01, ui32packedStates);
i32Tmp1 = _loll(i64TmpVar);
i32Tmp2 = _hill(i64TmpVar);
path1 += i32Tmp1;
path2 += i32Tmp2;
// Select the larger sum; on a tie the else branch (m+1, path2) is taken.
// Exactly one branch runs, so pi32bmAddrBuf2 advances by one element per
// iteration.
if (path1 > path2)
{
*pui8bestState1 = (m);
_amem4(pi32bmAddrBuf2++) = path1;
}
else
{
*pui8bestState1 = (m+1);
_amem4(pi32bmAddrBuf2++) = path2;
}
// The byte output is still written with a large stride.
pui8bestState1 += MAX_DECODER_OP_LENGTH;
}
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop found in file : ../ViterbiDecoding.c
;* Loop source line : 300
;* Loop opening brace source line : 301
;* Loop closing brace source line : 333
;* Known Minimum Trip Count : 32
;* Known Maximum Trip Count : 32
;* Known Max Trip Count Factor : 32
;* Loop Carried Dependency Bound(^) : 2
;* Unpartitioned Resource Bound : 3
;* Partitioned Resource Bound(*) : 3
;* Resource Partition:
;* A-side B-side
;* .L units 1 0
;* .S units 1 3*
;* .D units 3* 3*
;* .M units 1 0
;* .X cross paths 1 0
;* .T address paths 3 3
;* Logical ops (.LS) 0 0 (.L or .S unit)
;* Addition ops (.LSD) 2 2 (.L or .S or .D unit)
;* Bound(.L .S .LS) 1 2
;* Bound(.L .S .D .LS .LSD) 3* 3*
;*
;* Searching for software pipeline schedule at ...
;* ii = 3 Schedule found with 5 iterations in parallel
;*
;* Register Usage Table:
;* +-----------------------------------------------------------------+
;* |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;* |00000000001111111111222222222233|00000000001111111111222222222233|
;* |01234567890123456789012345678901|01234567890123456789012345678901|
;* |--------------------------------+--------------------------------|
;* 0: |* ******* *** | ** *** |
;* 1: |* ******* * * | ** *** |
;* 2: |* ****** * * | ****** |
;* +-----------------------------------------------------------------+
;*
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* For further improvement on this loop, try option -mh8
;*
;* Minimum safe trip count : 1
;* Min. prof. trip count (est.) : 3
;*
;* Mem bank conflicts/iter(est.) : { min 0.000, est 0.375, max 2.000 }
;* Mem bank perf. penalty (est.) : 11.1%
;*
;* Effective ii : { min 3.00, est 3.38, max 5.00 }
;*
;*
;* Total cycles (est.) : 12 + min_trip_cnt * 3 = 108
;*----------------------------------------------------------------------------*
;* SETUP CODE
;*
;* MV B7,B5 ; []
;* MV B5,A4 ; [] Define a LI twin register
;*
;* SINGLE SCHEDULED ITERATION
;*
;* $C$C698:
;* 0 LDW .D2T2 *B8++(4),B6 ; [B_D64P] |313|
;* 1 NOP 1 ; [A_L674]
;* 2 LDDW .D1T1 *A5++(8),A9:A8 ; [A_D64P] |313|
;* 3 NOP 2 ; [A_L674]
;* 5 DDOTP4 .M1X A6,B6,A17:A16 ; [A_M674] |313|
;* 6 NOP 3 ; [A_L674]
;* 9 ADD .S1 A8,A16,A16 ; [A_S674] |313|
;* || ADD .L1 A9,A17,A18 ; [A_L674] |314|
;* 10 CMPGT .L1 A16,A18,A0 ; [A_L674] |316|
;* || ADD .L2 2,B9,B4 ; [B_L674] |300| ^
;* 11 MV .L1 A16,A3 ; [A_L674] |313| Split a long life
;* || ADD .L2 1,B9,B4 ; [B_L674] |326|
;* || [ A0] STB .D2T2 B9,*B5(0) ; [B_D64P] |319|
;* || EXTU .S2 B4,24,24,B9 ; [B_Sb674] |300| ^
;* 12 [!A0] STW .D1T1 A18,*A7++(4) ; [A_D64P] |327|
;* || ADDK .S2 1800,B5 ; [B_Sb674] |319|
;* 13 [!A0] STB .D2T2 B4,*B7(0) ; [B_D64P] |326|
;* || ADDK .S2 1800,B7 ; [B_Sb674] |326|
;* || ADDK .S1 1800,A4 ; [A_S674] |319|
;* || [ A0] STW .D1T1 A3,*A7++(4) ; [A_D64P] |320|
;* || SPBR $C$C698 ; []
;* 14 NOP 1 ; [A_L674]
;* 15 ; BRANCHCC OCCURS {$C$C698} ; [] |300|
;*----------------------------------------------------------------------------*