Why are the optimization results different for two similar pieces of code?
The following functions (ByteAdd1 and ByteAdd2) are similar; both are built and optimized with o3.
The C language code is:
/* Byte buffer descriptor: a byte count paired with a pointer to the data. */
typedef struct tagMYBUFFER {
    unsigned int   size_of_byte;   /* byte count; ByteAdd1 adds it to data_ptr to find the end */
    unsigned char *data_ptr;       /* first byte of the buffer */
} MYBUFFER;
/*
 * Sum adjacent byte pairs of src into dest: dest[i] = src[2*i] + src[2*i + 1],
 * with each sum truncated to 8 bits.
 *
 * src  - source buffer; src->data_ptr and src->size_of_byte are read.
 * dest - destination buffer; only dest->data_ptr is read, dest->size_of_byte
 *        is never checked.
 *
 * Every pass consumes 8 source bytes and produces 4 destination bytes, so
 * src->size_of_byte must be a multiple of 8 (otherwise the last pass reads
 * past the end of the source) and the destination must have room for
 * src->size_of_byte / 2 bytes.
 *
 * NOTE(review): neither pointer is restrict-qualified, so the compiler has to
 * assume that a store through dest_ptr may alias a later load through src_ptr.
 * If every caller guarantees non-overlapping buffers, restrict-qualifying the
 * local pointers may allow better loop scheduling -- confirm with callers
 * before changing the contract.
 */
void ByteAdd1(MYBUFFER *src, MYBUFFER *dest)
{
    const unsigned char *src_ptr = src->data_ptr;
    const unsigned char *src_end_ptr = src_ptr + src->size_of_byte;
    unsigned char *dest_ptr = dest->data_ptr;

    /*
     * Compare the pointers directly. Casting them to unsigned int truncates
     * the address on any target whose pointers are wider than unsigned int,
     * which can make the comparison wrong.
     */
    while (src_ptr < src_end_ptr) {
        dest_ptr[0] = (unsigned char)(src_ptr[0] + src_ptr[1]);
        dest_ptr[1] = (unsigned char)(src_ptr[2] + src_ptr[3]);
        dest_ptr[2] = (unsigned char)(src_ptr[4] + src_ptr[5]);
        dest_ptr[3] = (unsigned char)(src_ptr[6] + src_ptr[7]);
        dest_ptr += 4;
        src_ptr += 8;
    }
}
/*
 * Sum adjacent byte pairs of src into dest: dest[i] = src[2*i] + src[2*i + 1],
 * with each sum truncated to 8 bits.
 *
 * src          - source bytes (only read).
 * size_of_byte - number of source bytes to process.
 * dest         - destination bytes.
 *
 * Every pass consumes 8 source bytes and produces 4 destination bytes, so
 * size_of_byte must be a multiple of 8 (otherwise the last pass reads past
 * the end of the source) and dest must have room for size_of_byte / 2 bytes.
 *
 * NOTE(review): neither pointer is restrict-qualified, so the compiler has to
 * assume that a store through dest_ptr may alias a later load through src_ptr.
 * If every caller guarantees non-overlapping buffers, restrict-qualifying the
 * local pointers may allow better loop scheduling -- confirm with callers
 * before changing the contract.
 */
void ByteAdd2(unsigned char *src, unsigned int size_of_byte, unsigned char *dest)
{
    const unsigned char *src_ptr = src;
    const unsigned char *src_end_ptr = src_ptr + size_of_byte;
    unsigned char *dest_ptr = dest;

    /*
     * Compare the pointers directly. Casting them to unsigned int truncates
     * the address on any target whose pointers are wider than unsigned int,
     * which can make the comparison wrong.
     */
    while (src_ptr < src_end_ptr) {
        dest_ptr[0] = (unsigned char)(src_ptr[0] + src_ptr[1]);
        dest_ptr[1] = (unsigned char)(src_ptr[2] + src_ptr[3]);
        dest_ptr[2] = (unsigned char)(src_ptr[4] + src_ptr[5]);
        dest_ptr[3] = (unsigned char)(src_ptr[6] + src_ptr[7]);
        dest_ptr += 4;
        src_ptr += 8;
    }
}
In ByteAdd1, the "for" loop is compiled into the following code:
;*----------------------------------------------------------------------------*
$C$L30: ; PIPED LOOP PROLOG
;** --------------------------------------------------------------------------*
$C$L31: ; PIPED LOOP KERNEL
LDBU .D1T1 *A4,A3 ; |255| <0,0> ^
LDBU .D1T1 *+A4(1),A8 ; |255| <0,1> ^
NOP 4
ADD .L1 A8,A3,A3 ; |255| <0,6> ^
STB .D1T1 A3,*++A5(4) ; |255| <0,7> ^
LDBU .D2T2 *-B4(3),B6 ; |256| <0,8> ^
LDBU .D2T2 *-B4(2),B7 ; |256| <0,9> ^
NOP 4
ADD .L2 B7,B6,B6 ; |256| <0,14> ^
STB .D2T2 B6,*-B5(1) ; |256| <0,15> ^
LDBU .D2T2 *-B4(1),B6 ; |257| <0,16> ^
LDBU .D2T2 *B4++(8),B7 ; |257| <0,17> ^
NOP 3
ADD .L1 8,A4,A4 ; |253| <0,21>
CMPLTU .L1 A4,A7,A0 ; |253| <0,22>
|| MV .S1 A4,A3 ; |253| <0,22> Split a long life(pre-sched)
|| ADD .L2 B7,B6,B6 ; |257| <0,22> ^
STB .D2T2 B6,*B5++(4) ; |257| <0,23> ^
LDBU .D1T1 *+A6(6),A3 ; |258| <0,24> ^
MV .L1 A3,A6 ; |253| <0,25> Split a long life(pre-sched)
|| LDBU .D1T1 *+A6(7),A8 ; |258| <0,25> ^
[ A0] BNOP .S2 $C$L31,3 ; |253| <0,26>
ADD .L1 A8,A3,A3 ; |258| <0,30> ^
STB .D1T1 A3,*+A5(3) ; |258| <0,31> ^
;** --------------------------------------------------------------------------*
$C$L32: ; PIPED LOOP EPILOG
;** --------------------------------------------------------------------------*
In ByteAdd2, the "for" loop is compiled into the following code:
;*----------------------------------------------------------------------------*
$C$L26: ; PIPED LOOP PROLOG
;** --------------------------------------------------------------------------*
$C$L27: ; PIPED LOOP KERNEL
[ A0] B .S2 $C$L27 ; |269| <0,6>
|| [ A0] LDBU .D2T2 *-B4(2),B9 ; |272| <1,0>
|| [ A0] LDBU .D1T1 *A3,A8 ; |271| <1,0>
ADD .L2 B9,B8,B16 ; |272| <0,7>
|| ADD .L1 A17,A6,A5 ; |274| <0,7>
|| [ A0] LDBU .D2T2 *-B4(3),B8 ; |272| <1,1>
|| [ A0] LDBU .D1T1 *+A3(7),A17 ; |274| <1,1>
STB .D2T2 B16,*-B5(1) ; |272| <0,8>
|| [ A0] LDBU .D1T1 *+A3(6),A6 ; |274| <1,2>
ADD .L1 A9,A8,A7 ; |271| <0,9>
|| [ A0] LDBU .D2T2 *-B4(1),B6 ; |273| <1,3>
|| [ A0] LDBU .D1T1 *+A3(1),A9 ; |271| <1,3>
ADD .L2 B7,B6,B16 ; |273| <0,10>
|| STB .D1T1 A7,*++A4(4) ; |271| <0,10>
|| [ A0] LDBU .D2T2 *B4++(8),B7 ; |273| <1,4>
|| ADD .L1 8,A3,A3 ; |269| <1,4>
STB .D2T2 B16,*B5++(4) ; |273| <0,11>
|| STB .D1T1 A5,*+A4(3) ; |274| <0,11>
|| CMPLTU .L1 A3,A16,A0 ; |269| <1,5>
;** --------------------------------------------------------------------------*
$C$L28: ; PIPED LOOP EPILOG
;** --------------------------------------------------------------------------*
Obviously, in ByteAdd1 the loop body is not executed in parallel.
In ByteAdd2, however, the instructions are issued in parallel.
So why is the generated assembly code so different when the C source code is almost the same?