There are two simple functions below, used for testing.
The test results are very interesting:
| function   | dm642  | dm6446 |
| testspeed  | 4-5 ms | 4-5 ms |
| testspeed2 | 1 ms   | 4-5 ms |
I don't know why testspeed2 is so fast on the dm642, or how I can achieve the same speed on the dm6446. I used the -o3 option.
/* Allocate the 8 * 352 * 288 byte test buffer on a 128-byte boundary.
 * memalign() takes the alignment first and the size second; with the
 * arguments the other way round only 128 bytes would be requested.
 * NOTE(review): the result should be checked for NULL before use. */
ptotal = (unsigned char *)memalign(128, 352*288*8);
/*
 * testspeed - add 1 to every byte of a 352*288*8 byte buffer, walking it as
 * eight streams that are interleaved in 64-byte chunks.
 *
 * The eight pointers start 64 bytes apart. The inner loop advances each of
 * them through its own 64-byte chunk, one 32-bit word at a time, so one pass
 * of the inner loop covers a contiguous 512-byte group. Every pointer then
 * skips the other seven chunks (64*7 bytes) to reach its chunk in the next
 * group. The outer loop runs 352*288/64 times, i.e. 352*288*8 bytes in total.
 *
 * _pbgTotal: start of the buffer; must hold at least 352*288*8 bytes.
 *            _amem4/_amem4_const are TI C6000 aligned-access intrinsics, so
 *            the buffer is expected to be at least 4-byte aligned.
 */
void testspeed(unsigned char * _pbgTotal)
{
    /* Every stream is derived from the (non-restrict) parameter rather than
     * from pimg0: C99 6.7.3.1 treats assignment between restrict pointers
     * declared in the same block as undefined behaviour. */
    uchar * restrict pimg0 = _pbgTotal;
    uchar * restrict pimg1 = _pbgTotal + 64;
    uchar * restrict pimg2 = _pbgTotal + 64*2;
    uchar * restrict pimg3 = _pbgTotal + 64*3;
    uchar * restrict pimg4 = _pbgTotal + 64*4;
    uchar * restrict pimg5 = _pbgTotal + 64*5;
    uchar * restrict pimg6 = _pbgTotal + 64*6;
    uchar * restrict pimg7 = _pbgTotal + 64*7;
    int i, j;
    int len = 352*288;
    int steplen = 64;          /* bytes each stream handles per inner loop */
    int deltastep = 64*7;      /* bytes skipped to reach the next group */
    int addone = 0x01010101;   /* the value 1 in each of the four byte lanes */

    for (j = 0; j < (len/steplen); j++)
    {
        for (i = 0; i < steplen; i += 4)
        {
            /* _add4 is the TI C6000 packed 8-bit add: each 32-bit word is
             * treated as four independent bytes. */
            _amem4(pimg0) = _add4(_amem4_const(pimg0), addone);
            _amem4(pimg1) = _add4(_amem4_const(pimg1), addone);
            _amem4(pimg2) = _add4(_amem4_const(pimg2), addone);
            _amem4(pimg3) = _add4(_amem4_const(pimg3), addone);
            _amem4(pimg4) = _add4(_amem4_const(pimg4), addone);
            _amem4(pimg5) = _add4(_amem4_const(pimg5), addone);
            _amem4(pimg6) = _add4(_amem4_const(pimg6), addone);
            _amem4(pimg7) = _add4(_amem4_const(pimg7), addone);
            pimg0 += 4; pimg1 += 4; pimg2 += 4; pimg3 += 4;
            pimg4 += 4; pimg5 += 4; pimg6 += 4; pimg7 += 4;
        }
        /* Jump over the seven chunks that belong to the other streams. */
        pimg0 += deltastep; pimg1 += deltastep; pimg2 += deltastep; pimg3 += deltastep;
        pimg4 += deltastep; pimg5 += deltastep; pimg6 += deltastep; pimg7 += deltastep;
    }
}

///////////////////////////////////////////////////////////////////////////////////////

/*
 * testspeed2 - add 1 to every byte of a 352*288*8 byte buffer, walking it as
 * eight separate contiguous planes.
 *
 * The eight pointers start 352*288 bytes apart and each one walks straight
 * through its own 352*288-byte plane, one 32-bit word at a time. The work per
 * inner-loop iteration is the same as in testspeed; only the distance between
 * the eight streams differs.
 *
 * _pbgTotal: start of the buffer; must hold at least 352*288*8 bytes.
 *            _amem4/_amem4_const are TI C6000 aligned-access intrinsics, so
 *            the buffer is expected to be at least 4-byte aligned.
 */
void testspeed2(unsigned char * _pbgTotal)
{
    int len = 352*288;         /* size of one plane in bytes */
    /* Every stream is derived from the (non-restrict) parameter rather than
     * from pimg0: C99 6.7.3.1 treats assignment between restrict pointers
     * declared in the same block as undefined behaviour. */
    uchar * restrict pimg0 = _pbgTotal;
    uchar * restrict pimg1 = _pbgTotal + len;
    uchar * restrict pimg2 = _pbgTotal + len*2;
    uchar * restrict pimg3 = _pbgTotal + len*3;
    uchar * restrict pimg4 = _pbgTotal + len*4;
    uchar * restrict pimg5 = _pbgTotal + len*5;
    uchar * restrict pimg6 = _pbgTotal + len*6;
    uchar * restrict pimg7 = _pbgTotal + len*7;
    int i, j;
    int steplen = 64;          /* bytes each stream handles per inner loop */
    int addone = 0x01010101;   /* the value 1 in each of the four byte lanes */

    /* The two-level loop is kept so the loop structure matches testspeed;
     * the pointers simply keep advancing across outer iterations. */
    for (j = 0; j < (len/steplen); j++)
    {
        for (i = 0; i < steplen; i += 4)
        {
            _amem4(pimg0) = _add4(_amem4_const(pimg0), addone);
            _amem4(pimg1) = _add4(_amem4_const(pimg1), addone);
            _amem4(pimg2) = _add4(_amem4_const(pimg2), addone);
            _amem4(pimg3) = _add4(_amem4_const(pimg3), addone);
            _amem4(pimg4) = _add4(_amem4_const(pimg4), addone);
            _amem4(pimg5) = _add4(_amem4_const(pimg5), addone);
            _amem4(pimg6) = _add4(_amem4_const(pimg6), addone);
            _amem4(pimg7) = _add4(_amem4_const(pimg7), addone);
            pimg0 += 4; pimg1 += 4; pimg2 += 4; pimg3 += 4;
            pimg4 += 4; pimg5 += 4; pimg6 += 4; pimg7 += 4;
        }
    }
}