OMAP3530 algorithms are running slowly

Hello,

I have an OMAP3530 board and have implemented some algorithms that run on the DSP side. The ARM and the DSP communicate using DSPLink and shared memory, but the performance is not good. For example, motion estimation on a 320x240 frame with a search window of [-8, 8] takes 12 (!) seconds to finish. I believe the problem is memory-related, because the frame is stored in external memory: the data is copied from shared memory to external memory and then processed there. Is there any way to transfer the data to local memory to improve performance, or any other way to improve performance? The DSP/BIOS version is 5.x. Could anyone help me?

This is the code:

/*
 * Frame and block storage, both placed in the linker section "mySection"
 * through the TI DATA_SECTION pragma. Each pragma sits on its own line,
 * before the definition of the symbol it names: a preprocessor directive
 * cannot follow other tokens on the same line.
 */
#pragma DATA_SECTION( FrmBuf, "mySection" )

/*
 * One entry per pixel of the frame buffer.
 * NOTE(review): SAD() reads only the y member, yet every pixel carries all
 * seven members (typically 7 bytes), so consecutive y samples are not
 * adjacent in memory. A separate, contiguous y plane would likely be
 * friendlier to the cache - confirm against the rest of the project.
 */
struct Image{
    unsigned char r,g,b;    // presumably the RGB components
    unsigned char  y,u,v;   // presumably the YUV components; y is what SAD() compares
    unsigned char Dy;       // original note: "for every block"
}FrmBuf[344064];

#pragma DATA_SECTION( blk, "mySection" )

/* One entry per pixel of the block currently being matched. */
struct Block {
    unsigned char r,g,b;    // presumably the RGB components
    unsigned char  y,u,v;   // presumably the YUV components; y is what SAD() compares

}blk[256]; // largest block examined for fire: 16*16 pixels

/*
 * Sum of absolute differences between the y samples of a block and the y
 * samples of a displaced candidate region of an image.
 *
 * offset_hor   horizontal element offset of the block inside img
 * offset_vert  added to the image index as-is (it is NOT multiplied by
 *              imgWidth here), so the caller presumably passes the element
 *              offset of the block's top row - TODO confirm
 * mvx, mvy     candidate displacement in columns / rows
 * blk          block samples, BLOCK_X entries per row, BLOCK_Y rows
 * img          image samples, imgWidth entries per row
 * BLOCK_X      block width in samples
 * BLOCK_Y      block height in samples
 * imgWidth     number of entries per image row
 *
 * Returns the accumulated |blk.y - img.y| over the whole block.
 *
 * No bounds checking is done: the caller must make sure the displaced
 * region lies inside the storage img points to.
 */
int SAD(int offset_hor,int offset_vert,int mvx,int mvy,struct Block *blk,struct Image *img,unsigned int BLOCK_X,unsigned int BLOCK_Y,int imgWidth){
    int sum = 0;
    unsigned int row;
    unsigned int col;
    // column part of the image index; identical for every row of the block
    const int col_base = offset_hor + mvx;

    for (row = 0; row < BLOCK_Y; row++) {
        // index of the first sample of this row in blk and in img
        const unsigned int blk_row = row * BLOCK_X;
        const int img_row = offset_vert + ((int)row + mvy) * imgWidth + col_base;

        for (col = 0; col < BLOCK_X; col++) {
            int diff = (int)blk[blk_row + col].y - (int)img[img_row + (int)col].y;

            // plain comparison instead of the sign-mask trick: right-shifting a
            // negative int is implementation-defined and the mask assumed 32-bit int
            if (diff < 0) {
                diff = -diff;
            }
            sum += diff;
        }
    }
    return sum;
}

/*
 * Exhaustive block-matching search: evaluates SAD() for every displacement
 * (mvx, mvy) with -X <= mvx < X and -Y <= mvy < Y, where X = 2 * X_BLOCK and
 * Y = 2 * Y_BLOCK, and reports the displacement with the smallest SAD.
 *
 * min_mvy, min_mvx  outputs: displacement of the best match. When several
 *                   candidates share the smallest SAD, the first one visited
 *                   (row-major, starting at (-X, -Y)) is kept. Left untouched
 *                   if the search range is empty.
 * offset_hor, offset_vert, blk, img, BLOCK_X, BLOCK_Y, imgWidth
 *                   forwarded unchanged to SAD().
 *
 * X_BLOCK and Y_BLOCK are not defined in this block; they must be provided
 * elsewhere in the project.
 *
 * NOTE(review): the upper bound of the search range is exclusive, and nothing
 * here keeps the displaced block inside the image, so candidates near the
 * frame border may read outside the frame - confirm the caller guards this.
 */
void   MotionEstimation(int *min_mvy,int *min_mvx,int offset_hor,int offset_vert,struct Block *blk,struct Image *img,unsigned int BLOCK_X,unsigned int BLOCK_Y,int imgWidth)
{
    int temp_SAD;
    int mvy;
    int mvx;
    // one more than the largest possible SAD (255 per sample), so the first
    // candidate is always accepted; equals 65281 for a 16x16 block
    int min_SAD = (int)(255u * BLOCK_X * BLOCK_Y) + 1;
    const int Y = (Y_BLOCK << 1);
    const int X = (X_BLOCK << 1);

    for (mvy = -Y; mvy < Y; mvy++) {
        for (mvx = -X; mvx < X; mvx++) {
            temp_SAD = SAD(offset_hor, offset_vert, mvx, mvy, blk, img, BLOCK_X, BLOCK_Y, imgWidth);

            // A plain compare-and-update replaces the 0xFFFF mask selection:
            // the mask truncated negative vector components to 16 bits and,
            // when temp_SAD == min_SAD, OR-ed the old and new vectors together.
            if (temp_SAD < min_SAD) {
                min_SAD = temp_SAD;
                *min_mvx = mvx;
                *min_mvy = mvy;
            }
        }
    }
}

Best Regards

Giorgos Tsoumplekas

Postgraduate student, University of Athens