This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

EDMA cost more time than memcpy()

I use EDMA3 to move a frame video data  from DDR to DDR,it costs 39ms,but the  memcpy()  costs only 7ms.Why? could you help me?

My EDMA programe is shown as belown:

#define IMG_WIDTH  (720)
#define IMG_HEIGHT (576)
#define EDMA3_CACHE_LINE_SIZE_IN_BYTES      (128u)

// buffer in L1 SRAM, 32K
#pragma DATA_SECTION( bufL1, ".L1Buffer" )
#pragma DATA_ALIGN( bufL1, EDMA3_CACHE_LINE_SIZE_IN_BYTES );
signed char bufL1[0x8000];

// buffer in L2 SRAM, 64K
#pragma DATA_SECTION( bufL2, ".L2Buffer" )
#pragma DATA_ALIGN( bufL2, EDMA3_CACHE_LINE_SIZE_IN_BYTES );
signed char bufL2[0x10000];

// buffer in DDR2, 32M
#pragma DATA_SECTION( bufExt1, ".ExtBuffer" )
#pragma DATA_ALIGN( bufExt1, EDMA3_CACHE_LINE_SIZE_IN_BYTES );
signed char bufExt1[0x2000000];

// buffer in DDR2, 32M
#pragma DATA_SECTION( bufExt2, ".ExtBuffer" )
#pragma DATA_ALIGN( bufExt2, EDMA3_CACHE_LINE_SIZE_IN_BYTES );
signed char bufExt2[0x2000000];

void main() {

  // set entire external memory to be cacheable (256M)
  BCACHE_setMar( (Ptr)(0x80000000), 0x10000000, BCACHE_MAR_ENABLE );

  edma3init();
  return;
}

void edma3_test() {

  // Edma Test for memory to memory copy
  LOG_printf( &trace, "Starting EDMA3 examples!\n");

  if ( test_2D2D( hEdma, bufExt1, bufExt2 ) != EDMA3_DRV_SOK ) {
    LOG_printf( &trace, "test_2D2D() to L2 mem failed!\n");
  } else {
    LOG_printf( &trace, "test_2D2D() to L2 mem passed!\n");
  }

LOG_printf( &trace, "All EDMA3 examples are done!\n");

  return;
}
//======================================================================
int check_2D2D( signed char *src, signed char *dst ) {
  int i, j;

  for ( i = 0; i < IMG_HEIGHT; i++ )
  {
    for ( j = 0; j < IMG_WIDTH*2 ; j++ ){
   if ( src[j] != dst[j] ) {
  return -1;
   }
}
  src += IMG_WIDTH*2;
dst += IMG_WIDTH*2;

}

  return 0;
}

int test_2D2D( EDMA3_DRV_Handle hEdma,
               signed char *src, signed char *dst ) {
  unsigned int chId = 128;
  unsigned int tcc = 0;
  int i, j;
  int fail = 0;
  signed char *src1, *dst1;
  unsigned int ts, te;
   Uint32 startime, endtime;

  // prepare data for src and dst buffer
  src1 = src;
  dst1 = dst;
  for ( i = 0; i < IMG_HEIGHT; i++ )
  {
    for ( j = 0; j < IMG_WIDTH*2 ; j++ ){
   src1[j] = j;
   dst1[j] = 0;
}
  src1 += IMG_WIDTH*2;
dst1 += IMG_WIDTH*2;

  }

  LOG_printf( &trace, "test_2D2D() started:" );

  evtMiss = 0; // clear event miss count

  ts = C64P_getltime();
  // write back src
  BCACHE_wb ( (void *)src, IMG_WIDTH*2*IMG_HEIGHT, 1 ); // wait for it to finish
  // invalidate dst
  BCACHE_inv( (void *)dst, IMG_WIDTH*2*IMG_HEIGHT, 1 ); // wait for it to finish
  te = C64P_getltime();
  LOG_printf( &trace, "    cache operation cycles = %u", te - ts );

  startime = CLK_getltime();
   // request channel
  tcc = EDMA3_DRV_TCC_ANY;
  chId = EDMA3_DRV_DMA_CHANNEL_ANY;
  if( EDMA3_DRV_requestChannel ( hEdma, &chId, &tcc, (EDMA3_RM_EventQueue)0,
                                 &callback_my, &myCbData[0] ) != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
  }

  // Fill the PaRAM Set for transferring Y
  // Ideally we only need to do a 2-D transfer: acnt = 1 and bcnt = 720*480.
  // however, since bcnt and bindex has to be 16 bits, the transfer is
  // splited to 3-D transfer: acnt = 1, bcnt = 720, ccnt = 480
 
  myCbData[0].chId = chId;
  myCbData[0].hEdma = hEdma;
  myCbData[0].numTrs = IMG_HEIGHT;//1;//
  myCbData[0].numTrCnt = 0;

  if ( EDMA3_DRV_setSrcParams ( hEdma, chId, (unsigned int)(src),
                                EDMA3_DRV_ADDR_MODE_INCR,
                                EDMA3_DRV_W8BIT ) != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
  }
  if ( EDMA3_DRV_setDestParams ( hEdma, chId, (unsigned int)(dst),
                                 EDMA3_DRV_ADDR_MODE_INCR,
                                 EDMA3_DRV_W8BIT ) != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
  }
  if ( EDMA3_DRV_setSrcIndex ( hEdma, chId, 1, IMG_WIDTH*2 ) != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
  }
  if ( EDMA3_DRV_setDestIndex ( hEdma, chId, 1, IMG_WIDTH*2 ) != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
  }
  if ( EDMA3_DRV_setTransferParams ( hEdma, chId, 1, IMG_WIDTH*2, IMG_HEIGHT, IMG_WIDTH*2,
                                     EDMA3_DRV_SYNC_AB ) != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
  }
  if ( EDMA3_DRV_setOptField ( hEdma, chId, EDMA3_DRV_OPT_FIELD_TCINTEN, 1u )
                               != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
  }
  if ( EDMA3_DRV_setOptField ( hEdma, chId, EDMA3_DRV_OPT_FIELD_ITCINTEN, 1u )
                               != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
  }

  trFinished = 0;
  ts = C64P_getltime();
  if ( EDMA3_DRV_enableTransfer ( hEdma, chId, EDMA3_DRV_TRIG_MODE_MANUAL )
       != EDMA3_DRV_SOK ) {
fail = -2;
goto func_return;
  }
  while ( trFinished == 0 ); // wait for transferring Y to finish
  endtime = CLK_getltime();
  te = C64P_getltime();
  LOG_printf(&trace,"edma3 costs time =%d",endtime-startime);
  LOG_printf( &trace, "    Tran(Y) cycles = %u", te - ts );

  // checking correctness
  if ( check_2D2D( src, dst ) ) {
    fail = -3; 
    goto func_return;
  }

  if ( evtMiss ) { // is there any event miss?
    fail = -2;
    goto func_return;
  }
//==================================================
    startime = CLK_getltime();

   memcpy(dst,src, IMG_WIDTH*2*IMG_HEIGHT);
  endtime = CLK_getltime();
  LOG_printf(&trace,"memcpy costs time =%d",endtime-startime);
//==================================================
   return (1);
func_return:
/*  if ( chId != EDMA3_DRV_DMA_CHANNEL_ANY ) {
     EDMA3_DRV_freeChannel ( hEdma, chId );
  }
  */
  return fail;
}

  • Hi,

    Are you still facing the issue you mentioned?

    In memcpy you are passing the dst and src buffs for data transfer which are cacheble area hence the chche flush is required for the writeback to the DDR locations. And the EDMA hardware will write to the DDR locations directly you might be seeing this difference.

    Regarding the EDMA programming you are transferring acnt * bcnt bytes per transfer trigger since your acnt is 1, and you need ccnt number of triggers i am not sure from the code attached if you are seeing the full transfer of acnt * bcnt * ccnt bytes. maybe you can rearrange the PaRAMs making ccnt as 1 and transfer whole data at once, since for smaller data EDMALLD overhead will be more hence you might get slightly better performance for EDMA transfers if you transfer at once.

    Also mention on which platform you are running and the EDMA3LLD and BIOS versions.

    Regards,
    Prasad

  • Hi,

    I am novice in this EDMA. Just wanted to know if you are doing

    "data transfer from one chip DDR to another chip DDR"

    or

    "data transfer from one chip DDR to same chip DDR" [If so, why is EDMA required, can't i relocate the contents in memory directly..i mean define a array/structure and pushing it to a a certain memory location say xyz[0] is sitting at 0x80000024 and pqr[0] is sitting at 0x80000056 and then do copy directly, is there any advantage i get with edma]

    can you give more details on what you are trying to do [i am learning very much here,]

    Thanks

    RC Reddy

  • I understand that memcpy involves processor and takes more cycles or time. is there any comparison of throughput of the memcpy v/s EDMA in any Ti forums [is that kind of comparison apt ?].

    Thanks

    RC Reddy