EDMA cost more time than memcpy()

zhao yanjie

I use EDMA3 to move a frame video data from DDR to DDR,it costs 39ms,but the memcpy() costs only 7ms.Why? could you help me?

My EDMA programe is shown as belown:

#define IMG_WIDTH (720)
#define IMG_HEIGHT (576)
#define EDMA3_CACHE_LINE_SIZE_IN_BYTES (128u)

// buffer in L1 SRAM, 32K
#pragma DATA_SECTION( bufL1, ".L1Buffer" )
#pragma DATA_ALIGN( bufL1, EDMA3_CACHE_LINE_SIZE_IN_BYTES );
signed char bufL1[0x8000];

// buffer in L2 SRAM, 64K
#pragma DATA_SECTION( bufL2, ".L2Buffer" )
#pragma DATA_ALIGN( bufL2, EDMA3_CACHE_LINE_SIZE_IN_BYTES );
signed char bufL2[0x10000];

// buffer in DDR2, 32M
#pragma DATA_SECTION( bufExt1, ".ExtBuffer" )
#pragma DATA_ALIGN( bufExt1, EDMA3_CACHE_LINE_SIZE_IN_BYTES );
signed char bufExt1[0x2000000];

// buffer in DDR2, 32M
#pragma DATA_SECTION( bufExt2, ".ExtBuffer" )
#pragma DATA_ALIGN( bufExt2, EDMA3_CACHE_LINE_SIZE_IN_BYTES );
signed char bufExt2[0x2000000];

void main() {

// set entire external memory to be cacheable (256M)
BCACHE_setMar( (Ptr)(0x80000000), 0x10000000, BCACHE_MAR_ENABLE );

edma3init();
return;
}

void edma3_test() {

// Edma Test for memory to memory copy
LOG_printf( &trace, "Starting EDMA3 examples!\n");

if ( test_2D2D( hEdma, bufExt1, bufExt2 ) != EDMA3_DRV_SOK ) {
LOG_printf( &trace, "test_2D2D() to L2 mem failed!\n");
} else {
LOG_printf( &trace, "test_2D2D() to L2 mem passed!\n");
}

LOG_printf( &trace, "All EDMA3 examples are done!\n");

return;
}
//======================================================================
int check_2D2D( signed char *src, signed char *dst ) {
int i, j;

for ( i = 0; i < IMG_HEIGHT; i++ )
{
    for ( j = 0; j < IMG_WIDTH*2 ; j++ ){
   if ( src[j] != dst[j] ) {
return -1;
   }
}
src += IMG_WIDTH*2;
dst += IMG_WIDTH*2;

}

return 0;
}

int test_2D2D( EDMA3_DRV_Handle hEdma,
signed char *src, signed char *dst ) {
unsigned int chId = 128;
unsigned int tcc = 0;
int i, j;
int fail = 0;
signed char *src1, *dst1;
unsigned int ts, te;
Uint32 startime, endtime;

// prepare data for src and dst buffer
src1 = src;
dst1 = dst;
for ( i = 0; i < IMG_HEIGHT; i++ )
{
    for ( j = 0; j < IMG_WIDTH*2 ; j++ ){
   src1[j] = j;
   dst1[j] = 0;
}
src1 += IMG_WIDTH*2;
dst1 += IMG_WIDTH*2;

}

LOG_printf( &trace, "test_2D2D() started:" );

evtMiss = 0; // clear event miss count

ts = C64P_getltime();
// write back src
BCACHE_wb ( (void *)src, IMG_WIDTH*2*IMG_HEIGHT, 1 ); // wait for it to finish
// invalidate dst
BCACHE_inv( (void *)dst, IMG_WIDTH*2*IMG_HEIGHT, 1 ); // wait for it to finish
te = C64P_getltime();
LOG_printf( &trace, " cache operation cycles = %u", te - ts );

startime = CLK_getltime();
// request channel
tcc = EDMA3_DRV_TCC_ANY;
chId = EDMA3_DRV_DMA_CHANNEL_ANY;
if( EDMA3_DRV_requestChannel ( hEdma, &chId, &tcc, (EDMA3_RM_EventQueue)0,
&callback_my, &myCbData[0] ) != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
}

// Fill the PaRAM Set for transferring Y
// Ideally we only need to do a 2-D transfer: acnt = 1 and bcnt = 720*480.
// however, since bcnt and bindex has to be 16 bits, the transfer is
// splited to 3-D transfer: acnt = 1, bcnt = 720, ccnt = 480

myCbData[0].chId = chId;
myCbData[0].hEdma = hEdma;
myCbData[0].numTrs = IMG_HEIGHT;//1;//
myCbData[0].numTrCnt = 0;

if ( EDMA3_DRV_setSrcParams ( hEdma, chId, (unsigned int)(src),
                                EDMA3_DRV_ADDR_MODE_INCR,
                                EDMA3_DRV_W8BIT ) != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
}
if ( EDMA3_DRV_setDestParams ( hEdma, chId, (unsigned int)(dst),
                                 EDMA3_DRV_ADDR_MODE_INCR,
                                 EDMA3_DRV_W8BIT ) != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
}
if ( EDMA3_DRV_setSrcIndex ( hEdma, chId, 1, IMG_WIDTH*2 ) != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
}
if ( EDMA3_DRV_setDestIndex ( hEdma, chId, 1, IMG_WIDTH*2 ) != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
}
if ( EDMA3_DRV_setTransferParams ( hEdma, chId, 1, IMG_WIDTH*2, IMG_HEIGHT, IMG_WIDTH*2,
                                     EDMA3_DRV_SYNC_AB ) != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
}
if ( EDMA3_DRV_setOptField ( hEdma, chId, EDMA3_DRV_OPT_FIELD_TCINTEN, 1u )
                               != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
}
if ( EDMA3_DRV_setOptField ( hEdma, chId, EDMA3_DRV_OPT_FIELD_ITCINTEN, 1u )
                               != EDMA3_DRV_SOK ) {
fail = -1;
goto func_return;
}

trFinished = 0;
ts = C64P_getltime();
if ( EDMA3_DRV_enableTransfer ( hEdma, chId, EDMA3_DRV_TRIG_MODE_MANUAL )
!= EDMA3_DRV_SOK ) {
fail = -2;
goto func_return;
}
while ( trFinished == 0 ); // wait for transferring Y to finish
endtime = CLK_getltime();
te = C64P_getltime();
LOG_printf(&trace,"edma3 costs time =%d",endtime-startime);
LOG_printf( &trace, " Tran(Y) cycles = %u", te - ts );

// checking correctness
if ( check_2D2D( src, dst ) ) {
fail = -3;
goto func_return;
}

if ( evtMiss ) { // is there any event miss?
    fail = -2;
    goto func_return;
}
//==================================================
    startime = CLK_getltime();

   memcpy(dst,src, IMG_WIDTH*2*IMG_HEIGHT);
endtime = CLK_getltime();
LOG_printf(&trace,"memcpy costs time =%d",endtime-startime);
//==================================================
   return (1);
func_return:
/* if ( chId != EDMA3_DRV_DMA_CHANNEL_ANY ) {
     EDMA3_DRV_freeChannel ( hEdma, chId );
}
*/
return fail;
}

over 13 years ago

0 Prasad Konnur over 13 years ago

TI__Expert 6065 points

Hi,

Are you still facing the issue you mentioned?

In memcpy you are passing the dst and src buffs for data transfer which are cacheble area hence the chche flush is required for the writeback to the DDR locations. And the EDMA hardware will write to the DDR locations directly you might be seeing this difference.

Regarding the EDMA programming you are transferring acnt * bcnt bytes per transfer trigger since your acnt is 1, and you need ccnt number of triggers i am not sure from the code attached if you are seeing the full transfer of acnt * bcnt * ccnt bytes. maybe you can rearrange the PaRAMs making ccnt as 1 and transfer whole data at once, since for smaller data EDMALLD overhead will be more hence you might get slightly better performance for EDMA transfers if you transfer at once.

Also mention on which platform you are running and the EDMA3LLD and BIOS versions.

Regards,
Prasad

0 RCReddy over 13 years ago in reply to Prasad Konnur

Genius 3575 points

Hi,

I am novice in this EDMA. Just wanted to know if you are doing

"data transfer from one chip DDR to another chip DDR"

"data transfer from one chip DDR to same chip DDR" [If so, why is EDMA required, can't i relocate the contents in memory directly..i mean define a array/structure and pushing it to a a certain memory location say xyz[0] is sitting at 0x80000024 and pqr[0] is sitting at 0x80000056 and then do copy directly, is there any advantage i get with edma]

can you give more details on what you are trying to do [i am learning very much here,]

Thanks

RC Reddy

0 RCReddy over 13 years ago in reply to RCReddy

Genius 3575 points

I understand that memcpy involves processor and takes more cycles or time. is there any comparison of throughput of the memcpy v/s EDMA in any Ti forums [is that kind of comparison apt ?].

Thanks

RC Reddy

Processors

Processors forum

EDMA cost more time than memcpy()