This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

AM3358: DDR3 reading is slower than writing

Genius 13655 points
Part Number: AM3358


Hello Champs,

Hardware: AM3352, SPI flash on SPI0, DDR3 @ 400 MHz (800 MT/s).

Software: CCS; AM335X_StarterWare; SPL loads APP project to DDR

Enable MMU and CACHE(including L2)

/*
 * MMU region descriptor used for the DDR range in this test.
 * NOTE(review): the REGION type is not shown here; the per-field notes below
 * are inferred from the macro names only - confirm against the StarterWare
 * MMU header before relying on them.
 */
REGION regionDdr = {
MMU_PGTYPE_SECTION, /* presumably: map the range with section entries */
START_ADDR_DDR, /* presumably: first address of the DDR range */
NUM_SECTIONS_DDR, /* presumably: number of sections covering DDR */
/* NOTE(review): looks like (inner policy, outer policy), i.e. inner
 * write-through/no-write-allocate and outer write-back/write-allocate -
 * confirm the argument order against the macro definition. */
MMU_MEMTYPE_NORMAL_NON_SHAREABLE(MMU_CACHE_WT_NOWA, MMU_CACHE_WB_WA),
MMU_REGION_NON_SECURE, /* presumably: security attribute of the region */
MMU_AP_PRV_RW_USR_RW, /* presumably: read/write for privileged and user modes */
(unsigned int*)pageTable /* page table that receives the entries (defined elsewhere) */
};

Test Result: reading DDR is much slower than writing DDR;

 

32bit Write Test:

32bit 正序Write 160 MB,use 126 Ms; Spd = 1269 MPS. SUM=0xFEC00000

32bit 倒序Write 160 MB,use 164 Ms; Spd = 975 MPS. SUM=0x1400000

 

32bit read Test:

32bit 正序read 160 MB,use 231 Ms; Spd = 692 MPS. SUM=0xFEC00000

32bit 倒序read 160 MB,use 933 Ms; Spd = 171 MPS. SUM=0x1400000

 

8bit Write Test:

8bit 倒序Write 160 MB,use 785 Ms; Spd = 203 MPS. SUM=0x5000000

8bit 正序Write 160 MB,use 811 Ms; Spd = 197 MPS. SUM=0xFB000000

 

8bit Read Test:

8bit 倒序Read 160 MB,use 1792 Ms; Spd = 89 MPS. SUM=0xFB000000

8bit 正序Read 160 MB,use 1089 Ms; Spd = 146 MPS. SUM=0xFB000000

Test code:

/* Whole mebibytes moved by `passes` sweeps of `bytesPerPass` bytes.
 * The product is formed in 64 bits so that large buffers cannot wrap. */
static unsigned int Ddr3SpdTotalMB(unsigned int bytesPerPass, unsigned int passes)
{
    return (unsigned int)(((unsigned long long)bytesPerPass * passes) >> 20);
}

/* Throughput in MB per second. Returns 0 when no tick elapsed, because the
 * interval was too short to measure and dividing by it would be undefined. */
static unsigned int Ddr3SpdRate(unsigned int megabytes, unsigned int elapsedMs)
{
    return (elapsedMs != 0u) ? (megabytes * 1000u) / elapsedMs : 0u;
}

/*
 * Measure CPU read/write throughput over a memory range and print the results.
 *
 * Eight patterns are timed, each sweeping the whole range 10 times:
 * 32-bit write (ascending, descending), 32-bit read (ascending, descending),
 * 8-bit write (descending, ascending), 8-bit read (descending, ascending).
 * In the printed labels, "正序" marks the ascending sweep and "倒序" the
 * descending sweep. Each result line also prints a checksum (SUM) so that the
 * loops have an observable result.
 *
 * iStartAddr  Address of the first byte of the range, converted directly to a
 *             pointer. NOTE(review): assumed to be word-aligned, mapped RAM
 *             that the test may overwrite - confirm at the call site.
 * iSizeBytes  Size of the range in bytes. The 32-bit tests cover
 *             iSizeBytes / 4 whole words.
 *
 * Timing reads gGlobalT12 before and after each pattern.
 * NOTE(review): gGlobalT12 is presumably a free-running millisecond tick
 * (the output labels it "Ms") - confirm.
 *
 * Differences from the earlier version of this test:
 *  - Descending sweeps now visit indices size-1 .. 0. They used to visit
 *    size .. 1, touching one element past the end of the range and never
 *    touching element 0. As a result the descending checksums now equal the
 *    ascending ones.
 *  - The byte total is computed once outside the timed loops instead of being
 *    incremented per access, and can no longer wrap for large ranges.
 *  - A zero-length timing interval no longer causes a division by zero.
 */
void Ddr3SpdTst(unsigned int  iStartAddr, unsigned int iSizeBytes )
{
    const unsigned int passes = 10u;
    const unsigned int words = iSizeBytes / 4u;
    const unsigned int mb32 = Ddr3SpdTotalMB(words * 4u, passes);
    const unsigned int mb8 = Ddr3SpdTotalMB(iSizeBytes, passes);
    unsigned int *p32Dst = (unsigned int *)iStartAddr;
    unsigned char *p8Dst = (unsigned char *)iStartAddr;
    unsigned int i, j, tBgnMs, tEndMs, tUsedMs, tSum;

    //--------------------------------------------------------
    UartPrintf("\n32bit Write Test:\n");

    //------------ 32-bit write, ascending
    tBgnMs = gGlobalT12;
    tSum = 0;
    for (j = 0; j < passes; j++)
    {
        for (i = 0; i < words; i++)
        {
            p32Dst[i] = i;
            tSum += i;
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    UartPrintf("32bit 正序Write %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", mb32, tUsedMs, Ddr3SpdRate(mb32, tUsedMs), tSum);

    //------------ 32-bit write, descending
    tBgnMs = gGlobalT12;
    tSum = 0;
    for (j = 0; j < passes; j++)
    {
        /* i runs words-1 .. 0; the decrement happens before the body. */
        for (i = words; i-- != 0u; )
        {
            p32Dst[i] = i;
            tSum += i;
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    UartPrintf("32bit 倒序Write %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", mb32, tUsedMs, Ddr3SpdRate(mb32, tUsedMs), tSum);

    UartPrintf("\n32bit read Test:\n");

    //------------ 32-bit read, ascending
    tBgnMs = gGlobalT12;
    tSum = 0;
    for (j = 0; j < passes; j++)
    {
        for (i = 0; i < words; i++)
        {
            tSum += p32Dst[i];
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    UartPrintf("32bit 正序read %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", mb32, tUsedMs, Ddr3SpdRate(mb32, tUsedMs), tSum);

    //------------ 32-bit read, descending
    tBgnMs = gGlobalT12;
    tSum = 0;
    for (j = 0; j < passes; j++)
    {
        for (i = words; i-- != 0u; )
        {
            tSum += p32Dst[i];
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    UartPrintf("32bit 倒序read %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", mb32, tUsedMs, Ddr3SpdRate(mb32, tUsedMs), tSum);

    //--------------------------------------------------------
    UartPrintf("\n8bit Write Test:\n");

    //------------ 8-bit write, descending
    tBgnMs = gGlobalT12;
    tSum = 0;
    for (j = 0; j < passes; j++)
    {
        for (i = iSizeBytes; i-- != 0u; )
        {
            p8Dst[i] = (unsigned char)i;
            tSum += i;
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    UartPrintf("8bit 倒序Write %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", mb8, tUsedMs, Ddr3SpdRate(mb8, tUsedMs), tSum);

    //------------ 8-bit write, ascending
    tBgnMs = gGlobalT12;
    tSum = 0;
    for (j = 0; j < passes; j++)
    {
        for (i = 0; i < iSizeBytes; i++)
        {
            p8Dst[i] = (unsigned char)i;
            tSum += i;
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    UartPrintf("8bit 正序Write %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", mb8, tUsedMs, Ddr3SpdRate(mb8, tUsedMs), tSum);

    //--------------------------------------------------------
    UartPrintf("\n8bit Read Test:\n");

    //------------ 8-bit read, descending
    tBgnMs = gGlobalT12;
    tSum = 0;
    for (j = 0; j < passes; j++)
    {
        for (i = iSizeBytes; i-- != 0u; )
        {
            tSum += p8Dst[i];
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    UartPrintf("8bit 倒序Read %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", mb8, tUsedMs, Ddr3SpdRate(mb8, tUsedMs), tSum);

    //------------ 8-bit read, ascending
    tBgnMs = gGlobalT12;
    tSum = 0;
    for (j = 0; j < passes; j++)
    {
        for (i = 0; i < iSizeBytes; i++)
        {
            tSum += p8Dst[i];
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    UartPrintf("8bit 正序Read %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", mb8, tUsedMs, Ddr3SpdRate(mb8, tUsedMs), tSum);
}

 
Thanks.

Rgds

Shine

  • Shine, it looks like your inner cache policy is WriteThrough/NoWriteAllocate while your outer cache policy is WriteBack/WriteAllocate. I believe what may be happening is that you are writing a block to memory, and then reading that same block, but the reads are resulting in cache misses. Can you try setting both inner and outer to WB_WA?

    Also, generally speaking, single cycle reads from the processor (which is what your code is doing) are always going to be slower than writes. You can increase the performance using DMAs.

    Regards,
    James
  • hi JJD,

    Thank you for the reply.

    After modifying the CACHE configuration(MMU_MEMTYPE_NORMAL_NON_SHAREABLE(MMU_CACHE_WB_WA,  MMU_CACHE_WB_WA)), the test results were almost unchanged!

    DMA is not suitable because the data being read is not at fixed locations (the application is image recognition).

    PS: Searching E2E, I found a thread describing a similar problem, but it has no resolution: http://e2e.ti.com/support/processors/f/791/p/187926/707626

  • I'm not sure what the problem is. Starterware may not be setting the MMU or cache correctly.

    BTW, Starterware is no longer supported. It is recommended to move to TI RTOS. See the following link for more info. www.ti.com/.../STARTERWARE-SITARA

    I would see if you can run the same application under the latest TI RTOS and report the results. I will then have a better chance of getting support from the software team.

    Regards,
    James