PROCESSOR-SDK-J784S4: TCM performance

Part Number: PROCESSOR-SDK-J784S4

Tool/software:

Hi experts,

I am using j784s4_evm with SDK 10.

I am using main r5f cluster 0 core 0 with its BTCM memory (from local address space). 

But the performance is low: ~130 MB/s. 

I set up a performance test that copies data of different sizes from DDR to BTCM. 

/*######################################################
 *  DEFINE SECTION
 *  ----------------------------------------------------*/

/* Number of entries in the test_desc[] table (one per copy size). */
#define APP_NB_TEST                     (6U)
/* Number of timed copies performed for each copy size. */
#define APP_NB_ITERATION                (100U)
/* Size in bytes of the source and destination buffers; also the largest copy size tested. */
#define APP_MAX_SIZE_BYTE               (30*1024)
/* The buffers are declared as arrays of uint32_t, so one word is 4 bytes. */
#define APP_WORD_SIZE                   (sizeof(uint32_t))
/* Buffer length expressed in uint32_t elements. */
#define APP_MAX_SIZE_WORD               (APP_MAX_SIZE_BYTE / APP_WORD_SIZE)

/* NOTE(review): not referenced by the code shown here; the buffers are placed
 * through section attributes instead. Confirm whether it is still needed. */
#define DDR_DATA_BUFFER_ADDR      (0xAC000000)

/*######################################################
 *  STRUCTURE
 *  ----------------------------------------------------*/

/* One benchmark case: a copy size and the time accumulated while running it. */
typedef struct 
{
  /* Number of bytes copied per iteration for this case. */
  uint32_t size;
  /* Sum of the run-time counter deltas over all iterations of this case.
   * The log prints it as microseconds; the real unit depends on how the
   * run-time counter is configured (not visible here). */
  volatile uint64_t time;

}test_desc_t;

/*######################################################
 *  GLOBAL VARIABLES
 *  ----------------------------------------------------*/

/* Benchmark cases, ordered by increasing copy size in bytes.
 * The accumulated time of every case starts at zero. */
test_desc_t test_desc[APP_NB_TEST] =
{
    { .size = 10,        .time = 0 },
    { .size = 40,        .time = 0 },
    { .size = 100,       .time = 0 },
    { .size = 1024,      .time = 0 },
    { .size = 10 * 1024, .time = 0 },
    { .size = 30 * 1024, .time = 0 },
};

/* Copy source, 128-byte aligned and placed in the ".ddr_data" section.
 * NOTE(review): the section-to-memory mapping comes from the linker script,
 * which is not shown; the section name suggests DDR. */
static uint32_t gSrcBuffer[APP_MAX_SIZE_WORD] __attribute__(( aligned(128), section(".ddr_data")));
/* Copy destination, 128-byte aligned and placed in the ".tcmb_data" section.
 * NOTE(review): presumably mapped to the R5F BTCM by the linker script - confirm. */
static uint32_t gDstBuffer[APP_MAX_SIZE_WORD] __attribute__(( aligned(128), section(".tcmb_data")));
/* Run-time counter value sampled at the start of the current timed copy. */
volatile uint64_t gStartTime = 0;


/*######################################################
 *  FUNCTIONS
 *  ----------------------------------------------------*/


/**
 * \brief   Task function that measures the throughput of CPU copies from
 *          gSrcBuffer to gDstBuffer (DDR to BTCM according to the log text).
 *          The task is created in main if "__TEST_TCM_ENABLE__" is defined.
 *
 *          For each entry of test_desc[], the copy is repeated
 *          APP_NB_ITERATION times. The elapsed run-time counter value of
 *          every copy is accumulated in test_desc[].time, then the total
 *          time, the time per copy and the resulting speed are logged.
 *
 * \param   pvParameters  Task argument; unused.
 *
 * \retval  None. After the last test the task parks itself in an endless
 *          delay loop and never returns.
 */
void vTaskTestTCM(void *pvParameters)
{
    bool run = true;
    uint32_t i_test = 0;
    uint32_t i_iteration = 0;
    uint32_t i_data = 0;
    uint32_t *srcMemory = &gSrcBuffer[0];
    uint32_t *dstMemory = &gDstBuffer[0];
    float time_per_iteration = 0.0f;
    float speed = 0.0f;

    (void) pvParameters;

    PRINT_LOG("\r\n********************************************\r\n");
    PRINT_LOG("\r\n[FreeRTOS] TCM tests ... start !!!\r\n");
    PRINT_LOG("\r\n********************************************\r\n");

    /* Cast to the exact type "%x" expects instead of relying on the
     * underlying type of uint32_t. */
    PRINT_LOG("src buffer @0x%x\n", (unsigned int) (uintptr_t) srcMemory);
    PRINT_LOG("dst buffer @0x%x\n", (unsigned int) (uintptr_t) dstMemory);

    /* Fill the source with a known pattern and write it back so that the
     * backing memory holds the data before the timed copies start. */
    for (i_data = 0; i_data < APP_MAX_SIZE_WORD; i_data++)
    {
        gSrcBuffer[i_data] = i_data;
    }
    CacheP_wb(gSrcBuffer, APP_MAX_SIZE_BYTE);

    for (i_test = 0; i_test < APP_NB_TEST; i_test++)
    {
        const uint32_t copy_size = test_desc[i_test].size;
        uint64_t total_time = 0;

        for (i_iteration = 0; i_iteration < APP_NB_ITERATION; i_iteration++)
        {
            /* Clear the whole destination outside of the timed window. */
            memset(dstMemory, 0, APP_MAX_SIZE_BYTE);

            gStartTime = portGET_RUN_TIME_COUNTER_VALUE();
            /* Invalidate the source range so that the copy does not read
             * lines cached by a previous iteration.
             * NOTE(review): both cache operations are inside the timed
             * window and therefore count against the reported speed. If the
             * destination TCM is not cacheable, the write-back below is pure
             * overhead - confirm and consider moving it out. */
            CacheP_Inv(srcMemory, copy_size);
            memcpy(dstMemory, srcMemory, copy_size);
            CacheP_wb(dstMemory, copy_size);
            test_desc[i_test].time += portGET_RUN_TIME_COUNTER_VALUE() - gStartTime;
        }

        /* Read the volatile accumulator once and report from the copy. */
        total_time = test_desc[i_test].time;

        PRINT_LOG("\n ------------- \n\n");
        PRINT_LOG("Time to cpy data from DDR to BTCM: \n");
        PRINT_LOG("     data size  = %u Bytes\n", (unsigned int) copy_size);
        PRINT_LOG("     total time = %llu us\n", (unsigned long long) total_time);

        time_per_iteration = (float) total_time / (float) APP_NB_ITERATION;
        /* Bytes per microsecond equals MB/s (10^6 bytes per second), provided
         * the counter really ticks in microseconds as the log assumes.
         * A zero total (counter too coarse for the copy) reports 0 instead of
         * dividing by zero. */
        speed = (time_per_iteration > 0.0f)
                    ? ((float) copy_size / time_per_iteration)
                    : 0.0f;

        PRINT_LOG("     1 cpy     = %.2f us\n", (double) time_per_iteration);
        PRINT_LOG("     speed     = %.2f MB/s\n", (double) speed);
    }

    /* Park the task once the results are printed; 'run' is never cleared. */
    while (run)
    {
        BOARD_delay(1000000);
    }

    /* Not reached while the loop above is endless; kept as a safety net so
     * that the task function can never return. */
    vTaskDelete( NULL );
}

This is the output: 

[FreeRTOS] TCM test is starting !!!
[FreeRTOS] creating tasks ...

********************************************

[FreeRTOS] TCM tests ... start !!!

********************************************
src buffer @0xa242b600
dst buffer @0x41010080

 -------------

Time to cpy data from DDR to BTCM:
     data size  = 10 Bytes
     total time = 66 us
     1 cpy     = 0.66 us
     speed     = 15.15 MB/s

 -------------

Time to cpy data from DDR to BTCM:
     data size  = 40 Bytes
     total time = 91 us
     1 cpy     = 0.91 us
     speed     = 43.96 MB/s

 -------------

Time to cpy data from DDR to BTCM:
     data size  = 100 Bytes
     total time = 149 us
     1 cpy     = 1.49 us
     speed     = 67.11 MB/s

 -------------

Time to cpy data from DDR to BTCM:
     data size  = 1024 Bytes
     total time = 859 us
     1 cpy     = 8.59 us
     speed     = 119.21 MB/s

 -------------

Time to cpy data from DDR to BTCM:
     data size  = 10240 Bytes
     total time = 7824 us
     1 cpy     = 78.24 us
     speed     = 130.88 MB/s

 -------------

Time to cpy data from DDR to BTCM:
     data size  = 30720 Bytes
     total time = 23495 us
     1 cpy     = 234.95 us
     speed     = 130.75 MB/s

I found in this post :

AM6442: The memcpy performance between R5 TCM is inferior to that between DDR or DDR and TCM - Processors forum - Processors - TI E2E support forums

They use the same address space as I do, and they got better performance.

Thanks for your help

Charles