Part Number: PROCESSOR-SDK-J784S4
Tool/software:
Hi experts,
I am using j784s4_evm with SDK 10.
I am using core 0 of main R5F cluster 0 with its BTCM memory (accessed through the local address space).
However, the performance is low: about 130 MB/s.
I set up a performance test that copies data of different sizes from DDR to BTCM.
/*######################################################
* DEFINE SECTION
* ----------------------------------------------------*/
/* Number of entries in the test table. */
#define APP_NB_TEST             (6U)

/* Number of timed copies performed for each test size. */
#define APP_NB_ITERATION        (100U)

/* Largest transfer size, in bytes. */
#define APP_MAX_SIZE_BYTE       (30*1024)

/* Size in bytes of one buffer element (uint32_t). */
#define APP_WORD_SIZE           (sizeof(uint32_t))

/* Largest transfer size expressed in buffer elements. */
#define APP_MAX_SIZE_WORD       (APP_MAX_SIZE_BYTE / APP_WORD_SIZE)

/* NOTE(review): presumably the DDR address intended for the data buffer;
 * it is not referenced by the code visible in this listing - confirm. */
#define DDR_DATA_BUFFER_ADDR    (0xAC000000)
/*######################################################
* STRUCTURE
* ----------------------------------------------------*/
/* Describes one benchmark case: the copy size and the time accumulated for it. */
typedef struct {
    /* Number of bytes copied in each iteration of this case. */
    uint32_t          size;
    /* Running sum of the elapsed run-time counter values for this case. */
    volatile uint64_t time;
} test_desc_t;
/*######################################################
* GLOBAL VARIABLES
* ----------------------------------------------------*/
/* Benchmark table: one entry per copy size (in bytes), each starting with
 * an accumulated time of zero. */
test_desc_t test_desc[APP_NB_TEST] = {
    { .size = 10,        .time = 0 },
    { .size = 40,        .time = 0 },
    { .size = 100,       .time = 0 },
    { .size = 1024,      .time = 0 },
    { .size = 10 * 1024, .time = 0 },
    { .size = 30 * 1024, .time = 0 },
};
/* Copy source: 128-byte aligned, emitted into the ".ddr_data" section. */
static uint32_t gSrcBuffer[APP_MAX_SIZE_WORD]
    __attribute__((section(".ddr_data"), aligned(128)));

/* Copy destination: 128-byte aligned, emitted into the ".tcmb_data" section. */
static uint32_t gDstBuffer[APP_MAX_SIZE_WORD]
    __attribute__((section(".tcmb_data"), aligned(128)));

/* Run-time counter value sampled at the start of each timed region. */
volatile uint64_t gStartTime = 0;
/*######################################################
* FUNCTIONS
* ----------------------------------------------------*/
/**
 * \brief Task function that measures how long the CPU takes to copy data
 *        from gSrcBuffer to gDstBuffer.
 *
 * The task is created in main if "__TEST_TCM_ENABLE__" is defined.
 *
 * For every entry of test_desc the copy is repeated APP_NB_ITERATION times.
 * Each timed region covers the source invalidate, the memcpy and the
 * destination write-back. The elapsed counter values are summed in
 * test_desc[i].time and then printed as total time, time per copy and
 * throughput.
 *
 * \param pvParameters  Task argument; not used.
 *
 * \retval None. After the last test the function stays in a delay loop.
 */
void vTaskTestTCM(void *pvParameters)
{
    bool run = true;
    uint32_t i_test = 0;
    uint32_t i_iteration = 0;
    uint32_t i_data = 0;
    uint32_t *srcMemory = &gSrcBuffer[0];
    uint32_t *dstMemory = &gDstBuffer[0];
    uint64_t total_time = 0;
    float time_per_iteration = 0;
    float speed = 0;

    (void) pvParameters;

    PRINT_LOG("\r\n********************************************\r\n");
    PRINT_LOG("\r\n[FreeRTOS] TCM tests ... start !!!\r\n");
    PRINT_LOG("\r\n********************************************\r\n");
    /* Go through uintptr_t and pass exactly the type that %x expects. */
    PRINT_LOG("src buffer @0x%x\n", (unsigned int) (uintptr_t) srcMemory);
    PRINT_LOG("dst buffer @0x%x\n", (unsigned int) (uintptr_t) dstMemory);

    /* Fill the source with a known pattern and push it out of the cache. */
    for (i_data = 0; i_data < APP_MAX_SIZE_WORD; i_data++)
    {
        gSrcBuffer[i_data] = i_data;
    }
    CacheP_wb(gSrcBuffer, APP_MAX_SIZE_BYTE);

    for (i_test = 0; i_test < APP_NB_TEST; i_test++)
    {
        /* Start from zero so a repeated run does not add to stale totals. */
        test_desc[i_test].time = 0;

        for (i_iteration = 0; i_iteration < APP_NB_ITERATION; i_iteration++)
        {
            /* Clearing the destination is outside the timed region. */
            memset(dstMemory, 0, APP_MAX_SIZE_BYTE);

            gStartTime = portGET_RUN_TIME_COUNTER_VALUE();
            /* Invalidate the source range before every copy; the cost of
             * this call is part of the measured time. */
            CacheP_Inv(srcMemory, test_desc[i_test].size);
            memcpy(dstMemory, srcMemory, test_desc[i_test].size);
            /* NOTE(review): if the destination really is TCM, it is
             * presumably not cached, so this write-back would only add to
             * the measured time - confirm before removing it. */
            CacheP_wb(dstMemory, test_desc[i_test].size);
            test_desc[i_test].time += portGET_RUN_TIME_COUNTER_VALUE() - gStartTime;
        }

        total_time = test_desc[i_test].time;

        PRINT_LOG("\n ------------- \n\n");
        PRINT_LOG("Time to cpy data from DDR to BTCM: \n");
        PRINT_LOG(" data size = %u Bytes\n", (unsigned int) test_desc[i_test].size);
        /* NOTE(review): the "us" and "MB/s" labels assume the run-time
         * counter ticks once per microsecond - confirm. */
        PRINT_LOG(" total time = %llu us\n", (unsigned long long) total_time);

        time_per_iteration = (float) total_time / (float) APP_NB_ITERATION;
        /* Very small copies can accumulate zero ticks; report 0 instead of
         * dividing by zero. */
        if (time_per_iteration > 0.0f)
        {
            speed = (float) test_desc[i_test].size / time_per_iteration;
        }
        else
        {
            speed = 0.0f;
        }
        PRINT_LOG(" 1 cpy = %.2f us\n", time_per_iteration);
        PRINT_LOG(" speed = %.2f MB/s\n", speed);
    }

    /* run is never cleared in this function, so the task stays in this
     * loop and the vTaskDelete() call below is not reached. */
    while (run)
    {
        BOARD_delay(1000000);
    }

    vTaskDelete(NULL);
}
This is the output:
[FreeRTOS] TCM test is starting !!!
[FreeRTOS] creating tasks ...
********************************************
[FreeRTOS] TCM tests ... start !!!
********************************************
src buffer @0xa242b600
dst buffer @0x41010080
-------------
Time to cpy data from DDR to BTCM:
data size = 10 Bytes
total time = 66 us
1 cpy = 0.66 us
speed = 15.15 MB/s
-------------
Time to cpy data from DDR to BTCM:
data size = 40 Bytes
total time = 91 us
1 cpy = 0.91 us
speed = 43.96 MB/s
-------------
Time to cpy data from DDR to BTCM:
data size = 100 Bytes
total time = 149 us
1 cpy = 1.49 us
speed = 67.11 MB/s
-------------
Time to cpy data from DDR to BTCM:
data size = 1024 Bytes
total time = 859 us
1 cpy = 8.59 us
speed = 119.21 MB/s
-------------
Time to cpy data from DDR to BTCM:
data size = 10240 Bytes
total time = 7824 us
1 cpy = 78.24 us
speed = 130.88 MB/s
-------------
Time to cpy data from DDR to BTCM:
data size = 30720 Bytes
total time = 23495 us
1 cpy = 234.95 us
speed = 130.75 MB/s
I found this post:
They use the same address space as I do, and they got better performance.
Thanks for your help
Charles