Tool/software:
Hi experts,
I am using j784s4_evm with SDK 10.
I am using main r5f cluster 0 core 0 with its BTCM memory (from local address space).
But the performance is low: ~130 MB/s.
I set up a performance test that copies data of different sizes from DDR to BTCM.
/*######################################################
 * DEFINE SECTION
 * ----------------------------------------------------*/
#define APP_NB_TEST             (6U)        /* number of entries in test_desc[] */
#define APP_NB_ITERATION        (100U)      /* timed copies per test entry */
#define APP_MAX_SIZE_BYTE       (30*1024)   /* size of each buffer, in bytes */
#define APP_WORD_SIZE           (sizeof(uint32_t))
#define APP_MAX_SIZE_WORD       (APP_MAX_SIZE_BYTE / APP_WORD_SIZE)
#define DDR_DATA_BUFFER_ADDR    (0xAC000000) /* not referenced by the code in this section */

/*######################################################
 * STRUCTURE
 * ----------------------------------------------------*/
/* One benchmark case: the copy size and the time accumulated over all iterations. */
typedef struct
{
    uint32_t size;          /* number of bytes copied per iteration */
    volatile uint64_t time; /* sum of the measured durations (run-time counter units) */
} test_desc_t;

/*######################################################
 * GLOBAL VARIABLES
 * ----------------------------------------------------*/
test_desc_t test_desc[APP_NB_TEST] =
{
    {10,      0},
    {40,      0},
    {100,     0},
    {1024,    0},
    {10*1024, 0},
    {30*1024, 0}
};

/* Source buffer, placed in the ".ddr_data" linker section. */
static uint32_t gSrcBuffer[APP_MAX_SIZE_WORD] __attribute__(( aligned(128), section(".ddr_data")));
/* Destination buffer, placed in the ".tcmb_data" linker section. */
static uint32_t gDstBuffer[APP_MAX_SIZE_WORD] __attribute__(( aligned(128), section(".tcmb_data")));

/* Counter value sampled at the start of the current timed region. */
volatile uint64_t gStartTime = 0;

/*######################################################
 * FUNCTIONS
 * ----------------------------------------------------*/
/**
 * \brief   Task function that measures the copy throughput from the source
 *          buffer (".ddr_data") to the destination buffer (".tcmb_data").
 *          The task is created in main if "__TEST_TCM_ENABLE__" is defined.
 *
 *          For every entry of test_desc[], the copy is repeated
 *          APP_NB_ITERATION times and the elapsed counter value is
 *          accumulated in test_desc[].time; the per-copy time and the
 *          resulting speed are then logged.
 *
 * \param   pvParameters  Task argument; not used by this function.
 *
 * \retval  None. After logging all results the task idles in a delay loop.
 */
void vTaskTestTCM(void *pvParameters)
{
    bool run = true;
    uint32_t i_test = 0;
    uint32_t i_iteration = 0;
    uint32_t i_data = 0;
    uint32_t *srcMemory = &gSrcBuffer[0];
    uint32_t *dstMemory = &gDstBuffer[0];
    float time_per_iteration = 0;
    float speed = 0;

    (void) pvParameters;

    PRINT_LOG("\r\n********************************************\r\n");
    PRINT_LOG("\r\n[FreeRTOS] TCM tests ... start !!!\r\n");
    PRINT_LOG("\r\n********************************************\r\n");

    /* %x expects an unsigned int: convert the pointers explicitly. */
    PRINT_LOG("src buffer @0x%x\n", (unsigned int) (uintptr_t) srcMemory);
    PRINT_LOG("dst buffer @0x%x\n", (unsigned int) (uintptr_t) dstMemory);

    /* Fill the source with a known pattern and push it out of the cache. */
    for (i_data = 0; i_data < APP_MAX_SIZE_WORD; i_data++)
    {
        gSrcBuffer[i_data] = i_data;
    }
    CacheP_wb(gSrcBuffer, APP_MAX_SIZE_BYTE);

    for (i_test = 0; i_test < APP_NB_TEST; i_test++)
    {
        const uint32_t size = test_desc[i_test].size;

        for (i_iteration = 0; i_iteration < APP_NB_ITERATION; i_iteration++)
        {
            /* Clear the whole destination; this is outside the timed region. */
            memset(dstMemory, 0, APP_MAX_SIZE_BYTE);

            /*
             * Timed region: invalidate of the source range, the copy itself,
             * and write-back of the destination range.
             * NOTE(review): the source is invalidated before every copy, so
             * the measurement never benefits from cached source data, and the
             * destination write-back is counted too. If the destination
             * region is not cacheable the write-back only adds overhead --
             * confirm against the MPU/cache configuration.
             */
            gStartTime = portGET_RUN_TIME_COUNTER_VALUE();
            CacheP_Inv(srcMemory, size);
            memcpy(dstMemory, srcMemory, size);
            CacheP_wb(dstMemory, size);
            test_desc[i_test].time += portGET_RUN_TIME_COUNTER_VALUE() - gStartTime;
        }

        PRINT_LOG("\n ------------- \n\n");
        PRINT_LOG("Time to cpy data from DDR to BTCM: \n");
        /* size is unsigned: print it with %u rather than %d. */
        PRINT_LOG(" data size = %u Bytes\n", (unsigned int) size);
        /* Logged as "us": assumes the run-time counter ticks in microseconds -- TODO confirm. */
        PRINT_LOG(" total time = %llu us\n", (unsigned long long) test_desc[i_test].time);

        time_per_iteration = (float) test_desc[i_test].time / (float) APP_NB_ITERATION;
        /* Bytes per microsecond equals MB/s; report 0 when no time was measured. */
        if (time_per_iteration > 0.0f)
        {
            speed = (float) size / time_per_iteration;
        }
        else
        {
            speed = 0.0f;
        }

        PRINT_LOG(" 1 cpy = %.2f us\n", time_per_iteration);
        PRINT_LOG(" speed = %.2f MB/s\n", speed);
    }

    /* Keep the task alive after the benchmark; 'run' is never cleared here. */
    while (run)
    {
        BOARD_delay(1000000);
    }

    vTaskDelete( NULL );
}
This is the output:
[FreeRTOS] TCM test is starting !!! [FreeRTOS] creating tasks ... ******************************************** [FreeRTOS] TCM tests ... start !!! ******************************************** src buffer @0xa242b600 dst buffer @0x41010080 ------------- Time to cpy data from DDR to BTCM: data size = 10 Bytes total time = 66 us 1 cpy = 0.66 us speed = 15.15 MB/s ------------- Time to cpy data from DDR to BTCM: data size = 40 Bytes total time = 91 us 1 cpy = 0.91 us speed = 43.96 MB/s ------------- Time to cpy data from DDR to BTCM: data size = 100 Bytes total time = 149 us 1 cpy = 1.49 us speed = 67.11 MB/s ------------- Time to cpy data from DDR to BTCM: data size = 1024 Bytes total time = 859 us 1 cpy = 8.59 us speed = 119.21 MB/s ------------- Time to cpy data from DDR to BTCM: data size = 10240 Bytes total time = 7824 us 1 cpy = 78.24 us speed = 130.88 MB/s ------------- Time to cpy data from DDR to BTCM: data size = 30720 Bytes total time = 23495 us 1 cpy = 234.95 us speed = 130.75 MB/s
I found in this post :
they use the local address space like me, but they got better performance.
Thanks for your help
Charles