Tool/software:
Hi Team,
How can I optimize DSPLIB_add_example.cpp code to reach the performance levels mentioned in the datasheet summary?
This thread has been locked.
If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.
Tool/software:
Hi Team,
How can I optimize DSPLIB_add_example.cpp code to reach the performance levels mentioned in the datasheet summary?
For this test, we are using the DSPLIB_add kernel example from "ti-processor-sdk-rtos-j784s4-evm-11_00_00_06\dsplib\examples\DSPLIB_add". The objective is to enhance the performance of the standalone code by enabling cache.
The example code is given below:
#include "dsplib.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Standalone DSPLIB_add example.
 *
 * Adds two float32 vectors element-wise with the optimized DSPLIB_add kernel,
 * warms the caches and branch predictor first, then times a single kernel
 * call with the __TSC counter and prints the cycle count and the results.
 *
 * Returns 0 on success, 1 if the kernel handle cannot be allocated or a
 * DSPLIB call reports a status other than DSPLIB_SUCCESS.
 */
int main(void)
{
    /* NOTE: each in0 row carries 17 values, so in0 is longer than `size`;
     * only the first `size` elements are processed and printed. */
    float in0[] = {0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1
    };
    float in1[] = {0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0
    };
    float out[256] = {0.0};
    uint32_t size = 256;

    // handles and struct for call to kernel
    DSPLIB_STATUS status = DSPLIB_SUCCESS;
    DSPLIB_add_InitArgs kerInitArgs;
    DSPLIB_bufParams1D_t bufParamsIn, bufParamsOut;

    // fill in input and output buffer parameters
    bufParamsIn.data_type = DSPLIB_FLOAT32;
    bufParamsIn.dim_x = size;
    bufParamsOut.data_type = DSPLIB_FLOAT32;
    bufParamsOut.dim_x = size;

    // populate the init args BEFORE handing them to any DSPLIB call, so the
    // library never sees indeterminate fields
    kerInitArgs.dataSize = size;
    kerInitArgs.funcStyle = DSPLIB_FUNCTION_OPTIMIZED;

    int32_t handleSize = DSPLIB_add_getHandleSize(&kerInitArgs);
    DSPLIB_kernelHandle handle = (handleSize > 0) ? malloc((size_t) handleSize) : NULL;
    if (handle == NULL) {
        printf("Failed to allocate %ld bytes for the DSPLIB_add handle\n", (long) handleSize);
        return 1;
    }

    // init checkparams (disabled in this example)
    // status = DSPLIB_add_init_checkParams(handle, &bufParamsIn, &bufParamsOut, &kerInitArgs);

    // init
    status = DSPLIB_add_init(handle, &bufParamsIn, &bufParamsOut, &kerInitArgs);

#if 1 // RDP
    // Touch the buffers so they are pulled towards L1D before timing.
    // The buffers are walked as bytes to stay data-type agnostic, so the
    // trip count must be the BYTE length of the processed region
    // (size elements * sizeof(float)), not the element count.
    {
        const size_t numBytes = (size_t) size * sizeof(float);
        const uint8_t *pOutTemp = (const uint8_t *) out;
        const uint8_t *pInTemp0 = (const uint8_t *) in0;
        const uint8_t *pInTemp1 = (const uint8_t *) in1;
        uint32_t touchSum = 0;
        size_t k;

        for (k = 0; k < numBytes; k++) {
            touchSum += (uint32_t) pOutTemp[k] + pInTemp0[k] + pInTemp1[k];
        }

        // Store the sum to a volatile object so the reads above cannot be
        // discarded as dead code by the optimizer.
        volatile uint32_t touchSink = touchSum;
        (void) touchSink;
    }
#endif

#if 1 // RDP
    /* RDP: call the kernel repeatedly before the timed run to warm the
     * instruction cache and train the branch predictor. This is loosely
     * based on how the DSPLIB test code does this. */
    {
        uint32_t j;
        for (j = 0; (j < 10) && (status == DSPLIB_SUCCESS); j++) {
            status = DSPLIB_add_exec(handle, in0, in1, out);
        }
    }
#endif

    // exec checkparams (disabled in this example)
    // DSPLIB_add_exec_checkParams(handle, in0, in1, out);

    // exec: timed run, bracketed by reads of the __TSC cycle counter
    unsigned long start_time = 0;
    unsigned long stop_time = 0;
    if (status == DSPLIB_SUCCESS) {
        start_time = __TSC;
        status = DSPLIB_add_exec(handle, in0, in1, out);
        stop_time = __TSC;
    }

    if (status != DSPLIB_SUCCESS) {
        // do not report a cycle count or results that were never produced
        printf("DSPLIB_add failed with status %ld\n", (long) status);
        free(handle);
        return 1;
    }

    printf("\nNumber of clock cycles elapsed in %lu\n", stop_time - start_time);

    // print results
    size_t c;
    for (c = 0; c < size; c++) {
        printf("%10g + %10g = %10g\n", in0[c], in1[c], out[c]);
    }

    free(handle);
    return 0;
}
Output:
When the TSC register is used inside the code to measure the cycle count, the following counts were observed on each core:
| DSP Core | C71x_0 | C71x_1 | C71x_2 | C71x_3 |
|---|---|---|---|---|
| Cycle Count | 368 | 368 | 368 | 368 |
These are the expected cycle counts from the dsplib userguide performance summary:
Observation:
The cycle count has increased by 3.64 times compared to the expected value from the performance report.
Reason:
An increase in cycle counts is observed in the standalone code compared to the test code, as cache is enabled in the test code, which helps reduce cycle counts.
Solution:
The solution is enabling the cache and mmu within the standalone example code.
Steps to enable cache within standalone code:
#include "dsplib.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <enable_cache_mmu.h>
#include <invalidate_tlb.h>
#include <DSPLIB_TEST_c7xecr.h>

#if !defined(_HOST_BUILD)
// level-0 page table handed to enable_cache_mmu(); defined outside this file
extern const uint64_t pte_lvl0[512];
#endif

/*
 * Standalone DSPLIB_add example with cache and MMU enabled.
 *
 * On target builds the MMU/cache are enabled and the TLB is invalidated
 * first. The program then adds two float32 vectors element-wise with the
 * optimized DSPLIB_add kernel, warms the caches and branch predictor, times
 * a single kernel call with the __TSC counter and prints the cycle count and
 * the results.
 *
 * Returns 0 on success, 1 if the kernel handle cannot be allocated or a
 * DSPLIB call reports a status other than DSPLIB_SUCCESS.
 */
int main(void)
{
#if __C7X_VEC_SIZE_BITS__ == 512
    // Known silicon bug, errata: TBD
    __sa_set_cr(0, __sa_get_cr(1));
#endif

#if !defined(_HOST_BUILD)
    // enable MMU
    enable_cache_mmu((uint64_t) pte_lvl0);
    // invalidate TLB
    invalidate_tlb();
#endif

    /* NOTE: each in0 row carries 17 values, so in0 is longer than `size`;
     * only the first `size` elements are processed and printed. */
    float in0[] = {0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                   0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                   0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542, 0.97309674, 0.79839982, 0.06691247, 1, 0.1
    };
    float in1[] = {0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                   0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                   5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0., 0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0
    };
    float out[256] = {0.0};
    uint32_t size = 256;

    // handles and struct for call to kernel
    DSPLIB_STATUS status = DSPLIB_SUCCESS;
    DSPLIB_add_InitArgs kerInitArgs;
    DSPLIB_bufParams1D_t bufParamsIn, bufParamsOut;

    // fill in input and output buffer parameters
    bufParamsIn.data_type = DSPLIB_FLOAT32;
    bufParamsIn.dim_x = size;
    bufParamsOut.data_type = DSPLIB_FLOAT32;
    bufParamsOut.dim_x = size;

    // populate the init args BEFORE handing them to any DSPLIB call, so the
    // library never sees indeterminate fields
    kerInitArgs.dataSize = size;
    kerInitArgs.funcStyle = DSPLIB_FUNCTION_OPTIMIZED;

    int32_t handleSize = DSPLIB_add_getHandleSize(&kerInitArgs);
    DSPLIB_kernelHandle handle = (handleSize > 0) ? malloc((size_t) handleSize) : NULL;
    if (handle == NULL) {
        printf("Failed to allocate %ld bytes for the DSPLIB_add handle\n", (long) handleSize);
        return 1;
    }

    // init checkparams (disabled in this example)
    // status = DSPLIB_add_init_checkParams(handle, &bufParamsIn, &bufParamsOut, &kerInitArgs);

    // init
    status = DSPLIB_add_init(handle, &bufParamsIn, &bufParamsOut, &kerInitArgs);

#if 1 // RDP
    // Touch the buffers so they are pulled towards L1D before timing.
    // The buffers are walked as bytes to stay data-type agnostic, so the
    // trip count must be the BYTE length of the processed region
    // (size elements * sizeof(float)), not the element count.
    {
        const size_t numBytes = (size_t) size * sizeof(float);
        const uint8_t *pOutTemp = (const uint8_t *) out;
        const uint8_t *pInTemp0 = (const uint8_t *) in0;
        const uint8_t *pInTemp1 = (const uint8_t *) in1;
        uint32_t touchSum = 0;
        size_t k;

        for (k = 0; k < numBytes; k++) {
            touchSum += (uint32_t) pOutTemp[k] + pInTemp0[k] + pInTemp1[k];
        }

        // Store the sum to a volatile object so the reads above cannot be
        // discarded as dead code by the optimizer.
        volatile uint32_t touchSink = touchSum;
        (void) touchSink;
    }
#endif

#if 1 // RDP
    /* RDP: call the kernel repeatedly before the timed run to warm the
     * instruction cache and train the branch predictor. This is loosely
     * based on how the DSPLIB test code does this. */
    {
        uint32_t j;
        for (j = 0; (j < 10) && (status == DSPLIB_SUCCESS); j++) {
            status = DSPLIB_add_exec(handle, in0, in1, out);
        }
    }
#endif

    // exec checkparams (disabled in this example)
    // DSPLIB_add_exec_checkParams(handle, in0, in1, out);

    // exec: timed run, bracketed by reads of the __TSC cycle counter
    unsigned long start_time = 0;
    unsigned long stop_time = 0;
    if (status == DSPLIB_SUCCESS) {
        start_time = __TSC;
        status = DSPLIB_add_exec(handle, in0, in1, out);
        stop_time = __TSC;
    }

    if (status != DSPLIB_SUCCESS) {
        // do not report a cycle count or results that were never produced
        printf("DSPLIB_add failed with status %ld\n", (long) status);
        free(handle);
        return 1;
    }

    printf("\nNumber of clock cycles elapsed in %lu\n", stop_time - start_time);

    // print results
    size_t c;
    for (c = 0; c < size; c++) {
        printf("%10g + %10g = %10g\n", in0[c], in1[c], out[c]);
    }

    free(handle);
    return 0;
}
Updated Performance:
| DSP Core | C71x_0 | C71x_1 | C71x_2 | C71x_3 |
|---|---|---|---|---|
| Cycle Count | 141 | 141 | 141 | 141 |
Observation:
There is a 2.6 times decrease in cycle count when the cache and MMU are enabled within the code, and the cycle counts are now close to the reference count given in the DSPLIB performance summary.
Regards,