This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

[FAQ] TDA4VH-Q1: How to enable cache and MMU in standalone C7x code to maximize performance

Part Number: TDA4VH-Q1

Tool/software:

Hi Team,

How can I optimize the DSPLIB_add_example.cpp code to reach the performance levels mentioned in the performance summary of the DSPLIB user guide?

  • For this test, we are using the DSPLIB_add kernel example from "ti-processor-sdk-rtos-j784s4-evm-11_00_00_06\dsplib\examples\DSPLIB_add". The objective is to enhance the performance of the standalone code by enabling cache. 

    The example code is given below: 

    #include "dsplib.h"
    #include <stdint.h>
    /*
     * Standalone DSPLIB_add example.
     *
     * Adds two float32 vectors of `size` elements with the optimized DSPLIB_add
     * kernel, measures a single DSPLIB_add_exec() call with the __TSC cycle
     * counter, and prints the cycle count followed by every input/output triple.
     *
     * Returns 0 on success and 1 if the kernel handle cannot be allocated or a
     * DSPLIB call reports an error.
     */
    int main(void)
    {
    
       float in0[] = {0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                      0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                      0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                      0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                      0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                      0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                      0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                      0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                      0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                      0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                      0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                      0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                      0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                      0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                      0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                      0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1
       };
    
       float in1[] = {0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                      5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                      0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                      5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                      0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                      5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                      0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                      5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                      0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                      5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                      0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                      5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                      0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                      5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                      0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                      5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0
       };
    
       float out[256] = {0.0};
       uint32_t size = 256;
    
       // status and argument structs for the calls to the kernel
       DSPLIB_STATUS        status = DSPLIB_SUCCESS;
       DSPLIB_add_InitArgs  kerInitArgs;
       DSPLIB_bufParams1D_t bufParamsIn, bufParamsOut;
    
       // fill in the init arguments BEFORE they are passed to any DSPLIB call,
       // so DSPLIB_add_getHandleSize() never sees an uninitialized struct
       kerInitArgs.dataSize  = size;
       kerInitArgs.funcStyle = DSPLIB_FUNCTION_OPTIMIZED;
    
       // fill in input and output buffer parameters
       bufParamsIn.data_type = DSPLIB_FLOAT32;
       bufParamsIn.dim_x     = size;
    
       bufParamsOut.data_type = DSPLIB_FLOAT32;
       bufParamsOut.dim_x     = size;
    
       // allocate the kernel handle
       int32_t             handleSize = DSPLIB_add_getHandleSize(&kerInitArgs);
       DSPLIB_kernelHandle handle     = malloc(handleSize);
       if (handle == NULL) {
          printf("Failed to allocate %d bytes for the kernel handle\n", (int) handleSize);
          return 1;
       }
    
       // init checkparams (optional)
       // if (status == DSPLIB_SUCCESS)
       //    status = DSPLIB_add_init_checkParams(handle, &bufParamsIn, &bufParamsOut, &kerInitArgs);
    
       // init
       status = DSPLIB_add_init(handle, &bufParamsIn, &bufParamsOut, &kerInitArgs);
    
       // Read every byte of the buffers the kernel will use (size floats each),
       // so that the data has been touched once before the timed run.
       // Buffers are viewed as bytes to stay data-type agnostic.
       uint32_t       k;
       uint32_t       numBytes  = size * (uint32_t) sizeof(out[0]);
       const uint8_t *pOutBytes = (const uint8_t *) out;
       const uint8_t *pIn0Bytes = (const uint8_t *) in0;
       const uint8_t *pIn1Bytes = (const uint8_t *) in1;
       uint32_t       byteSum   = 0;
    
       for (k = 0; k < numBytes; k++) {
          byteSum += (uint32_t) pOutBytes[k] + pIn0Bytes[k] + pIn1Bytes[k];
       }
    
       // dummy volatile store so the compiler cannot remove the loop above
       volatile uint32_t byteSumSink = byteSum;
       (void) byteSumSink;
    
       // Untimed warm-up: call the kernel repeatedly before the measured call.
       // The loop stops early if init or any exec call reported an error.
       uint32_t j;
       for (j = 0; (j < 10) && (status == DSPLIB_SUCCESS); j++) {
          status = DSPLIB_add_exec(handle, in0, in1, out);
       }
    
       // exec checkparams (optional)
       // if (status == DSPLIB_SUCCESS)
       //    DSPLIB_add_exec_checkParams(handle, in0, in1, out);
    
       if (status != DSPLIB_SUCCESS) {
          printf("DSPLIB_add setup failed with status %d\n", (int) status);
          free(handle);
          return 1;
       }
    
       // timed exec: only this single call is measured
       unsigned long start_time = __TSC;
       status                   = DSPLIB_add_exec(handle, in0, in1, out);
       unsigned long stop_time  = __TSC;
    
       if (status != DSPLIB_SUCCESS) {
          printf("DSPLIB_add_exec failed with status %d\n", (int) status);
          free(handle);
          return 1;
       }
    
       printf("\nNumber of clock cycles elapsed in %lu\n", stop_time - start_time);
    
       // print results
       size_t c;
       for (c = 0; c < size; c++) {
          printf("%10g + %10g = %10g\n", in0[c], in1[c], out[c]);
       }
    
       free(handle);
       return 0;
    }
    


    Output:

    When the TSC register is used inside the code to measure the cycle count, the following counts were observed on each core:

    | DSP Core    | C71x_0 | C71x_1 | C71x_2 | C71x_3 |
    |-------------|--------|--------|--------|--------|
    | Cycle Count | 368    | 368    | 368    | 368    |

     

    These are the expected cycle counts from the dsplib userguide performance summary: 



    Observation: 

    The measured cycle count (368) is about 3.64 times the expected value from the performance report, which puts the reference count at roughly 101 cycles.

    Reason:  

    The standalone code takes more cycles than the DSPLIB test code because the test code enables the cache and MMU before running the kernel, whereas the standalone example does not.

    Solution: 

    Enable the cache and MMU within the standalone example code.

    Steps to enable cache within standalone code: 

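    1. Add the cache/MMU initialization lines from dsplib/test/common/c71/DSPLIB_TEST_init.c to the standalone code: the extern declaration of pte_lvl0 and the calls to enable_cache_mmu() and invalidate_tlb(), as shown in the updated code below.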


    2. Copy the files below from dsplib\test\common\c71 into the folder containing the main file:
                   c7x_simple_l1_l2_msmc_ddr_ptc.c 
                   DSPLIB_TEST_c7xecr.{h,asm}  
                   enable_cache_mmu.{h,c} 
                   invalidate_tlb.{h,c} 

      Updated code to enable cache:
       
      #include "dsplib.h"
      #include <stdint.h>
      #include <enable_cache_mmu.h>
      #include <invalidate_tlb.h>
      #include <DSPLIB_TEST_c7xecr.h>
      #if !defined(_HOST_BUILD)
      extern const uint64_t pte_lvl0[512];
      #endif
      
      /*
       * Standalone DSPLIB_add example with cache and MMU enabled.
       *
       * On target builds (when _HOST_BUILD is not defined) it first calls
       * enable_cache_mmu() with the pte_lvl0 page table and invalidate_tlb().
       * It then adds two float32 vectors of `size` elements with the optimized
       * DSPLIB_add kernel, measures a single DSPLIB_add_exec() call with the
       * __TSC cycle counter, and prints the cycle count followed by every
       * input/output triple.
       *
       * Returns 0 on success and 1 if the kernel handle cannot be allocated or a
       * DSPLIB call reports an error.
       */
      int main(void)
      {
      #if __C7X_VEC_SIZE_BITS__ == 512
         // Known silicon bug, errata: TBD
         __sa_set_cr(0, __sa_get_cr(1));
      #endif
      
      #if !defined(_HOST_BUILD)
         // enable cache and MMU using the level-0 page table
         enable_cache_mmu((uint64_t) pte_lvl0);
      
         // invalidate TLB
         invalidate_tlb();
      #endif
      
         float in0[] = {0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                        0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                        0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                        0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                        0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                        0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                        0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                        0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                        0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                        0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                        0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                        0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                        0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                        0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1,
                        0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0,0.71649936, 0.13543484, 0.50923542, 0.54119591, 0.19242506, 0.38308575, 0.56363197, 0, 0.99,
                        0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1,0.24567145, 0.05629663, 0.99152828, 0.4799542,  0.97309674, 0.79839982, 0.06691247, 1, 0.1
         };
      
         float in1[] = {0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                        5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                        0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                        5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                        0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                        5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                        0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                        5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                        0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                        5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                        0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                        5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                        0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                        5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0,
                        0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 4.71238898, 1,
                        5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0, 5.49778714, 0.,         0.78539816, 1.57079633, 2.35619449, 3.14159265, 3.92699082, 0
         };
      
         float out[256] = {0.0};
         uint32_t size = 256;
      
         // status and argument structs for the calls to the kernel
         DSPLIB_STATUS        status = DSPLIB_SUCCESS;
         DSPLIB_add_InitArgs  kerInitArgs;
         DSPLIB_bufParams1D_t bufParamsIn, bufParamsOut;
      
         // fill in the init arguments BEFORE they are passed to any DSPLIB call,
         // so DSPLIB_add_getHandleSize() never sees an uninitialized struct
         kerInitArgs.dataSize  = size;
         kerInitArgs.funcStyle = DSPLIB_FUNCTION_OPTIMIZED;
      
         // fill in input and output buffer parameters
         bufParamsIn.data_type = DSPLIB_FLOAT32;
         bufParamsIn.dim_x     = size;
      
         bufParamsOut.data_type = DSPLIB_FLOAT32;
         bufParamsOut.dim_x     = size;
      
         // allocate the kernel handle
         int32_t             handleSize = DSPLIB_add_getHandleSize(&kerInitArgs);
         DSPLIB_kernelHandle handle     = malloc(handleSize);
         if (handle == NULL) {
            printf("Failed to allocate %d bytes for the kernel handle\n", (int) handleSize);
            return 1;
         }
      
         // init checkparams (optional)
         // if (status == DSPLIB_SUCCESS)
         //    status = DSPLIB_add_init_checkParams(handle, &bufParamsIn, &bufParamsOut, &kerInitArgs);
      
         // init
         status = DSPLIB_add_init(handle, &bufParamsIn, &bufParamsOut, &kerInitArgs);
      
         // Read every byte of the buffers the kernel will use (size floats each),
         // so the data is pulled into the now-enabled cache before the timed run.
         // Buffers are viewed as bytes to stay data-type agnostic.
         uint32_t       k;
         uint32_t       numBytes  = size * (uint32_t) sizeof(out[0]);
         const uint8_t *pOutBytes = (const uint8_t *) out;
         const uint8_t *pIn0Bytes = (const uint8_t *) in0;
         const uint8_t *pIn1Bytes = (const uint8_t *) in1;
         uint32_t       byteSum   = 0;
      
         for (k = 0; k < numBytes; k++) {
            byteSum += (uint32_t) pOutBytes[k] + pIn0Bytes[k] + pIn1Bytes[k];
         }
      
         // dummy volatile store so the compiler cannot remove the loop above
         volatile uint32_t byteSumSink = byteSum;
         (void) byteSumSink;
      
         // Untimed warm-up: call the kernel repeatedly before the measured call.
         // The loop stops early if init or any exec call reported an error.
         uint32_t j;
         for (j = 0; (j < 10) && (status == DSPLIB_SUCCESS); j++) {
            status = DSPLIB_add_exec(handle, in0, in1, out);
         }
      
         // exec checkparams (optional)
         // if (status == DSPLIB_SUCCESS)
         //    DSPLIB_add_exec_checkParams(handle, in0, in1, out);
      
         if (status != DSPLIB_SUCCESS) {
            printf("DSPLIB_add setup failed with status %d\n", (int) status);
            free(handle);
            return 1;
         }
      
         // timed exec: only this single call is measured
         unsigned long start_time = __TSC;
         status                   = DSPLIB_add_exec(handle, in0, in1, out);
         unsigned long stop_time  = __TSC;
      
         if (status != DSPLIB_SUCCESS) {
            printf("DSPLIB_add_exec failed with status %d\n", (int) status);
            free(handle);
            return 1;
         }
      
         printf("\nNumber of clock cycles elapsed in %lu\n", stop_time - start_time);
      
         // print results
         size_t c;
         for (c = 0; c < size; c++) {
            printf("%10g + %10g = %10g\n", in0[c], in1[c], out[c]);
         }
      
         free(handle);
         return 0;
      }
      



      Output: 


      Updated Performance: 
       

      | DSP Core    | C71x_0 | C71x_1 | C71x_2 | C71x_3 |
      |-------------|--------|--------|--------|--------|
      | Cycle Count | 141    | 141    | 141    | 141    |

       

      Observation: 

      With the cache and MMU enabled in the code, the cycle count drops by a factor of about 2.6 (from 368 to 141) and is close to the reference count given in the DSPLIB performance summary.

      Regards,
      Shabary S Sundar