This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

TMS320C6678: A performance benchmark of OpenMP throughput using MSMC vs. DDR3 buffers. Why is the OMP parallel run slower than serial execution?

Part Number: TMS320C6678

Hi All,

As some of you have probably noticed, I am trying (and struggling) to get better speed on my TMDSEVM6678LE evaluation DSP module using OpenMP (I understand the OMP staff are on vacation right now; I will probably bring this thread to their attention later this month too).

Previously, I started a thread titled "Compiler/TMDSEVM6678: What is the purpose of OpenMP Heap Management API?", where I discussed the slow runtime I see when I use OpenMP compared to serial mode.

rrlagic kindly suggested using MSMC RAM as opposed to DDR3 to gain speed, given that MSMC is accessible by all threads simultaneously; his valuable comments were very helpful.

Based on those suggestions, I created a simple benchmark to compare MSMC against DDR3, with and without OMP. In the example below, I use two same-size buffers: one placed in MSMC, and the second allocated from the DDR3 heap using Memory_alloc(NULL, ...).

I do some simple math on each element of the two buffers, with and without OMP. Below is the table of clock cycles taken for each of the 4 combinations (OMP/NOOMP x DDR3/MSMC):

1) Obviously, MSMC did a great job, speeding things up by up to 5 times for char, float, double, and long double when OMP is NOT used. I am just curious: why only for those data types?

2) Why is using OpenMP still slower when MSMC is supposed to be accessible by each core concurrently?

Please let me know what else I could do to benefit from OpenMP.

Feel free to question the code and criticize it; I will be more than happy to hear your comments.

I will probably bring this thread to this forum's attention later this month.

Regards,

Benchmark c code (whiteboard.c)

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <c6x.h>
#include <omp.h>

#include <xdc/std.h>
#include <xdc/runtime/IHeap.h>
#include <xdc/runtime/System.h>
#include <xdc/runtime/Memory.h>
#include <xdc/runtime/Error.h>
#include <xdc/cfg/global.h>

// 64-bit unsigned type used to hold the time-stamp values printed by main().
typedef unsigned long long dt_uint64;
// Element type of both benchmark buffers. Exactly one of the typedefs below
// is active; switch the commented-out lines to benchmark another type.
//typedef char buff_datatype;
//typedef short buff_datatype;
//typedef int buff_datatype;
typedef float buff_datatype;
//typedef double buff_datatype;
//typedef long double buff_datatype;


// Statically allocated buffer, emitted into the ".msmc_bufSec" section with
// 128-byte alignment. omp_config.cfg maps that section to MSMCSRAM.
#pragma DATA_SECTION( msmc_buff, ".msmc_bufSec" )
#pragma DATA_ALIGN( msmc_buff, 128 )
// Number of elements in each benchmark buffer.
#define BUFLEN  (40*1024)
buff_datatype msmc_buff[BUFLEN];

// Buffer allocated at run time in main() via Memory_alloc(NULL, ...).
// NOTE(review): presumably the default heap lives in DDR3 - confirm against
// the HeapOMP / SharedRegion setup in omp_config.cfg.
buff_datatype *ddr_buff;

// External MSMC heap handle and a pointer meant for allocations from it.
// Neither is referenced by main() in this file.
extern const HeapBuf_Handle msmcHeap;
buff_datatype *msmc_heap;

/*
 * Benchmark entry point.
 *
 * Applies the same element-wise arithmetic to two BUFLEN-element buffers,
 * msmc_buff (static, placed via DATA_SECTION) and ddr_buff (allocated with
 * Memory_alloc from the default heap), once with a plain serial loop and
 * once with an 8-thread OpenMP work-shared loop. For each of the four runs
 * the difference between two TSCH:TSCL readings is printed.
 *
 * Both buffers are filled by an untimed loop before they are measured, so
 * no timed loop reads indeterminate heap contents and both buffers start
 * from the same values.
 *
 * Returns 0 on completion; exits with status 1 if the heap allocation fails.
 */
int main(int argc, char* argv[]){
    printf("-= Program Started=- \n");

    // Write the time-stamp counter registers before the first read.
    // NOTE(review): presumably the write to TSCL is what starts the
    // free-running counter on this core - confirm against the CPU guide.
    TSCH = 0; TSCL = 0;
    dt_uint64 t_start, t_end;

    Error_Block eb;
    Error_init(&eb);

    // Untimed fill so the MSMC buffer is prepared exactly like the heap
    // buffer further down.
    for(int j = 0; j < BUFLEN; j++){
        msmc_buff[j] = 0;
    }

    // NOOMP, Buffer on MSMC
    t_start = _itoll(TSCH, TSCL);
    for(int j = 0; j < BUFLEN; j++){
        msmc_buff[j] = 1+(msmc_buff[j]/2*3-50)/2*3-50;
    }
    t_end = _itoll(TSCH, TSCL);
    printf("NOOMP MSMC BUFF PROCESS TIME: %llu \n", t_end - t_start);

    // OMP, Buffer on MSMC
    // The timed span includes creating and joining the parallel region.
    t_start = _itoll(TSCH, TSCL);
#pragma omp parallel num_threads(8)
    {
#pragma omp for
        for(int j = 0; j < BUFLEN; j++){
            msmc_buff[j] = 1+(msmc_buff[j]/2*3-50)/2*3-50;
        }
    }
    t_end = _itoll(TSCH, TSCL);
    printf("OMP MSMC BUFF PROCESS TIME: %llu \n", t_end - t_start);

    ddr_buff = Memory_alloc(NULL, BUFLEN*sizeof(buff_datatype), 128, &eb);

    if (!ddr_buff){
        printf("Cannot allocate ddr_buff from heap \n");
        exit(1);
    }

    // The block returned by Memory_alloc is not written anywhere else
    // before the timed loops read it, so give every element a defined value.
    for(int j = 0; j < BUFLEN; j++){
        ddr_buff[j] = 0;
    }

    // NO-OMP, Buffer on DDR
    t_start = _itoll(TSCH, TSCL);
    for(int j = 0; j < BUFLEN; j++){
        ddr_buff[j] = 1+(ddr_buff[j]/2*3-50)/2*3-50;
    }
    t_end = _itoll(TSCH, TSCL);
    printf("NOOMP DDR BUFF PROCESS TIME: %llu \n", t_end - t_start);

    // OMP, Buffer on DDR
    t_start = _itoll(TSCH, TSCL);
#pragma omp parallel num_threads(8)
    {
#pragma omp for
        for(int j = 0; j < BUFLEN; j++){
            ddr_buff[j] = 1+(ddr_buff[j]/2*3-50)/2*3-50;
        }
    }
    t_end = _itoll(TSCH, TSCL);
    printf("OMP DDR BUFF PROCESS TIME: %llu \n", t_end - t_start);

    // Release with the same size that was requested above.
    Memory_free(NULL, ddr_buff, BUFLEN*sizeof(buff_datatype));
    ddr_buff = NULL;

    printf("-= Program Ended=- \n");
    return 0;
}

 

omp_config.cfg

/***************************/
/* SECTION MAPPING         */
/***************************/
var program = xdc.useModule('xdc.cfg.Program');

// Every output section that receives an explicit placement below. A fresh
// SectionSpec is created for each of them, in this order, before any
// loadSegment is assigned.
var mappedSections = [
    ".args", ".bss", ".cinit", ".cio", ".const", ".data", ".far",
    ".fardata", ".neardata", ".rodata", ".stack", ".switch", ".sysmem",
    ".text"
];
for (var s = 0; s < mappedSections.length; s++) {
    program.sectMap[mappedSections[s]] = new Program.SectionSpec();
}

// .args goes to DDR3 and .cio to L2SRAM.
// NOTE(review): the original comment here said both sections must be placed
// in core local memory, yet .args is mapped to DDR3 - confirm this is intended.
program.sectMap[".args"].loadSegment = "DDR3";
program.sectMap[".cio"].loadSegment  = "L2SRAM";

// Variables in the following data sections can potentially be 'shared' in
// OpenMP, so these sections are placed in shared memory (DDR3).
var ddr3Sections = [
    ".bss", ".cinit", ".const", ".data", ".far",
    ".fardata", ".neardata", ".rodata", ".sysmem"
];
for (var d = 0; d < ddr3Sections.length; d++) {
    program.sectMap[ddr3Sections[d]].loadSegment = "DDR3";
}

// Code sections are shared by the cores - kept in shared memory to avoid
// duplication.
program.sectMap[".switch"].loadSegment = program.platform.codeMemory;
program.sectMap[".text"].loadSegment   = "MSMCSRAM";
print(" program.platform.codeMemory = ", program.platform.codeMemory);

// Default stack: 330 KB on every device except DRA7XX, which gets 0x8000.
// Sizes tried earlier according to the author's notes: 0x20000, 256 KB
// ("working in serial mode"), 512 KB.
var deviceName = String(Program.cpu.deviceName);
program.stack = (deviceName.search("DRA7XX") == -1) ? 330*1024 : 0x8000;

// The stack lives in L2SRAM (author's note: works in serial mode with a
// 256 KB .stack; placing .stack in MSMCSRAM "doesnt work in OpenMP").
program.sectMap[".stack"].loadSegment = "L2SRAM";

// No arguments are passed to main, so .args is sized to 0.
program.argSize = 0;


/********************************/
/* OPENMP RUNTIME CONFIGURATION */
/********************************/
// Master switch: true pulls in the OpenMP runtime, false configures a plain
// SYS/BIOS system heap in DDR3 instead.
USE_OPENMP = true;

// Size of the heap shared by all the cores.
// Declared ahead of the branches below because both the RTSC path (length of
// Shared Region 0) and the non-RTSC path (program.heap) read it. It used to
// be declared only inside the RTSC branch, which left program.heap set to
// `undefined` whenever usingRtsc was false.
// Earlier value: 0x08000000
var sharedHeapSize = (512-32-22)*1024*1024;

if (USE_OPENMP) {
    // Include OMP runtime in the build
    var ompSettings = xdc.useModule("ti.runtime.openmp.Settings");

    // Set to true if the application uses or has dependencies on BIOS components
    ompSettings.usingRtsc = true;

    if (ompSettings.usingRtsc)
    {
        /* Configure OpenMP for BIOS
         * - OpenMP.configureCores(masterCoreId, numberofCoresInRuntime)
         *       Configures the id of the master core and the number of cores
         *       available to the runtime.
         */
        var OpenMP = xdc.useModule('ti.runtime.ompbios.OpenMP');

        // Configure the index of the master core and the number of cores available
        // to the runtime. The cores are contiguous.
        OpenMP.masterCoreIdx = 0;

        // Setup number of cores based on the device
        if      (deviceName.search("DRA7XX") != -1) { OpenMP.numCores = 2; }
        else if (deviceName.search("6670")   != -1) { OpenMP.numCores = 4; }
        else if (deviceName.search("6657")   != -1) { OpenMP.numCores = 2; }
        else if (deviceName.search("K2L")    != -1) { OpenMP.numCores = 4; }
        else                                        { OpenMP.numCores = 8; }

        // Pull in memory ranges described in Platform.xdc to configure the runtime
        var ddr3       = Program.cpu.memoryMap["DDR3"];
        var ddr3_nc    = Program.cpu.memoryMap["DDR3_NC"];
        var msmc       = Program.cpu.memoryMap["MSMCSRAM"];
        var msmcNcVirt = Program.cpu.memoryMap["OMP_MSMC_NC_VIRT"];
        var msmcNcPhy  = Program.cpu.memoryMap["OMP_MSMC_NC_PHY"];

        // Initialize the runtime with memory range information
        if (deviceName.search("DRA7XX") == -1) {
            OpenMP.msmcBase = msmc.base;
            OpenMP.msmcSize = msmc.len;

            OpenMP.msmcNoCacheVirtualBase  = msmcNcVirt.base;
            OpenMP.msmcNoCacheVirtualSize  = msmcNcVirt.len;

            OpenMP.msmcNoCachePhysicalBase = msmcNcPhy.base;

            // Left disabled by the author:
            // OpenMP.allocateStackFromHeap = true;
            // OpenMP.allocateStackFromHeapSize = 4*330*1024;
        }
        else
        {
            OpenMP.allocateStackFromHeap = true;
            OpenMP.allocateStackFromHeapSize = 0x010000;

            OpenMP.hasMsmc = false;
            OpenMP.ddrNoCacheBase = ddr3_nc.base;
            OpenMP.ddrNoCacheSize = ddr3_nc.len;
        }

        OpenMP.ddrBase = ddr3.base;
        OpenMP.ddrSize = ddr3.len;

        // Configure memory allocation using HeapOMP
        // HeapOMP handles
        // - Memory allocation requests from BIOS components (core local memory)
        // - Shared memory allocation by utilizing the IPC module to enable
        //   multiple cores to allocate memory out of the same heap - used by malloc
        if (deviceName.search("DRA7XX") == -1) {
            print("deviceName.search(\"DRA7XX\") == -1");
            var HeapOMP = xdc.useModule('ti.runtime.ompbios.HeapOMP');

            // Shared Region 0 must be initialized for IPC
            var sharedRegionId = 0;

            // Size of the core local heap
            var localHeapSize  = 0x8000;

            // Initialize a Shared Region & create a heap in the DDR3 memory region.
            // The region starts at the DDR3 base and spans sharedHeapSize bytes.
            var SharedRegion = xdc.useModule('ti.sdo.ipc.SharedRegion');
            SharedRegion.setEntryMeta( sharedRegionId,
                                       {   base: ddr3.base,
                                           len:  sharedHeapSize,
                                           ownerProcId: OpenMP.masterCoreIdx,
                                           cacheEnable: true,
                                           createHeap: true,
                                           isValid: true,
                                           name: "DDR3_SR0"
                                       });

            // Configure and setup HeapOMP
            HeapOMP.configure(sharedRegionId, localHeapSize);

            /* Heap tracking experiment, left disabled by the author:
            var HeapTrack = xdc.useModule('ti.sysbios.heaps.HeapTrack');
            var heapTrackParams = new HeapTrack.Params;
            heapTrackParams.heap = HeapOMP;
            var myHeapTracker = HeapTrack.create(heapTrackParams);
            */
        }
        else
        {
            OpenMP.useIpcSharedHeap = false;
            OpenMP.allocateLocalHeapSize = 0x8000;
            OpenMP.allocateSharedHeapSize = 0x00800000;
        }

        // Register the OpenMP RTSC-mode initializer as a last startup function.
        var Startup = xdc.useModule('xdc.runtime.Startup');
        Startup.lastFxns.$add('&__TI_omp_initialize_rtsc_mode');
    }
    else
    {
        /* Size the heap. It must be placed in shared memory */
        program.heap = sharedHeapSize;
    }

// Use no openmp
} else {
    var BIOS = xdc.useModule('ti.sysbios.BIOS');
    BIOS.heapSize = (512-32-22)*1024*1024;
    Program.sectMap["systemHeap"] = new Program.SectionSpec();
    Program.sectMap["systemHeap"].loadSegment = "DDR3";
    BIOS.heapSection = "systemHeap";
}


// SYS/BIOS module handle; the configured heap size is echoed into the
// configuro output so it can be checked in the build log.
var BIOS = xdc.useModule('ti.sysbios.BIOS');
print("BIOS.heapSize = ", BIOS.heapSize);

// Low 32 bits of the CPU frequency: 1,000,000,000 Hz.
BIOS.cpuFreq.lo = 1000 * 1000 * 1000;

// Section that whiteboard.c names in DATA_SECTION for msmc_buff.
Program.sectMap[".msmc_bufSec"] = "MSMCSRAM";

BUILD LOG

**** Build of configuration Debug for project ark_cpplab ****

"c:\\ti\\ccs920\\ccs\\utils\\bin\\gmake.exe" all 
 
Invoking: XDCtools
"C:/ti/ccs920/xdctools_3_60_01_27_core/xs" --xdcpath="C:/ti/openmp_dsp_c667x_2_06_03_00/packages;C:/ti/ipc_3_50_04_08/packages;C:/ti/openmp_dsp_c667x_2_06_03_00/packages/ti/runtime/openmp/platforms;C:/ti/pdk_c667x_2_0_16/packages;C:/ti/bios_6_76_03_01/packages;" xdc.tools.configuro  -o ./Debug/configPkg -t ti.targets.elf.C66 -p ti.runtime.openmp.platforms.evm6678 -r release -c "C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6" "./omp_config.cfg"
making package.mak (because of package.bld) ...
generating interfaces for package configPkg (because package/package.xdc.inc is older than package.xdc) ...
configuring omp_config.xe66 from package/cfg/omp_config_pe66.cfg ...
 program.platform.codeMemory =  MSMCSRAM
deviceName.search("DRA7XX") == -1
BIOS.heapSize =  4096
Configuring OpenMP runtime for device: TMS320C6678
generating custom ti.sysbios library makefile ... 
	Linking with library ti.drv.qmss:./lib/c66/ti.drv.qmss.ae66
	Linking with library ti.csl:./lib/c6678/c66/release/ti.csl.ae66
Starting build of library sources ...
making C:/Users/Feng-laptop/workspace_v9_2_1/ark_cpplab/src/sysbios/sysbios.ae66 ...
Build of libraries done.
cle66 package/cfg/omp_config_pe66.c ...
Finished building: ./omp_config.cfg
hear3 c...
"C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6/bin/cl6x" -mv6600 --include_path="C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6/include" --include_path="C:/ti/openmp_dsp_c667x_2_06_03_00/packages/ti/runtime/openmp"  --c99 -g --no_warnings --diag_warning=225 --diag_wrap=off --display_error_number --cmd_file="./Debug/configPkg/compiler.opt" -DPLATFORM_MAKEFILE=make_dsp.mk -DCOMPILER_TIC6000 -DDSP_CLOCKS_PER_SEC=1000000000 -DVL_OS_TIRTOS --set_error_limit=3 -DUSE_ARK_FS --opt_level=3 -DDISABLE_SSE -DVL_DISABLE_SSE2  -DDISABLE_AVX -DVL_DISABLE_AVX  --openmp -DUSE_OPENMP -DOMP_NUM_THREADS=1 --output_file DEBUG/obj/whiteboard.obj  -c ark_cpplab/whiteboard/whiteboard.c
"C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6/bin/cl6x" -mv6600 --include_path="C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6/include" --include_path="C:/ti/openmp_dsp_c667x_2_06_03_00/packages/ti/runtime/openmp"  -g --diag_warning=225 --diag_wrap=off --display_error_number  -z -m"Debug/ark_cpplab.map"   	 -i"C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6/lib" -i"C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6/include" -i"Debug/lib" --reread_libs   --diag_wrap=off --display_error_number --warn_sections --xml_link_info="Debug/ark_cpplab_linkInfo.xml" --rom_model   -o "Debug/ark_cpplab.out" DEBUG/obj/whiteboard.obj -l"./Debug/configPkg/linker.cmd" -llibc.a  -lC:/ti/bios_6_76_02_02/packages/ti/targets/rts6000/lib/boot.ae66 -lC:/ti/bios_6_76_02_02/packages/ti/targets/rts6000/lib/ti.targets.rts6000.ae66 -l./src/sysbios/sysbios.ae66 -l./src/ipc/ipc.ae66 -l./src/sysbios/sysbios.ae66 -l./src/utils/utils.ae66    
<Linking>
"./Debug/configPkg/linker.cmd", line 118: warning #10068-D: no matching section
warning #10247-D: creating output section ".tbss" without a SECTIONS specification
warning #10247-D: creating output section ".tdata" without a SECTIONS specification
-= Build Done! =-

**** Build Finished ****