This thread has been locked.
If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.
Hi All,
As some of you have probably noticed, I am trying (and struggling) to get better speed performance out of my TMDSEVM6678LE evaluation DSP module using OpenMP. (I understand the OpenMP staff are on vacation right now; I will probably bring this thread to their attention later this month too.)
Previously, I started a conversation thread titled "Compiler/TMDSEVM6678: What is the purpose of OpenMP Heap Management API?", where I discussed the slow runtime I see when I use OpenMP compared to serial mode.
There, rrlagic kindly suggested using MSMC RAM as opposed to DDR3 to gain speed, given that MSMC is accessible by all threads simultaneously. His valuable comments were very helpful.
Based on those suggestions, I created a simple benchmark to compare MSMC against DDR3, with and without OpenMP. In the example below, I create two same-size buffers: one placed in MSMC and the second allocated from the DDR3 heap using Memory_alloc(NULL, ...).
I do some simple math on each element of the two buffers, with and without OpenMP. Below is the table of clock cycles taken for each of the 4 combinations OMP/NOOMP DDR3/MSMC:
1) Obviously, MSMC did a great job, speeding things up by up to 5 times for char, float, double, and long double when OpenMP is NOT used. I am just curious: why only for those data types?
2) Why is using OpenMP still slower, when MSMC is supposed to be accessible by each core concurrently?
Please let me know what other stuff I could do potentially to benefit from OpenMP.
Feel free to question the code and criticize it; I will be more than happy to hear your comments.
I will probably bring this thread to this forum's attention later this month.
Regards,
Benchmark c code (whiteboard.c)
#include <stdio.h> #include <c6x.h> #include <math.h> #include <omp.h> #include <xdc/std.h> #include <xdc/runtime/IHeap.h> #include <xdc/runtime/System.h> #include <xdc/runtime/Memory.h> #include <xdc/runtime/Error.h> #include <xdc/cfg/global.h> typedef unsigned long long dt_uint64; //typedef char buff_datatype; //typedef short buff_datatype; //typedef int buff_datatype; typedef float buff_datatype; //typedef double buff_datatype; //typedef long double buff_datatype; // buffer on MSMC #pragma DATA_SECTION( msmc_buff, ".msmc_bufSec" ) #pragma DATA_ALIGN( msmc_buff, 128 ) #define BUFLEN (40*1024) buff_datatype msmc_buff[BUFLEN]; // buffer on DDR3 allcoated from heap buff_datatype *ddr_buff; // external msmc heap extern const HeapBuf_Handle msmcHeap; buff_datatype *msmc_heap; int main(int argc, char* argv[]){ printf("-= Program Started=- \n"); TSCH = 0; TSCL = 0; dt_uint64 t_start, t_end; Error_Block eb; Error_init(&eb); int blockSize = 256; // NOOMP, Buffer on MSMC t_start = _itoll(TSCH, TSCL); for(int j = 0; j < BUFLEN; j++){ msmc_buff[j] = 1+(msmc_buff[j]/2*3-50)/2*3-50; } t_end = _itoll(TSCH, TSCL); printf("NOOMP MSMC BUFF PROCESS TIME: %llu \n", t_end - t_start); t_start = _itoll(TSCH, TSCL); // OMP, Buffer on MSMC #pragma omp parallel num_threads(8) { #pragma omp for for(int j = 0; j < BUFLEN; j++){ // putchar('M'); putchar('0' + omp_get_thread_num()); msmc_buff[j] = 1+(msmc_buff[j]/2*3-50)/2*3-50; } } t_end = _itoll(TSCH, TSCL); printf("OMP MSMC BUFF PROCESS TIME: %llu \n", t_end - t_start); ddr_buff = Memory_alloc(NULL, BUFLEN*sizeof(buff_datatype), 128, &eb); if (!ddr_buff){ printf("Cannot allocate ddr_buff from heap \n"); exit(1); } // NO-OMP, Buffer on DDR t_start = _itoll(TSCH, TSCL); for(int j = 0; j < BUFLEN; j++){ ddr_buff[j] = 1+(ddr_buff[j]/2*3-50)/2*3-50; } t_end = _itoll(TSCH, TSCL); printf("NOOMP DDR BUFF PROCESS TIME: %llu \n", t_end - t_start); // OMP, Buffer on DDR t_start = _itoll(TSCH, TSCL); #pragma omp parallel num_threads(8) { #pragma omp 
for for(int j = 0; j < BUFLEN; j++){ ddr_buff[j] = 1+(ddr_buff[j]/2*3-50)/2*3-50; } } t_end = _itoll(TSCH, TSCL); printf("OMP DDR BUFF PROCESS TIME: %llu \n", t_end - t_start); Memory_free(NULL, ddr_buff, BUFLEN*sizeof(buff_datatype)); printf("-= Program Ended=- \n"); }
omp_config.cfg
/*
 * omp_config.cfg - RTSC configuration for the OpenMP benchmark.
 *
 * Maps output sections to memory segments, sizes the stack, configures the
 * TI OpenMP runtime (core count, MSMC/DDR ranges, shared heap) and places the
 * benchmark's ".msmc_bufSec" section in MSMCSRAM.
 */

/***************************/
/* SECTION MAPPING         */
/***************************/
var program = xdc.useModule('xdc.cfg.Program');

program.sectMap[".args"]     = new Program.SectionSpec();
program.sectMap[".bss"]      = new Program.SectionSpec();
program.sectMap[".cinit"]    = new Program.SectionSpec();
program.sectMap[".cio"]      = new Program.SectionSpec();
program.sectMap[".const"]    = new Program.SectionSpec();
program.sectMap[".data"]     = new Program.SectionSpec();
program.sectMap[".far"]      = new Program.SectionSpec();
program.sectMap[".fardata"]  = new Program.SectionSpec();
program.sectMap[".neardata"] = new Program.SectionSpec();
program.sectMap[".rodata"]   = new Program.SectionSpec();
program.sectMap[".stack"]    = new Program.SectionSpec();
program.sectMap[".switch"]   = new Program.SectionSpec();
program.sectMap[".sysmem"]   = new Program.SectionSpec();
program.sectMap[".text"]     = new Program.SectionSpec();

// Must place these sections in core local memory
// NOTE(review): .args is mapped to DDR3 below, which does not match the
// "core local memory" remark above - confirm which one is intended.
program.sectMap[".args"].loadSegment = "DDR3";
program.sectMap[".cio"].loadSegment  = "L2SRAM";

// Variables in the following data sections can potentially be 'shared' in
// OpenMP. These sections must be placed in shared memory.
program.sectMap[".bss"].loadSegment      = "DDR3";
program.sectMap[".cinit"].loadSegment    = "DDR3";
program.sectMap[".const"].loadSegment    = "DDR3";
program.sectMap[".data"].loadSegment     = "DDR3";
program.sectMap[".far"].loadSegment      = "DDR3";
program.sectMap[".fardata"].loadSegment  = "DDR3";
program.sectMap[".neardata"].loadSegment = "DDR3";
program.sectMap[".rodata"].loadSegment   = "DDR3";
program.sectMap[".sysmem"].loadSegment   = "DDR3";

// Code sections shared by cores - place in shared memory to avoid duplication
program.sectMap[".switch"].loadSegment = program.platform.codeMemory;
program.sectMap[".text"].loadSegment   = "MSMCSRAM";
print(" program.platform.codeMemory = ", program.platform.codeMemory);

// Size the default stack and place it in L2SRAM
var deviceName = String(Program.cpu.deviceName);
// Earlier experiments with the non-DRA7XX stack size:
//   0x20000
//   256*1024   (working in serial mode)
//   512*1024
if (deviceName.search("DRA7XX") == -1) {
    program.stack = 330*1024;
} else {
    program.stack = 0x8000;
}
program.sectMap[".stack"].loadSegment = "L2SRAM"; // works in serial mode 256KB .stack
// program.sectMap[".stack"].loadSegment = "MSMCSRAM"; // doesnt work in OpenMP

// Since there are no arguments passed to main, set .args size to 0
program.argSize = 0;

/********************************/
/* OPENMP RUNTIME CONFIGURATION */
/********************************/
USE_OPENMP = true;

if (USE_OPENMP) {
    // Include OMP runtime in the build
    var ompSettings = xdc.useModule("ti.runtime.openmp.Settings");

    // Set to true if the application uses or has dependencies on BIOS components
    ompSettings.usingRtsc = true;

    // Size of the heap shared by all the cores.
    // Defined ahead of the branches below because both the RTSC path (shared
    // region length) and the non-RTSC path (program.heap) read it; previously
    // it was only assigned inside the RTSC path, so the non-RTSC path would
    // have set program.heap to undefined.
    // var sharedHeapSize = 0x08000000;
    var sharedHeapSize = (512-32-22)*1024*1024;

    if (ompSettings.usingRtsc) {
        /* Configure OpenMP for BIOS
         * - OpenMP.configureCores(masterCoreId, numberofCoresInRuntime)
         *   Configures the id of the master core and the number of cores
         *   available to the runtime.
         */
        var OpenMP = xdc.useModule('ti.runtime.ompbios.OpenMP');

        // Configure the index of the master core and the number of cores available
        // to the runtime. The cores are contiguous.
        OpenMP.masterCoreIdx = 0;

        // Setup number of cores based on the device
        if (deviceName.search("DRA7XX") != -1) {
            OpenMP.numCores = 2;
        } else if (deviceName.search("6670") != -1) {
            OpenMP.numCores = 4;
        } else if (deviceName.search("6657") != -1) {
            OpenMP.numCores = 2;
        } else if (deviceName.search("K2L") != -1) {
            OpenMP.numCores = 4;
        } else {
            OpenMP.numCores = 8;
        }

        // Pull in memory ranges described in Platform.xdc to configure the runtime
        var ddr3       = Program.cpu.memoryMap["DDR3"];
        var ddr3_nc    = Program.cpu.memoryMap["DDR3_NC"];
        var msmc       = Program.cpu.memoryMap["MSMCSRAM"];
        var msmcNcVirt = Program.cpu.memoryMap["OMP_MSMC_NC_VIRT"];
        var msmcNcPhy  = Program.cpu.memoryMap["OMP_MSMC_NC_PHY"];

        // Initialize the runtime with memory range information
        if (deviceName.search("DRA7XX") == -1) {
            OpenMP.msmcBase = msmc.base;
            OpenMP.msmcSize = msmc.len;
            OpenMP.msmcNoCacheVirtualBase  = msmcNcVirt.base;
            OpenMP.msmcNoCacheVirtualSize  = msmcNcVirt.len;
            OpenMP.msmcNoCachePhysicalBase = msmcNcPhy.base;
            // OpenMP.allocateStackFromHeap = true;
            // OpenMP.allocateStackFromHeapSize = 4*330*1024;
        } else {
            OpenMP.allocateStackFromHeap     = true;
            OpenMP.allocateStackFromHeapSize = 0x010000;
            OpenMP.hasMsmc        = false;
            OpenMP.ddrNoCacheBase = ddr3_nc.base;
            OpenMP.ddrNoCacheSize = ddr3_nc.len;
        }
        OpenMP.ddrBase = ddr3.base;
        OpenMP.ddrSize = ddr3.len;

        // Configure memory allocation using HeapOMP
        // HeapOMP handles
        // - Memory allocation requests from BIOS components (core local memory)
        // - Shared memory allocation by utilizing the IPC module to enable
        //   multiple cores to allocate memory out of the same heap - used by malloc
        if (deviceName.search("DRA7XX") == -1) {
            print("deviceName.search(\"DRA7XX\") == -1");
            var HeapOMP = xdc.useModule('ti.runtime.ompbios.HeapOMP');

            // Shared Region 0 must be initialized for IPC
            var sharedRegionId = 0;

            // Size of the core local heap
            var localHeapSize = 0x8000;

            // Initialize a Shared Region & create a heap in the DDR3 memory region
            var SharedRegion = xdc.useModule('ti.sdo.ipc.SharedRegion');
            SharedRegion.setEntryMeta( sharedRegionId,
                {   base: ddr3.base,
                    len: sharedHeapSize,
                    ownerProcId: OpenMP.masterCoreIdx,
                    cacheEnable: true,
                    createHeap: true,
                    isValid: true,
                    name: "DDR3_SR0",
                });

            // Configure and setup HeapOMP
            HeapOMP.configure(sharedRegionId, localHeapSize);

            /*
            var HeapTrack = xdc.useModule('ti.sysbios.heaps.HeapTrack');
            var heapTrackParams = new HeapTrack.Params;
            heapTrackParams.heap = HeapOMP;
            var myHeapTracker = HeapTrack.create(heapTrackParams);
            */
        } else {
            OpenMP.useIpcSharedHeap = false;
            OpenMP.allocateLocalHeapSize  = 0x8000;
            OpenMP.allocateSharedHeapSize = 0x00800000;
        }

        var Startup = xdc.useModule('xdc.runtime.Startup');
        Startup.lastFxns.$add('&__TI_omp_initialize_rtsc_mode');
    } else {
        /* Size the heap. It must be placed in shared memory */
        program.heap = sharedHeapSize;
    }
// Use no openmp
} else {
    var BIOS = xdc.useModule('ti.sysbios.BIOS');
    BIOS.heapSize = (512-32-22)*1024*1024;
    Program.sectMap["systemHeap"] = new Program.SectionSpec();
    Program.sectMap["systemHeap"].loadSegment = "DDR3";
    BIOS.heapSection = "systemHeap";
}

var BIOS = xdc.useModule('ti.sysbios.BIOS');
print("BIOS.heapSize = ", BIOS.heapSize);
// Low word of the CPU frequency: 1,000,000,000 (presumably Hz, i.e. 1 GHz).
BIOS.cpuFreq.lo = 1000000000;

// Place the benchmark's statically allocated buffer section in MSMC SRAM.
Program.sectMap[".msmc_bufSec"] = "MSMCSRAM";
BUILD LOG
**** Build of configuration Debug for project ark_cpplab **** "c:\\ti\\ccs920\\ccs\\utils\\bin\\gmake.exe" all Invoking: XDCtools "C:/ti/ccs920/xdctools_3_60_01_27_core/xs" --xdcpath="C:/ti/openmp_dsp_c667x_2_06_03_00/packages;C:/ti/ipc_3_50_04_08/packages;C:/ti/openmp_dsp_c667x_2_06_03_00/packages/ti/runtime/openmp/platforms;C:/ti/pdk_c667x_2_0_16/packages;C:/ti/bios_6_76_03_01/packages;" xdc.tools.configuro -o ./Debug/configPkg -t ti.targets.elf.C66 -p ti.runtime.openmp.platforms.evm6678 -r release -c "C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6" "./omp_config.cfg" making package.mak (because of package.bld) ... generating interfaces for package configPkg (because package/package.xdc.inc is older than package.xdc) ... configuring omp_config.xe66 from package/cfg/omp_config_pe66.cfg ... program.platform.codeMemory = MSMCSRAM deviceName.search("DRA7XX") == -1 BIOS.heapSize = 4096 Configuring OpenMP runtime for device: TMS320C6678 generating custom ti.sysbios library makefile ... Linking with library ti.drv.qmss:./lib/c66/ti.drv.qmss.ae66 Linking with library ti.csl:./lib/c6678/c66/release/ti.csl.ae66 Starting build of library sources ... making C:/Users/Feng-laptop/workspace_v9_2_1/ark_cpplab/src/sysbios/sysbios.ae66 ... Build of libraries done. cle66 package/cfg/omp_config_pe66.c ... Finished building: ./omp_config.cfg hear3 c... 
"C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6/bin/cl6x" -mv6600 --include_path="C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6/include" --include_path="C:/ti/openmp_dsp_c667x_2_06_03_00/packages/ti/runtime/openmp" --c99 -g --no_warnings --diag_warning=225 --diag_wrap=off --display_error_number --cmd_file="./Debug/configPkg/compiler.opt" -DPLATFORM_MAKEFILE=make_dsp.mk -DCOMPILER_TIC6000 -DDSP_CLOCKS_PER_SEC=1000000000 -DVL_OS_TIRTOS --set_error_limit=3 -DUSE_ARK_FS --opt_level=3 -DDISABLE_SSE -DVL_DISABLE_SSE2 -DDISABLE_AVX -DVL_DISABLE_AVX --openmp -DUSE_OPENMP -DOMP_NUM_THREADS=1 --output_file DEBUG/obj/whiteboard.obj -c ark_cpplab/whiteboard/whiteboard.c "C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6/bin/cl6x" -mv6600 --include_path="C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6/include" --include_path="C:/ti/openmp_dsp_c667x_2_06_03_00/packages/ti/runtime/openmp" -g --diag_warning=225 --diag_wrap=off --display_error_number -z -m"Debug/ark_cpplab.map" -i"C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6/lib" -i"C:/ti/ccs920/ccs/tools/compiler/ti-cgt-c6000_8.3.6/include" -i"Debug/lib" --reread_libs --diag_wrap=off --display_error_number --warn_sections --xml_link_info="Debug/ark_cpplab_linkInfo.xml" --rom_model -o "Debug/ark_cpplab.out" DEBUG/obj/whiteboard.obj -l"./Debug/configPkg/linker.cmd" -llibc.a -lC:/ti/bios_6_76_02_02/packages/ti/targets/rts6000/lib/boot.ae66 -lC:/ti/bios_6_76_02_02/packages/ti/targets/rts6000/lib/ti.targets.rts6000.ae66 -l./src/sysbios/sysbios.ae66 -l./src/ipc/ipc.ae66 -l./src/sysbios/sysbios.ae66 -l./src/utils/utils.ae66 <Linking> "./Debug/configPkg/linker.cmd", line 118: warning #10068-D: no matching section warning #10247-D: creating output section ".tbss" without a SECTIONS specification warning #10247-D: creating output section ".tdata" without a SECTIONS specification -= Build Done! =- **** Build Finished ****