This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

Compiler/TMS320C6678: Why OpenMP single thread is much slower than serial code????

Part Number: TMS320C6678

Tool/software: TI C/C++ Compiler

Hi All,

I have an OpenMP-enhanced benchmark that, when built with OpenMP enabled, runs much slower than the same code built serially (OpenMP not enabled). Even when I use all 8 cores I get only a slightly better execution time, which is disappointing because I was expecting at least a 3-4x speed-up compared to serial execution.

Serial Execution Timing(sec)/Output

DDR3 Process Time: 0.651678
OMP DDR3 Process Time: 0.651678
MSMC Process Time: 2.51540
OMP MSMC Process Time: 2.515670

 

Parallel Execution Timing(sec)/Output

DDR3 Process Time: 2.528171                         << OMP Single Thread            <<  THIS IS SO HIGH; why???
OMP DDR3 Process Time: 0.606626                << OMP Multiple Thread         <<  THIS IS SO HIGH; when using 8 cores, we ONLY reach to serial performance. WHY???
MSMC Process Time: 2.515651                        << MSMC Heap OMP Single Thread
OMP MSMC Process Time: 0.242012               << MSMC Heap OMP Multi Thread

Please let me know how that is possible: 1) how is it justified from a hardware point of view, and 2) why is the serial build so much faster than the OpenMP build running a single thread?

Here is the project zip file of the whole project... 

5383.test_omp.zip

 

And here is how I configure, compile, and build for parallel and serial mode.

Serial configure, compile, and build

only for the c file containing the source code!

In the cfg file

OpenMP/Parallel configure, compile, and build



Complete Source Code

/******************************************************************************
* FILE: omp_hello.c
* DESCRIPTION:
*   OpenMP Example - Hello World - C/C++ Version
*   In this simple example, the master thread forks a parallel region.
*   All threads in the team obtain their unique thread number and print it.
*   The master thread only prints the total number of threads.  Two OpenMP
*   library routines are used to obtain the number of threads and each
*   thread's number.
* AUTHOR: Blaise Barney  5/99
* LAST REVISED: 04/06/05
******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#if defined(COMPILER_GNU)
#include <stdlib.h>

#if defined(_OPENMP)
#include <omp.h>
#endif

#include <time.h>
#define CLOCKS_PER_SECOND CLOCKS_PER_SEC
#define clock() ((double)clock())

#elif defined(COMPILER_C6X)

#if defined(_OPENMP)
#include <ti/runtime/openmp/omp.h>
#endif

#include <c6x.h>
#include <xdc/std.h>
//#include <xdc/runtime/IHeap.h>
#include <xdc/runtime/System.h>
#include <xdc/runtime/Memory.h>
#include <xdc/runtime/Error.h>
#include <xdc/runtime/IHeap.h>
//#include <ti/sysbios/heaps/IHeapBuf.h>
#include <xdc/cfg/global.h>
#define CLOCKS_PER_SECOND 1e9
#define clock() ((double)_itoll(TSCH, TSCL))
#endif


#define ASSERT(cond, msg, param) if(cond){ } else { printf("\nError, assertion failed in file:'%s' at line: %d: \n %s requested size: %d\n", __FILE__, __LINE__, msg, param); printf("\n param = %d\n", param);exit(1); }
#define BUFF_SIZE (1024*10)
#define NUM_ITER (10000)

//typedef double mydt;
//typedef float mydt;
typedef float mydt;


void process_some_data(mydt* buff_1, mydt* buff_2, mydt* buff_3){
    int i, j;
    for(j = 0; j < NUM_ITER; j++)
        for(i = 0; i < BUFF_SIZE; i++){
            buff_3[i] = buff_1[i]*3.2+(buff_2[i])-93;
        }
}


void omp_process_some_data(mydt* buff_1, mydt* buff_2, mydt* buff_3){
    int j;
    for(j = 0; j < NUM_ITER; j++)
    {
#pragma omp parallel num_threads(8)
        {
        int i;
        #pragma omp for
            for(i = 0; i < BUFF_SIZE; i++){
                buff_3[i] = buff_1[i]*3.2+(buff_2[i])-93;
            }
        }
    }
}


int main () {

#if defined(_OPENMP)
    omp_set_num_threads(8);
#endif

    mydt *ddr_buff_1, *ddr_buff_2, *ddr_buff_3;
    int msize = BUFF_SIZE*sizeof(mydt);
#if defined(COMPILER_GNU)
    ddr_buff_1 = malloc(msize);
    ddr_buff_2 = malloc(msize);
    ddr_buff_3 = malloc(msize);
#elif defined(COMPILER_C6X)
    Error_Block eb;
    Error_init(&eb);
    Error_init(&eb); ddr_buff_1 = Memory_alloc(NULL, msize, 0, &eb);
    Error_init(&eb); ddr_buff_2 = Memory_alloc(NULL, msize, 0, &eb);
    Error_init(&eb); ddr_buff_3 = Memory_alloc(NULL, msize, 0, &eb);
#endif

    ASSERT((int)ddr_buff_1 != 0, "Cannot allocate memory! msize: ", msize );
    ASSERT((int)ddr_buff_2 != 0, "Cannot allocate memory! msize: ", msize );
    ASSERT((int)ddr_buff_3 != 0, "Cannot allocate memory! msize: ", msize );

    double start, end;
#if defined(COMPILER_C6X)
    TSCH = 0; TSCL = 0;
#elif defined(COMPILER_GCC)
    clock_t start, end;
#endif

    start = clock();
    process_some_data(ddr_buff_1, ddr_buff_2, ddr_buff_3);
    end = clock();
    printf("DDR3 Process Time: %f\n", (double)(end-start)/CLOCKS_PER_SECOND);

    start = clock();
    omp_process_some_data(ddr_buff_1, ddr_buff_2, ddr_buff_3);
    end = clock();
    printf("OMP DDR3 Process Time: %f\n", (double)(end-start)/CLOCKS_PER_SECOND);

#if defined(COMPILER_C6X)
    mydt *msmc_buff_1, *msmc_buff_2, *msmc_buff_3;

    Error_init(&eb); msmc_buff_1 = Memory_alloc(msmcHeap, msize, 0, &eb);
    Error_init(&eb); msmc_buff_2 = Memory_alloc(msmcHeap, msize, 0, &eb);
    Error_init(&eb); msmc_buff_3 = Memory_alloc(msmcHeap, msize, 0, &eb);

    ASSERT((int)msmc_buff_1 != 0, "Cannot allocate memory! msize: ", msize );
    ASSERT((int)msmc_buff_2 != 0, "Cannot allocate memory! msize: ", msize );
    ASSERT((int)msmc_buff_3 != 0, "Cannot allocate memory! msize: ", msize );
    start = clock();
    process_some_data(msmc_buff_1, msmc_buff_2, msmc_buff_3);
    end = clock();
    printf("MSMC Process Time: %f\n", (double)(end-start)/CLOCKS_PER_SECOND);

    start = clock();
    omp_process_some_data(msmc_buff_1, msmc_buff_2, msmc_buff_3);
    end = clock();
    printf("OMP MSMC Process Time: %f\n", (double)(end-start)/CLOCKS_PER_SECOND);
#endif
  return 0;
}

Here is the CFG configuration script

 

/*
 * Copyright (c) 2012-2015, Texas Instruments Incorporated
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * *  Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * *  Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * *  Neither the name of Texas Instruments Incorporated nor the names of
 *    its contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


/***************************/
/* SECTION MAPPING         */
/***************************/
var program = xdc.useModule('xdc.cfg.Program');

program.sectMap[".args"]        = new Program.SectionSpec();
program.sectMap[".bss"]         = new Program.SectionSpec();
program.sectMap[".cinit"]       = new Program.SectionSpec();
program.sectMap[".cio"]         = new Program.SectionSpec();
program.sectMap[".const"]       = new Program.SectionSpec();
program.sectMap[".data"]        = new Program.SectionSpec();
program.sectMap[".far"]         = new Program.SectionSpec();
program.sectMap[".fardata"]     = new Program.SectionSpec();
program.sectMap[".neardata"]    = new Program.SectionSpec();
program.sectMap[".rodata"]      = new Program.SectionSpec();
program.sectMap[".stack"]       = new Program.SectionSpec();
program.sectMap[".switch"]      = new Program.SectionSpec();
program.sectMap[".sysmem"]      = new Program.SectionSpec();
program.sectMap[".text"]        = new Program.SectionSpec();
    
// Must place these sections in core local memory 
program.sectMap[".args"].loadSegment        = "DDR3";
program.sectMap[".cio"].loadSegment         = "L2SRAM";

// Variables in the following data sections can potentially be 'shared' in
// OpenMP. These sections must be placed in shared memory.
program.sectMap[".bss"].loadSegment         = "DDR3";
program.sectMap[".cinit"].loadSegment       = "DDR3";
program.sectMap[".const"].loadSegment       = "DDR3";
program.sectMap[".data"].loadSegment        = "DDR3";
program.sectMap[".far"].loadSegment         = "DDR3";
program.sectMap[".fardata"].loadSegment     = "DDR3";
program.sectMap[".neardata"].loadSegment    = "DDR3";
program.sectMap[".rodata"].loadSegment      = "DDR3";
program.sectMap[".sysmem"].loadSegment      = "DDR3";

// Code sections shared by cores - place in shared memory to avoid duplication
program.sectMap[".switch"].loadSegment      = program.platform.codeMemory;
program.sectMap[".text"].loadSegment        = "MSMCSRAM";
print(" program.platform.codeMemory = ", program.platform.codeMemory);

// Size the default stack and place it in L2SRAM 
var deviceName = String(Program.cpu.deviceName);
// if  (deviceName.search("DRA7XX") == -1) { program.stack = 0x20000; }
// if  (deviceName.search("DRA7XX") == -1) { program.stack = 256*1024; } // working in serial mode 
// if  (deviceName.search("DRA7XX") == -1) { program.stack = 330*1024; }
if  (deviceName.search("DRA7XX") == -1) { program.stack = 330*1024; }
// if  (deviceName.search("DRA7XX") == -1) { program.stack = 512*1024; } 
else                                    { program.stack = 0x8000;  }
program.sectMap[".stack"].loadSegment       = "L2SRAM"; // works in serial mode 256KB .stack
// program.sectMap[".stack"].loadSegment       = "MSMCSRAM"; // doesnt work in OpenMP

// Since there are no arguments passed to main, set .args size to 0
program.argSize = 0;


/********************************/
/* OPENMP RUNTIME CONFIGURATION */
/********************************/
USE_OPENMP = false;
if (USE_OPENMP) {
	// Include OMP runtime in the build
	var ompSettings = xdc.useModule("ti.runtime.openmp.Settings");
	
	// Set to true if the application uses or has dependencies on BIOS components
	ompSettings.usingRtsc = true;
	
	if (ompSettings.usingRtsc)
	{
	    /* Configure OpenMP for BIOS
	     * - OpenMP.configureCores(masterCoreId, numberofCoresInRuntime)
	     *       Configures the id of the master core and the number of cores
	     *       available to the runtime.
	     */
	
	    var OpenMP = xdc.useModule('ti.runtime.ompbios.OpenMP');
	
	    // Configure the index of the master core and the number of cores available
	    // to the runtime. The cores are contiguous.
	    OpenMP.masterCoreIdx = 0;
	
	    // Setup number of cores based on the device
	    if      (deviceName.search("DRA7XX") != -1) { OpenMP.numCores = 2; }
	    else if (deviceName.search("6670")   != -1) { OpenMP.numCores = 4; }
	    else if (deviceName.search("6657")   != -1) { OpenMP.numCores = 2; }
	    else if (deviceName.search("K2L")    != -1) { OpenMP.numCores = 4; }
	    else                                        { OpenMP.numCores = 8; }
	
		print("OpenMP.numCores = ", OpenMP.numCores);
	    // Pull in memory ranges described in Platform.xdc to configure the runtime
	    var ddr3       = Program.cpu.memoryMap["DDR3"];
	    var ddr3_nc    = Program.cpu.memoryMap["DDR3_NC"];
	    var msmc       = Program.cpu.memoryMap["MSMCSRAM"];
	    var msmcNcVirt = Program.cpu.memoryMap["OMP_MSMC_NC_VIRT"];
	    var msmcNcPhy  = Program.cpu.memoryMap["OMP_MSMC_NC_PHY"];
	
	    // Initialize the runtime with memory range information
	    if (deviceName.search("DRA7XX") == -1) {
	       OpenMP.msmcBase = msmc.base
	       OpenMP.msmcSize = msmc.len;
	
	       OpenMP.msmcNoCacheVirtualBase  = msmcNcVirt.base;
	       OpenMP.msmcNoCacheVirtualSize  = msmcNcVirt.len;
	
	       OpenMP.msmcNoCachePhysicalBase  = msmcNcPhy.base;
	       
//	       OpenMP.allocateStackFromHeap = true;
//	       OpenMP.allocateStackFromHeapSize = 4*330*1024;
	       
	    }
	    else
	    {
	       OpenMP.allocateStackFromHeap = true;
	       OpenMP.allocateStackFromHeapSize = 0x010000;
	
	       OpenMP.hasMsmc = false;
	       OpenMP.ddrNoCacheBase = ddr3_nc.base;
	       OpenMP.ddrNoCacheSize = ddr3_nc.len;
	    }
	
	    OpenMP.ddrBase          = ddr3.base;
	    OpenMP.ddrSize          = ddr3.len;
	
	    // Configure memory allocation using HeapOMP
	    // HeapOMP handles 
	    // - Memory allocation requests from BIOS components (core local memory)
	    // - Shared memory allocation by utilizing the IPC module to enable 
	    //   multiple cores to allocate memory out of the same heap - used by malloc
	    if (deviceName.search("DRA7XX") == -1) {
	       print("deviceName.search(\"DRA7XX\") == -1")
	       var HeapOMP = xdc.useModule('ti.runtime.ompbios.HeapOMP');
	
	       // Shared Region 0 must be initialized for IPC 
	       var sharedRegionId = 0;
	
	       // Size of the core local heap
	       var localHeapSize  = 0x8000;
	
	       // Size of the heap shared by all the cores
	       // var sharedHeapSize = 0x08000000;
	       var sharedHeapSize = (512-32)*1024*1024;
	
	       // Initialize a Shared Region & create a heap in the DDR3 memory region
	       var SharedRegion   = xdc.useModule('ti.sdo.ipc.SharedRegion');
	       SharedRegion.setEntryMeta( sharedRegionId,
	                                  {   base: ddr3.base,
	                                      len:  sharedHeapSize,
	                                      ownerProcId: OpenMP.masterCoreIdx,
	                                      cacheEnable: true,
	                                      createHeap: true,
	                                      isValid: true,
	                                      name: "DDR3_SR0",
	                                  });
	
	       // Configure and setup HeapOMP
	       HeapOMP.configure(sharedRegionId, localHeapSize);
	
	/*       
	       var HeapTrack = xdc.useModule('ti.sysbios.heaps.HeapTrack'); 
			var heapTrackParams = new HeapTrack.Params; 
			heapTrackParams.heap = HeapOMP; 
			var myHeapTracker = HeapTrack.create(heapTrackParams);
			*/
	    }
	    else
	    {
	       OpenMP.useIpcSharedHeap = false;
	       OpenMP.allocateLocalHeapSize = 0x8000
	       OpenMP.allocateSharedHeapSize = 0x00800000
	    }
	
	    
	    var Startup = xdc.useModule('xdc.runtime.Startup');
	    Startup.lastFxns.$add('&__TI_omp_initialize_rtsc_mode');
	}
	else
	{
	    /* Size the heap. It must be placed in shared memory */
	    program.heap = sharedHeapSize;
	}
	

// Use no openmp 
} else {
	var BIOS = xdc.useModule('ti.sysbios.BIOS');	
	BIOS.heapSize = (512-32-22)*1024*1024;
	Program.sectMap["systemHeap"]	= new Program.SectionSpec();
	Program.sectMap["systemHeap"].loadSegment	= "DDR3";
	BIOS.heapSection = "systemHeap";	
}


var BIOS = xdc.useModule('ti.sysbios.BIOS');
print("BIOS.heapSize = ", BIOS.heapSize);
BIOS.cpuFreq.lo = 1000000000;

var sizeof_Complex = 8, sizeof_int = 4, sizeof_Real = 4;
 
var M = 2;
var N = 1 << 7;
var cgpa_size = N*sizeof_Complex + M*sizeof_int + N*sizeof_Real+ N*sizeof_Real + N*sizeof_Real + N*sizeof_Complex + 2*(sizeof_Complex*M*N);
cgpa_size*=3;

print("cgpa_size = ", cgpa_size);
var msmcHeapSize = 1024*10*8*3;
// var msmcHeapSize = cgpa_size;
var heapType = 'HeapMem';

if (heapType == 'HeapBuf') {
	var HeapBuf = xdc.useModule('ti.sysbios.heaps.HeapBuf');
	var msmcHeapBufParams = new HeapBuf.Params();
	msmcHeapBufParams.blockSize = msmcHeapSize;
	msmcHeapBufParams.align = 128;
	msmcHeapBufParams.numBlocks = 3;
	msmcHeapBufParams.sectionName = "msmcHeap";
	Program.global.msmcHeap = HeapBuf.create(msmcHeapBufParams);
	Program.sectMap["msmcHeap"] = "MSMCSRAM";
} else if (heapType == 'HeapMem') {
	var HeapMem = xdc.useModule('ti.sysbios.heaps.HeapMem');
	var heapMemParams = new HeapMem.Params();
	heapMemParams.size = msmcHeapSize;
	heapMemParams.sectionName = "msmcHeap";
	Program.global.msmcHeap = HeapMem.create(heapMemParams);
	Program.sectMap["msmcHeap"] = "MSMCSRAM";
	
} else {
	error("Unrecognized heap type! ");
}