/******************************************************************************
* FILE: omp_hello.c
* DESCRIPTION:
*   OpenMP FFT test driver, derived from the OpenMP "Hello World" C/C++
*   example.  The master thread forks parallel regions in which every thread
*   in the team initializes, allocates and frees EDMA channels for the core
*   it runs on (indexed by DNUM).  main() then creates a single-precision 1D
*   real-to-complex FFT plan, times its execution with the TSCL/TSCH
*   registers, and computes an approximate magnitude of the output.
* AUTHOR: Blaise Barney  5/99 (original OpenMP example)
* LAST REVISED: 04/06/05
******************************************************************************/
#include <ti/omp/omp.h>
//#include <omp.h>
#include <stdio.h>

#include <xdc/std.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <math.h>
#include <c6x.h>

//#include "omp/omp_config.h"
#include <fft_omp_sp_1d_r2c/fft_omp_sp_1d_r2c.h>
#include "fft_edma.h"
#include <ti/dsplib/src/DSPF_sp_fftSPxSP/DSPF_sp_fftSPxSP.h>
//#include <ti/runtime/openmp/omp.h>

/* Control register used throughout this file as the index of the executing
 * core's slot in the per-core arrays below. */
extern cregister volatile unsigned int DNUM;

/* Number of OpenMP threads requested in main() via omp_set_num_threads(). */
#define ACTIVE_THREAD_COUNT (2)

/* Values stored in fftEdmaState[]: whether a core currently holds EDMA
 * channels allocated by fft_omp_assign_edma_resources(). */
#define FFT_EDMA_STATE_INIT         (0)
#define FFT_EDMA_STATE_ALLOCATED    (1)
#define OMP_MAX_NUM_CORES           (2)

/* Per-core EDMA allocation state, indexed by DNUM.
 * NOTE(review): this array is sized by ACTIVE_THREAD_COUNT while gEdmaState
 * is sized by OMP_MAX_NUM_CORES; both are indexed by DNUM, so the two macros
 * must be kept in step. */
int fftEdmaState[ACTIVE_THREAD_COUNT];

/* Per-core EDMA channel bookkeeping, indexed by DNUM.  The DATA_SECTION
 * pragma must appear before the definition it applies to; placed after it,
 * the compiler may ignore the pragma and the array would not be placed in
 * the requested section. */
#pragma DATA_SECTION (gEdmaState, ".msmc_mem")
FFT_EDMA_Struct gEdmaState[OMP_MAX_NUM_CORES];

/* ======================================================================== */
/*  Kernel-specific alignments                                              */
/* ======================================================================== */
#pragma DATA_SECTION(x_i, ".ddr_mem");
#pragma DATA_SECTION(y_i, ".ddr_mem");
#pragma DATA_SECTION(w_i, ".ddr_mem");
#pragma DATA_SECTION(x_cn, ".ddr_mem");
#pragma DATA_SECTION(y_cn, ".ddr_mem");
#pragma DATA_SECTION(w_cn, ".ddr_mem");

#pragma DATA_ALIGN(x_i,  8);
#pragma DATA_ALIGN(x_cn, 8);

#pragma DATA_ALIGN(w_cn, 8);

#pragma DATA_ALIGN(y_i,  8);
#pragma DATA_ALIGN(y_cn, 8);

#pragma DATA_SECTION(x_i_work, ".ll2_mem");
#pragma DATA_SECTION(y_i_work, ".ll2_mem");
#pragma DATA_SECTION(y_i_temp, ".ll2_mem");
#pragma DATA_SECTION(w_i_work, ".ll2_mem");

#pragma DATA_ALIGN(w_i,  8);
#pragma DATA_ALIGN(w_i_work,  8);
#pragma DATA_ALIGN(x_i_work,  64);
#pragma DATA_ALIGN(y_i_work,  64);

/* ======================================================================== */
/*  Parameters of fixed dataset.                                            */
/* ======================================================================== */

/* MAXN : FFT length used by main() (N = MAXN).                             */
/* M    : length, in floats, of the full-size arrays (two floats per point). */
/* M_i  : per-line-buffer factor used to size the working buffers.          */
/* PAD  : number of padding floats placed before and after each array's     */
/*        payload (currently none).                                         */
#define MAXN  (2024*2048)
#define M     (2*MAXN)
#define M_i   (4*2048)
#define PAD   (0)

/* ======================================================================== */
/*  Statically allocated data arrays.  None of them has an initializer      */
/*  here; main() fills x_i, x_cn, y_i and y_cn at run time.                 */
/* ======================================================================== */

/* Input arrays. */
float x_i [M + 2 * PAD];
float x_cn[M + 2 * PAD];

/* Output arrays; main() reads the FFT result from y_i and stores the */
/* magnitude approximation in y_cn.                                    */
float y_i [M + 2 * PAD];
float y_cn[M + 2 * PAD];

/* Working buffers handed to the FFT plan by fft_memory_request().    */
/* NUMOFLINEBUFS is not defined in this file (presumably it comes from */
/* one of the FFT headers - confirm).                                  */
float x_i_work [M_i*2*NUMOFLINEBUFS + 2 * PAD];
float y_i_work [M_i*2*NUMOFLINEBUFS + 2 * PAD];
float y_i_temp [M_i*NUMOFLINEBUFS + 2 * PAD];

/* Twiddle working buffer handed to the FFT plan. */
float w_i_work [2 + 2048/2 + 2*2048 + 2 * PAD];

/* Twiddle arrays; w_i is handed to the FFT plan by fft_memory_request(). */
float w_i [2 + 2048/2 + 2*2048 + 2 * PAD];
float w_cn[M + 2 * PAD];

/* Weight applied to the smaller of |re| and |im| in main()'s magnitude */
/* approximation: max + min * magDiv.                                    */
float magDiv = 3.0/8.0;
/* ======================================================================== */
/*  Generate pointers to skip beyond array padding                          */
/* ======================================================================== */
float *const ptr_x_i  = x_i  + PAD;
float *const ptr_x_cn = x_cn + PAD;

float *const ptr_w_i  = w_i  + PAD;
float *const ptr_w_cn = w_cn + PAD;

float *const ptr_y_i  = y_i  + PAD;
float *const ptr_y_cn = y_cn + PAD;

float *const ptr_y_i_temp  = y_i_temp + PAD;
float *const ptr_y_i_work  = y_i_work + PAD;
float *const ptr_x_i_work  = x_i_work + PAD;
float *const ptr_w_i_work  = w_i_work + PAD;

/*
 * Debug assertion helper.
 *
 * statement : condition expected to be non-zero.
 * node_id   : number printed next to the message (callers pass DNUM).
 * error     : message printed when the condition is zero.
 *
 * Returns immediately when the condition holds.  Otherwise prints
 * "<error> (<node_id>)" and spins for as long as dbg_halt stays non-zero.
 */
void fft_assert(int statement, int node_id, const char *error)
{
    /* volatile so that every pass of the spin loop re-reads the flag;
     * presumably meant to be cleared from a debugger to resume execution. */
    volatile int dbg_halt = 1;

    if (statement) {
        return;
    }

    printf("%s (%d)\n",error,node_id);

    for (;;) {
        if (!dbg_halt) {
            break;
        }
    }
}

/*
 * Memory-request callout for the FFT plan (installed in main() as
 * plan_fxns.memoryRequest).
 *
 * nbufs : number of entries in bufs.
 * bufs  : buffer descriptors; the size and log2align fields are read and
 *         printed, the base field is written.
 *
 * Prints a table of the requested buffers and points each descriptor's base
 * at one of this file's statically allocated arrays:
 *   0: x_i   1: y_i   2: w_i   3: x_i_work   4: y_i_work   5: w_i_work
 *   6: y_i_temp
 *
 * Only descriptors that actually exist (index < nbufs) are written, so a
 * request for fewer than seven buffers no longer writes past the end of
 * bufs.  Descriptors beyond index 6 are printed but left untouched.
 */
void fft_memory_request (int nbufs, FFTmemBuffer_t *bufs)
{
    int i;

    printf ("FFT memory buffers:\n");
    printf ("    Buffer    Size(bytes)    Alignment\n");

    if (bufs == NULL) {
        return;
    }

    for (i = 0; i < nbufs; i++) {
        printf ("     %3d       %8d         %4d       \n", i, (int)bufs[i].size, (int)bufs[i].log2align);

        switch (i) {
        case 0:  bufs[i].base = ptr_x_i;      break;
        case 1:  bufs[i].base = ptr_y_i;      break;
        case 2:  bufs[i].base = ptr_w_i;      break;
        case 3:  bufs[i].base = ptr_x_i_work; break;
        case 4:  bufs[i].base = ptr_y_i_work; break;
        case 5:  bufs[i].base = ptr_w_i_work; break;
        case 6:  bufs[i].base = ptr_y_i_temp; break;
        default: /* no static array is reserved for further buffers */ break;
        }
    }

} /* fft_memory_request */

/*
 * EDMA-request callout for the FFT plan (installed in main() as
 * plan_fxns.ecpyRequest).
 *
 * Inside an OpenMP parallel region, every thread looks at the slot of the
 * core it runs on (indexed by DNUM).  Unless that core is already marked
 * FFT_EDMA_STATE_ALLOCATED, it allocates FFT_NUM_EDMA_CH channels with
 * EdmaMgr_alloc(FFT_MAX_EDMA_LINKS), recording each handle in
 * gEdmaState[core].channel[] and counting them in num_channels.  A NULL
 * handle trips fft_assert().  The core is then marked as allocated.
 *
 * Returns the address of gEdmaState[0].
 *
 * Note carried over from the original source: the edmaInstances are indexes
 * into the C6678_config[] array defined in fft_c6678_config, which is used
 * to specify how EDMA resources are divided between cores.
 */
void *fft_omp_assign_edma_resources(void)
{
    #pragma omp parallel
    {
        /* Declared inside the region, so each thread has its own copy. */
        const unsigned int core = DNUM;

        if (fftEdmaState[core] != FFT_EDMA_STATE_ALLOCATED) {
            for (gEdmaState[core].num_channels = 0;
                 gEdmaState[core].num_channels < FFT_NUM_EDMA_CH;
                 gEdmaState[core].num_channels++) {
                /* Store the handle first, then validate the stored value. */
                gEdmaState[core].channel[gEdmaState[core].num_channels] =
                    EdmaMgr_alloc(FFT_MAX_EDMA_LINKS);
                fft_assert(gEdmaState[core].channel[gEdmaState[core].num_channels] != NULL,
                           core, "EdmaMgr_alloc() failed ");
            }
        }

        fftEdmaState[core] = FFT_EDMA_STATE_ALLOCATED;
    }

    return (void *) (&gEdmaState[0]);
}

/*
 * EDMA-release callout for the FFT plan (installed in main() as
 * plan_fxns.ecpyRelease).
 *
 * edma : unused; the per-core state in gEdmaState[] is accessed directly.
 *
 * Inside an OpenMP parallel region, every thread looks at the slot of the
 * core it runs on (indexed by DNUM).  If that core is marked
 * FFT_EDMA_STATE_ALLOCATED, its channels are freed with EdmaMgr_free() in
 * reverse order of allocation, and any result other than EdmaMgr_SUCCESS
 * trips fft_assert().  The core is then marked FFT_EDMA_STATE_INIT.
 *
 * Note carried over from the original source: the edmaInstances are indexes
 * into the C6678_config[] array defined in fft_c6678_config, which is used
 * to specify how EDMA resources are divided between cores.
 */
void fft_omp_free_edma_resources(void *edma)
{
    (void)edma;

    #pragma omp parallel
    {
        /* Declared inside the parallel region so each thread has a private
         * copy.  A function-scope variable would be shared by the whole
         * team, and one thread could overwrite another thread's result
         * between EdmaMgr_free() and the check below. */
        int ret_val;

        if ( fftEdmaState[DNUM] == FFT_EDMA_STATE_ALLOCATED )
        {
            while ( gEdmaState[DNUM].num_channels > 0 )
            {
                gEdmaState[DNUM].num_channels--;
                ret_val = EdmaMgr_free(gEdmaState[DNUM].channel[gEdmaState[DNUM].num_channels]);
                fft_assert( ret_val == EdmaMgr_SUCCESS, DNUM, "EDMA free failed!");
            }
        }
        fftEdmaState[DNUM] = FFT_EDMA_STATE_INIT;
    }
}

/*
 * Memory-release callout for the FFT plan (installed in main() as
 * plan_fxns.memoryRelease).
 *
 * nbufs : number of entries in bufs (unused).
 * bufs  : buffer descriptors (unused).
 *
 * Intentionally a no-op for now: the buffers handed out by
 * fft_memory_request() are file-scope arrays, so nothing is released here.
 */
void fft_memory_release (int nbufs, FFTmemBuffer_t *bufs)
{
    (void)nbufs;
    (void)bufs;
} /* fft_memory_release */

/*
 * Test driver for the OpenMP single-precision 1D real-to-complex FFT.
 *
 * Steps:
 *   1. Install the memory and EDMA callouts and call EdmaMgr_init() on every
 *      thread of a parallel region.
 *   2. Fill the data arrays with fixed byte patterns and load a sine wave
 *      into x_i.
 *   3. Create the FFT plan, measure the cost of one EDMA allocate/free pair,
 *      then time fft_execute() and report the cycles net of that overhead.
 *   4. Time an approximate magnitude computation over the output
 *      (max(|re|,|im|) + min(|re|,|im|) * magDiv) stored into y_cn.
 *   5. Destroy the plan and print the largest absolute difference between
 *      y_cn and x_i.
 *
 * argc/argv are unused.  Always returns 0.
 */
int main (int argc, char *argv[]) {

    int     i, j, N, k = 0;
    /* _itoll(TSCH, TSCL) produces a 64-bit value, so the cycle counts are
     * kept in a 64-bit type and printed with a matching conversion rather
     * than being narrowed into clock_t and printed with %d. */
    long long t_start, t_stop, t_overhead, t_opt;
    float   diff, max_diff = 0, absReal, absImg, max, min;
    fft_plan_t p;
    fft_callout_t plan_fxns;

    (void)argc;
    (void)argv;

    N = MAXN;

    //initialize hardware timers
    TSCL=0;TSCH=0;

    // initialize callout functions
    plan_fxns.memoryRequest   = fft_memory_request;
    plan_fxns.memoryRelease   = fft_memory_release;
    plan_fxns.ecpyRequest = fft_omp_assign_edma_resources;
    plan_fxns.ecpyRelease = fft_omp_free_edma_resources;

    // initialize ECPY
    omp_set_num_threads (ACTIVE_THREAD_COUNT);

    // every thread initializes the EDMA manager for the core it runs on
    #pragma omp parallel
    {
        fft_assert( (EdmaMgr_init(DNUM, NULL) == EdmaMgr_SUCCESS), DNUM, "EdmaMgr_init() return error!");
        fftEdmaState[DNUM] = FFT_EDMA_STATE_INIT;
    }

    //Force uninitialized arrays to fixed values
    memset (x_i,  0x55, sizeof (x_i) );
    memset (x_cn, 0x55, sizeof (x_cn));
    memset (y_i,  0xA5, sizeof (y_i) );
    memset (y_cn, 0xA5, sizeof (y_cn));

    // Initialize input vector temporarily: 1000 sine periods across N samples
    printf("Initializing input vectors\n");
    for (j = 0; j < N; j++) {
      x_i[j] = sin (2 * 3.1415 * 1000 * j / (double) N);
    }

    // Create fft plan
    printf("Creating plan\n");
    p = fft_omp_sp_plan_1d_r2c (N, FFT_ECPY, plan_fxns);

    //Compute the overhead of allocating and freeing EDMA
    t_start = _itoll(TSCH, TSCL);
    p.edmaState = fft_omp_assign_edma_resources();
    fft_omp_free_edma_resources(p.edmaState);
    t_stop  = _itoll(TSCH, TSCL);
    t_overhead = t_stop - t_start;

    // ecpy fft
    printf("FFT executing\n");
    t_start = _itoll(TSCH, TSCL);
    fft_execute (p);
    t_stop = _itoll(TSCH, TSCL);
    t_opt  = (t_stop - t_start) - t_overhead;   // calculate clock cycles
    printf("Clock cycles for execute: %lld\n", t_opt);

    // Calculate magnitude: max(|re|,|im|) + min(|re|,|im|) * magDiv,
    // one value per (re, im) pair of y_i, written consecutively into y_cn
    printf("Starting FFT magnitude (MAX-MIN)\n");
    t_start = _itoll(TSCH, TSCL);               // start counter
    for (i = 0; i < N; i+=2) {
        absReal = _fabsf(y_i[i]);
        absImg = _fabsf(y_i[i+1]);
        if(absReal > absImg)
        {
            max = absReal;
            min = absImg;
        }
        else
        {
            max = absImg;
            min = absReal;
        }
        y_cn[k] = (max + (min * magDiv));
        k++;
    }
    t_stop = _itoll(TSCH, TSCL);                // stop counter
    // This loop performs no EDMA allocate/free, so the EDMA overhead is not
    // subtracted from its cycle count.
    t_opt  = t_stop - t_start;                  // calculate clock cycles
    printf("Clock cycles for magnitude (MAX-MIN): %lld\n", t_opt);

    fft_destroy_plan (p);

    //compute difference and track max difference
    // NOTE(review): this compares the magnitude array against the input array
    // over 2*N elements, although the loop above wrote only N/2 entries of
    // y_cn - confirm this is the intended check.
    max_diff = 0;
    for(i=0; i<2*N; i++) {
      diff = _fabs(ptr_y_cn[i] - ptr_x_i[i]);
      if (diff > max_diff) max_diff = diff;
    }

    printf("fft_omp_sp_1d_r2c_ecpy\tsize= %d\n", N);
    printf("max_diff = %f", max_diff);

    return 0;
}
