1.when ,keep running ti opencl example conv1d , linux ram will continue to increase.
QueuetoSubmit is 60000us some time.
why?
QueuetoSubmit=6520 us, SubmittoStart=68163 us, StarttoEnd=353829 us, dspruntime_us=4115.766602 us,
2 I modify the example,the inux ram will not continue to increase.
but QueuetoSubmit will is 6000us some time.
code is :
main.cpp:
/******************************************************************************
* Copyright (c) 2017, Texas Instruments Incorporated - http://www.ti.com/
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Texas Instruments Incorporated nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <unistd.h>
#include <time.h>
#include <malloc.h>
#include <string.h>
#include "ocl_util.h"
#include "ti_kernels.dsp_h"
//#define TI_VERSION
struct mallinfo mi_1;
// input image is COLSxROWS, filtered output image is COLSxROWS
// while filtering is applied to each row in image
#define COLS 1920//1920/10 //1920 418
#define ROWS 1080//1080/10 //1080 8
#define COLORDEPTH 12
// 1-D size 5 Gaussion Kernel, code is generic for non-symmetric kernel
#define FILTERSIZE 5
float FILTER[FILTERSIZE] = { 0.06136f, 0.24477f, 0.38774f, 0.24477f, 0.06136f };
#define EPISOLON 0.00001f
using namespace cl;
using namespace std;
static unsigned us_diff (struct timespec &t1, struct timespec &t2)
{ return (t2.tv_sec - t1.tv_sec) * 1e6 + (t2.tv_nsec - t1.tv_nsec) / 1e3; }
static int VerifyResults(CommandQueue &Q, Buffer &pOutput,
float *pGolden, int cols, int rows);
void input_init(float *pInput,int bufSize,int VALRANGE_);
void convfilte(float *pGolden,float *pInput,float *filtercoef,
int filtlen,int bufSize,int ROWS_, int COLS_ );
typedef struct timeout_
{
cl_ulong QueuetoSubmit_;
cl_ulong SubmittoStart_;
cl_ulong StarttoEnd_;
cl_ulong timetotal_us_;
cl_ulong timeout_us_;
cl_ulong timeout_us_max_;
}timeout_str;
void ocl_event_times_tst_init( timeout_str *timeout_obj,cl_ulong timeout_us);
void ocl_event_times_tst_1(const cl::Event &ev, const char* name, timeout_str *timeout_obj);
void log_dsp(const char *file_name,const char *kernelName,int *timeoutcnt, timeout_str timeout_obj,int timeus_linux,float dspruntime_us,struct mallinfo mi_1);
void dspruntime(float *dspruntime_us,CommandQueue &Q, Buffer &bufdspruntime_us);
void bufWrite(float *src,CommandQueue &Q, Buffer &bufDst,int bufsize);
void bufWrite_end(float *src,CommandQueue &Q, Buffer &bufDst,int bufsize);
void bufRead(float *dest,CommandQueue &Q, Buffer &bufSrc,int bufsize);
cl_ulong timeout_us=0;
#define RETURN(x) return x
int ModyFun(int argc ,char *argv[]);
int TI_tst(int argc, char *argv[]);
int main(int argc, char *argv[])
{
#ifdef TI_VERSION // *///////////////////////////////
return TI_tst( argc, argv);
#else // *///////////////////////////////
return ModyFun( argc ,argv);
#endif
}
static int VerifyResults(CommandQueue &Q, Buffer &bufOutput,
float *pGolden, int cols, int rows)
{
float *pOutput = (float *) Q.enqueueMapBuffer(bufOutput, CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0, cols*rows*sizeof(float));
int num_errors = 0;
for (int i = 0; i < cols * rows; i++)
{
if ((pOutput[i] - pGolden[i] > EPISOLON) ||
(pGolden[i] - pOutput[i] > EPISOLON))
if (num_errors++ < 10)
printf("Result diff at %d: expect %f, got %f\n", i, pGolden[i],
pOutput[i]);
}
if (num_errors != 0) printf("Total %d errors\n", num_errors);
// reset output buffer for next test
Event ev1;
memset(pOutput, 0, cols*rows*sizeof(float));
Q.enqueueUnmapMemObject(bufOutput, pOutput, NULL, &ev1);
ev1.wait();
return num_errors;
}
void input_init(float *pInput,int bufSize,int VALRANGE_)
{
for (int i = 0; i < bufSize; i++)
pInput[i] = (float) (rand() % VALRANGE_);
}
void convfilte(float *pGolden,float *pInput,float *filtercoef,int filtlen,int bufSize,int ROWS_, int COLS_ )
{
for (int j = 0; j < ROWS_; j++)
for (int i = 0; i < COLS_; i++)
{
float left_2 = pInput[j * COLS + (i-2 < 0 ? 0 : i-2)];
float left_1 = pInput[j * COLS + (i-1 < 0 ? 0 : i-1)];
float self = pInput[j * COLS + i];
float right_1 = pInput[j * COLS + (i+1 >= COLS ? COLS-1 : i+1)];
float right_2 = pInput[j * COLS + (i+2 >= COLS ? COLS-1 : i+2)];
pGolden[j*COLS + i] = left_2 * filtercoef[0] + left_1 * filtercoef[1]
+ self * filtercoef[2]
+ right_1 * filtercoef[3] + right_2 * filtercoef[4];
}
}
void ocl_event_times_tst_1(const cl::Event &ev, const char* name, timeout_str *timeout_obj)
{
cl_ulong t_que, t_sub, t_strt, t_end;
ev.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &t_que);
ev.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &t_sub);
ev.getProfilingInfo(CL_PROFILING_COMMAND_START, &t_strt);
ev.getProfilingInfo(CL_PROFILING_COMMAND_END, &t_end);
/*----------------------------------------------------------------------
* Normalize the time to microseconds
*--------------------------------------------------------------------*/
t_que /= 1000; t_sub /= 1000; t_strt /= 1000; t_end /= 1000;
cl_ulong QueuetoSubmit=t_sub-t_que;
cl_ulong SubmittoStart=t_strt-t_sub;
cl_ulong StarttoEnd=t_end-t_strt;
cl_ulong timeout_us=timeout_obj->timeout_us_;
cl_ulong timetotal_us=QueuetoSubmit+SubmittoStart+ StarttoEnd;
timeout_obj->QueuetoSubmit_= t_sub-t_que;
timeout_obj->SubmittoStart_= SubmittoStart;
timeout_obj->StarttoEnd_= StarttoEnd;
timeout_obj->timetotal_us_= timetotal_us;
if(timeout_obj->timetotal_us_ > timeout_obj->timeout_us_max_)
timeout_obj->timeout_us_max_=timeout_obj->timetotal_us_;
if (!name) name = "";
if(timetotal_us>=timeout_us)
{
printf("++++ocl_event_times_tst %s : ",name);
printf("Queue to Submit: %lld us\n",t_sub-t_que);
printf("++++ocl_event_times_tst %s : ",name);
printf("Submit to Start : %lld us\n",t_strt-t_sub);
printf("++++ocl_event_times_tst %s : ",name);
printf("Start to End : %lld us\n",t_end-t_strt);
// printf("++++ocl_event_times_tst %s : Start to End : %d us\n",name,t_end-t_strt);
// cout<< name << " : Queue to Submit: " << t_sub-t_que << " us" << endl;
// cout<< name << " : Submit to Start : " << t_strt-t_sub << " us" << endl;
// cout<< name << " : Start to End : " << t_end-t_strt << " us" << endl;
// cout<< endl;
}
}
void ocl_event_times_tst_init( timeout_str *timeout_obj,cl_ulong timeout_us)
{
timeout_obj->QueuetoSubmit_=0;
timeout_obj->SubmittoStart_=0;
timeout_obj->StarttoEnd_=0;
timeout_obj->timetotal_us_=0;
timeout_obj->timeout_us_=timeout_us;
timeout_obj->timeout_us_max_=0;
}
void log_dsp(const char *file_name,const char *kernelName,int *timeoutcnt, timeout_str timeout_obj,int timeus_linux,float dspruntime_us,struct mallinfo mi_1)
{
//char *file_name="file_dsplog_conv1d_tst.txt";
if(timeout_obj.timeout_us_max_>timeout_obj.timeout_us_)
{
FILE *file_dsplog;
// file_dsplog=fopen(file_name,"w"); // /home/root/dsp_test/
file_dsplog=fopen(file_name,"a+");
// fprintf(file_dsplog,"************************\n");
// fprintf(file_dsplog,"************************\n");
// fprintf(file_dsplog,"DSP running log:\n");
timeoutcnt[0]+=1;
fprintf(file_dsplog,"kernelName=%s,",kernelName);
fprintf(file_dsplog,"QueuetoSubmit=%lld us, ",timeout_obj.QueuetoSubmit_);
fprintf(file_dsplog,"SubmittoStart=%lld us, ",timeout_obj.SubmittoStart_);
fprintf(file_dsplog,"StarttoEnd=%lld us, ",timeout_obj.StarttoEnd_);
fprintf(file_dsplog,"dspruntime_us=%f us, ",dspruntime_us);
fprintf(file_dsplog,"max_time_out=%lld us, ",timeout_obj.timeout_us_max_);
fprintf(file_dsplog,"timeout_us=%lld us, ",timeout_obj.timeout_us_);
fprintf(file_dsplog,"timeus_linux=%d us, ",timeus_linux);
fprintf(file_dsplog," cnt_time_out=%d, ",timeoutcnt[0]);
//mi_1 = mallinfo();
fprintf(file_dsplog," mallinfo=%d, ",mi_1.uordblks);
// printf("\n mallinfo :%d \n",mi_1.uordblks); //print memory overflow
fprintf(file_dsplog,"\n");
fclose(file_dsplog);
}
}
void dspruntime(float *dspruntime_us,CommandQueue &Q, Buffer &bufdspruntime_us)
{
#if 0
float *pdspruntime_us = (float *) Q.enqueueMapBuffer(bufdspruntime_us, CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0, 1*sizeof(float));
*dspruntime_us=*pdspruntime_us;
// //printf("++++++dspruntime_us=%f us \n",dspruntime_us,mallinfo());
Event ev;
//memset(&dspruntime_us, 0, 1*sizeof(float));
Q.enqueueUnmapMemObject(bufdspruntime_us, pdspruntime_us, NULL, &ev);
ev.wait();
#else
bufRead(dspruntime_us,Q, bufdspruntime_us,1*sizeof(float));
#endif
}
void bufWrite(float *src,CommandQueue &Q, Buffer &bufDst,int bufsize)
{
src = (float *) Q.enqueueMapBuffer(bufDst, CL_TRUE,
CL_MAP_WRITE, 0, bufsize);
printf("++++bufWrite fun\n");
}
void bufWrite_end(float *src,CommandQueue &Q, Buffer &bufDst,int bufsize)
{
Q.enqueueUnmapMemObject(bufDst, src);
Q.finish();
printf("++++bufWrite_end fun\n");
}
void bufRead(float *dest,CommandQueue &Q, Buffer &bufSrc,int bufsize)
{
int i=0;
float *dest_t = (float *) Q.enqueueMapBuffer(bufSrc, CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0, bufsize);
for(i=0;i<bufsize/4;i++)
{
dest[i]=dest_t[i];
}
// //printf("++++++dspruntime_us=%f us \n",dspruntime_us,mallinfo());
Event ev;
//memset(&dspruntime_us, 0, 1*sizeof(float));
Q.enqueueUnmapMemObject(bufSrc, dest_t, NULL, &ev);
ev.wait();
ev.~Wrapper();
Q.finish();
}
int ModyFun(int argc ,char *argv[])
{
int timeoutcnt=0;
struct timespec t0, t1;
int num_errors = 0;
int input_numcompunits = 0;
if (argc > 1) input_numcompunits = atoi(argv[1]); // valid: 1, 2, 4, 8
timeout_us=18000;
if(argc > 2) timeout_us = atoi(argv[2]);
printf("+++timeout_us=%lld us\n",timeout_us);
Context context (CL_DEVICE_TYPE_ACCELERATOR);
std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
CommandQueue Q(context, devices[0], CL_QUEUE_PROFILING_ENABLE);
int NUMCOMPUNITS;
cl_ulong LOCALMEMSIZE;
devices[0].getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &NUMCOMPUNITS);
devices[0].getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &LOCALMEMSIZE);
printf("LOCALMEMSIZE=%d\n",LOCALMEMSIZE);
printf("NUMCOMPUNITS=%d\n",NUMCOMPUNITS);
if (input_numcompunits > 0 && NUMCOMPUNITS > input_numcompunits)
NUMCOMPUNITS = input_numcompunits;
printf("NUMCOMPUNITS=%d\n",NUMCOMPUNITS);
Program::Binaries binary(1, make_pair(ti_kernels_dsp_bin,
sizeof(ti_kernels_dsp_bin)));
Program program = Program(context, devices, binary);
program.build(devices);
float dspruntime_us=0;
int bufSize = COLS * ROWS * sizeof(float);
Buffer bufInput( context, CL_MEM_READ_WRITE, bufSize); //CL_MEM_READ_ONLY
Buffer bufOutput(context, CL_MEM_READ_WRITE, bufSize); //CL_MEM_WRITE_ONLY
Buffer bufFilter(context, CL_MEM_READ_ONLY, FILTERSIZE * sizeof(float));
Buffer bufdspruntime_us(context, CL_MEM_READ_WRITE, 1 * sizeof(float));
int BLOCK_HEIGHT = LOCALMEMSIZE / (2 * 2 * COLS * sizeof(float));
printf("BLOCK_HEIGHT=%d\n",BLOCK_HEIGHT);
// Prepare testing data, compute golden output on host
float *pGolden = (float *) malloc(bufSize);
if (pGolden == NULL)
{
printf("Failed to allocate memory for golden results\n");
exit(0);
}
#if 1
const char *file_name="file_dsplog_conv1d_tst.txt";
FILE *file_dsplog;
// file_dsplog=fopen(file_name,"w"); // /home/root/dsp_test/
file_dsplog=fopen(file_name,"a+");
fprintf(file_dsplog,"************************\n");
fprintf(file_dsplog,"************************\n");
fprintf(file_dsplog,"DSP running log:\n");
fprintf(file_dsplog,"NUMCOMPUNITS=%d,",NUMCOMPUNITS);
fprintf(file_dsplog,"timeout_us=%lld us, ",timeout_us);
fprintf(file_dsplog,"COLS=%d ,ROWS=%d,\n",COLS,ROWS);
fclose(file_dsplog);
#endif
//////////////////////pInput 赋值,host pGolden 2d卷积计算 start//////////////////////////
float *pInput;
#if 1
pInput = (float *) Q.enqueueMapBuffer(bufInput, CL_TRUE,
CL_MAP_WRITE, 0, bufSize);
int VALRANGE = (1 << COLORDEPTH);
input_init(pInput, COLS * ROWS, VALRANGE);
srand(time(NULL));
clock_gettime(CLOCK_MONOTONIC, &t0);
convfilte(pGolden,pInput,FILTER, FILTERSIZE, COLS * ROWS, ROWS, COLS );
clock_gettime(CLOCK_MONOTONIC, &t1);
printf("Elapsed (host_compute): %d usecs\n", us_diff(t0, t1));
Q.enqueueUnmapMemObject(bufInput, pInput);
Q.finish();
#else //error
//float *pInput;
printf("++++bufWrite fun used\n");
bufWrite(pInput,Q, bufInput, bufSize);
printf("++++bufWrite fun used over\n");
int VALRANGE = (1 << COLORDEPTH);
input_init(pInput, COLS * ROWS, VALRANGE);
srand(time(NULL));
clock_gettime(CLOCK_MONOTONIC, &t0);
convfilte(pGolden,pInput,FILTER, FILTERSIZE, COLS * ROWS, ROWS, COLS );
clock_gettime(CLOCK_MONOTONIC, &t1);
printf("Elapsed (host_compute): %d usecs\n", us_diff(t0, t1));
bufWrite_end(pInput,Q, bufInput, bufSize);
#endif
// free(pInput);
///////////////////////pInput 赋值,host pGolden 2d卷积计算 end////////////////////////
Q.enqueueWriteBuffer(bufFilter, CL_TRUE, 0, FILTERSIZE * sizeof(float),
FILTER);
///////////////////////////////
Kernel kernel(program, "null");
Kernel k_base(program, "k_conv1d_5x1");
k_base.setArg(0, bufInput);
k_base.setArg(1, bufOutput);
k_base.setArg(2, bufFilter);
k_base.setArg(3, COLS);
k_base.setArg(4, ROWS);
k_base.setArg(5, COLS);
k_base.setArg(6, COLS);
k_base.setArg(7, bufdspruntime_us);
Kernel k_loop(program, "k_loop");
k_loop.setArg(0, bufInput);
k_loop.setArg(1, bufOutput);
k_loop.setArg(2, bufFilter);
k_loop.setArg(3, COLS);
k_loop.setArg(4, ROWS);
k_loop.setArg(5, COLS);
k_loop.setArg(6, COLS);
k_loop.setArg(7, bufdspruntime_us);
Kernel k_loop_simd(program, "k_loop_simd");
k_loop_simd.setArg(0, bufInput);
k_loop_simd.setArg(1, bufOutput);
k_loop_simd.setArg(2, bufFilter);
k_loop_simd.setArg(3, COLS);
k_loop_simd.setArg(4, ROWS);
k_loop_simd.setArg(5, COLS);
k_loop_simd.setArg(6, COLS);
k_loop_simd.setArg(7, bufdspruntime_us);
Kernel k_loop_db(program, "k_loop_db");
k_loop_db.setArg(0, bufInput);
k_loop_db.setArg(1, bufOutput);
k_loop_db.setArg(2, bufFilter);
k_loop_db.setArg(3, COLS);
k_loop_db.setArg(4, ROWS);
k_loop_db.setArg(5, COLS);
k_loop_db.setArg(6, COLS);
k_loop_db.setArg(7, BLOCK_HEIGHT);
k_loop_db.setArg(8, __local(BLOCK_HEIGHT*2*COLS*sizeof(float)));
k_loop_db.setArg(9, __local(BLOCK_HEIGHT*2*COLS*sizeof(float)));
k_loop_db.setArg(10, bufdspruntime_us);
Kernel k_loop_simd_db(program, "k_loop_simd_db");
k_loop_simd_db.setArg(0, bufInput);
k_loop_simd_db.setArg(1, bufOutput);
k_loop_simd_db.setArg(2, bufFilter);
k_loop_simd_db.setArg(3, COLS);
k_loop_simd_db.setArg(4, ROWS);
k_loop_simd_db.setArg(5, COLS);
k_loop_simd_db.setArg(6, COLS);
k_loop_simd_db.setArg(7, BLOCK_HEIGHT);
k_loop_simd_db.setArg(8, __local(BLOCK_HEIGHT*2*COLS*sizeof(float)));
k_loop_simd_db.setArg(9, __local(BLOCK_HEIGHT*2*COLS*sizeof(float)));
k_loop_simd_db.setArg(10, bufdspruntime_us);
Kernel k_loop_simd_db_extc(program, "k_loop_simd_db_extc");
k_loop_simd_db_extc.setArg(0, bufInput);
k_loop_simd_db_extc.setArg(1, bufOutput);
k_loop_simd_db_extc.setArg(2, bufFilter);
k_loop_simd_db_extc.setArg(3, COLS);
k_loop_simd_db_extc.setArg(4, ROWS);
k_loop_simd_db_extc.setArg(5, COLS);
k_loop_simd_db_extc.setArg(6, COLS);
k_loop_simd_db_extc.setArg(7, BLOCK_HEIGHT);
k_loop_simd_db_extc.setArg(8, __local(BLOCK_HEIGHT*2*COLS*sizeof(float)));
k_loop_simd_db_extc.setArg(9, __local(BLOCK_HEIGHT*2*COLS*sizeof(float)));
k_loop_simd_db_extc.setArg(10, bufdspruntime_us);
try
{
while(1)
{
// The OpenCL runtime will lazily load the device program upon the first
// enqueue of a kernel from the program, so the elapsed time overall from
// the first enqueue will be longer to account for the loading of the
// program. To remove program loading overhead from kernel performance,
// enqueue a null kernel before running other kernels.
KernelFunctor null = kernel.bind(Q, NDRange(1), NDRange(1));
clock_gettime(CLOCK_MONOTONIC, &t0);
null().wait();
clock_gettime(CLOCK_MONOTONIC, &t1);
if(us_diff(t0, t1)>timeout_us)
printf("Elapsed (loading program): %d usecs\n", us_diff(t0, t1));
///////////////////////k_conv1d_5x1 start //////////////////
Event ev;
// straight-forward kernel (unoptimized, baseline)
timeout_str timeout_k_base;
ocl_event_times_tst_init( &timeout_k_base, timeout_us);
clock_gettime(CLOCK_MONOTONIC, &t0);
Q.enqueueNDRangeKernel(k_base, NullRange, NDRange(COLS, ROWS),
NDRange(COLS, ROWS/NUMCOMPUNITS), NULL, &ev);
ev.wait();
ocl_event_times_tst_1(ev, "k_base " , &timeout_k_base);
clock_gettime(CLOCK_MONOTONIC, &t1);
dspruntime(&dspruntime_us,Q,bufdspruntime_us);
//printf("++++++dspruntime_us=%f us \n",dspruntime_us,mallinfo())
mi_1 = mallinfo();
log_dsp(file_name, "k_base ",&timeoutcnt, timeout_k_base,us_diff(t0, t1),dspruntime_us,mallinfo());
if(us_diff(t0, t1)>timeout_us)
printf("Elapsed (k_baseline): %d usecs\n", us_diff(t0, t1));
num_errors += VerifyResults(Q, bufOutput, pGolden, COLS, ROWS);
if(us_diff(t0, t1)>timeout_us)
{
mi_1 = mallinfo();
printf("\n mallinfo :%d \n",mi_1.uordblks); //print memory overflow
}
ev.~Wrapper();
///////////////k_conv1d_5x1 end ////////////
///////////////k_loop end start////////////
Event ev1;
timeout_str timeout_k_loop;
ocl_event_times_tst_init( &timeout_k_loop, timeout_us);
clock_gettime(CLOCK_MONOTONIC, &t0);
Q.enqueueNDRangeKernel(k_loop, NullRange, NDRange(ROWS),
NDRange(ROWS/NUMCOMPUNITS), NULL, &ev1);
ev1.wait();
ocl_event_times_tst_1(ev1, "k_loop " , &timeout_k_loop);
clock_gettime(CLOCK_MONOTONIC, &t1);
dspruntime(&dspruntime_us,Q,bufdspruntime_us);
//printf("++++++dspruntime_us=%f us \n",dspruntime_us,mallinfo());
log_dsp(file_name, "k_loop ",&timeoutcnt, timeout_k_loop,us_diff(t0, t1),dspruntime_us,mallinfo());
if(us_diff(t0, t1)>timeout_us)
printf("Elapsed (k_loop): %d usecs\n", us_diff(t0, t1));
num_errors += VerifyResults(Q, bufOutput, pGolden, COLS, ROWS);
ev1.~Wrapper();
///////////////k_loop end end////////////
///////////////k_loop_simd start////////////
Event ev2;
timeout_str timeout_k_loop_simd;
ocl_event_times_tst_init( &timeout_k_loop_simd, timeout_us);
clock_gettime(CLOCK_MONOTONIC, &t0);
Q.enqueueNDRangeKernel(k_loop_simd, NullRange, NDRange(ROWS),
NDRange(ROWS/NUMCOMPUNITS), NULL, &ev2);
ev2.wait();
ocl_event_times_tst_1(ev2, "k_loop_simd " , &timeout_k_loop_simd);
clock_gettime(CLOCK_MONOTONIC, &t1);
dspruntime(&dspruntime_us,Q,bufdspruntime_us);
log_dsp(file_name, "k_loop_simd ",&timeoutcnt, timeout_k_loop_simd,us_diff(t0, t1),dspruntime_us,mallinfo());
if(us_diff(t0, t1)>timeout_us)
printf("Elapsed (k_loop_simd): %d usecs\n", us_diff(t0, t1));
num_errors += VerifyResults(Q, bufOutput, pGolden, COLS, ROWS);
ev2.~Wrapper();
//printf("++++++dspruntime_us=%f us \n",dspruntime_us,mallinfo());
///////////////k_loop_simd end ////////////
///////////////pipeline, k_loop_db,k_loop_simd_db,k_loop_simd_db_extc end start////////////
// set the double buffer pipeline to at least 8 times
if (BLOCK_HEIGHT > (ROWS / NUMCOMPUNITS / 8))
BLOCK_HEIGHT = (ROWS / NUMCOMPUNITS / 8) + 1;
if (BLOCK_HEIGHT > 0)
{
Event ev3;
timeout_str timeout_k_loop_db;
ocl_event_times_tst_init( &timeout_k_loop_db, timeout_us);
clock_gettime(CLOCK_MONOTONIC, &t0);
Q.enqueueNDRangeKernel(k_loop_db, NullRange,
NDRange(NUMCOMPUNITS), NDRange(1), NULL, &ev3);
ev3.wait();
ocl_event_times_tst_1(ev3, "k_loop_db " , &timeout_k_loop_db);
clock_gettime(CLOCK_MONOTONIC, &t1);
dspruntime(&dspruntime_us,Q,bufdspruntime_us);
//printf("++++++dspruntime_us=%f us \n",dspruntime_us,mallinfo());
log_dsp(file_name, "k_loop_db ",&timeoutcnt, timeout_k_loop_db,us_diff(t0, t1),dspruntime_us,mallinfo());
if(us_diff(t0, t1)>timeout_us)
printf("Elapsed (k_loop_db): %d usecs\n", us_diff(t0, t1));
num_errors += VerifyResults(Q, bufOutput, pGolden, COLS, ROWS);
ev3.~Wrapper();
//////////////////////////////////////////
Event ev4;
timeout_str timeout_k_loop_simd_db;
ocl_event_times_tst_init( &timeout_k_loop_simd_db, timeout_us);
clock_gettime(CLOCK_MONOTONIC, &t0);
Q.enqueueNDRangeKernel(k_loop_simd_db, NullRange,
NDRange(NUMCOMPUNITS), NDRange(1), NULL, &ev4);
ev4.wait();
ocl_event_times_tst_1(ev4, "k_loop_simd_db " , &timeout_k_loop_simd_db);
clock_gettime(CLOCK_MONOTONIC, &t1);
dspruntime(&dspruntime_us,Q,bufdspruntime_us);
//printf("++++++dspruntime_us=%f us \n",dspruntime_us,mallinfo());
log_dsp(file_name, "k_loop_simd_db ",&timeoutcnt, timeout_k_loop_simd_db,us_diff(t0, t1),dspruntime_us,mallinfo());
if(us_diff(t0, t1)>timeout_us)
printf("Elapsed (k_loop_simd_db): %d usecs\n", us_diff(t0, t1));
num_errors += VerifyResults(Q, bufOutput, pGolden, COLS, ROWS);
ev4.~Wrapper();
//////////////////////////////////////////
Event ev5;
timeout_str timeout_k_loop_simd_db_extc;
ocl_event_times_tst_init( &timeout_k_loop_simd_db_extc, timeout_us);
clock_gettime(CLOCK_MONOTONIC, &t0);
Q.enqueueNDRangeKernel(k_loop_simd_db_extc, NullRange,
NDRange(NUMCOMPUNITS), NDRange(1), NULL, &ev5);
ev5.wait();
ocl_event_times_tst_1(ev5, "k_loop_simd_db_extc " , &timeout_k_loop_simd_db_extc);
clock_gettime(CLOCK_MONOTONIC, &t1);
dspruntime(&dspruntime_us,Q,bufdspruntime_us);
//printf("++++++dspruntime_us=%f us \n",dspruntime_us,mallinfo());
log_dsp(file_name, "k_loop_simd_db_extc ",&timeoutcnt, timeout_k_loop_simd_db_extc,us_diff(t0, t1),dspruntime_us,mallinfo());
if(us_diff(t0, t1)>timeout_us)
printf("Elapsed (k_loop_simd_db_extc): %d usecs\n", us_diff(t0, t1));
num_errors += VerifyResults(Q, bufOutput, pGolden, COLS, ROWS);
ev5.~Wrapper();
}
//ev.release();
//ev.Event;
//ev.~Wrapper();
}
}
catch (Error err)
{
cerr << "ERROR: " << err.what() << "(" << err.err() << ", "
<< ocl_decode_error(err.err()) << ")" << endl;
}
if (num_errors != 0)
{
cout << "Failed with " << num_errors << " errors" << endl;
RETURN(-1);
} else
{
cout << "Pass!" << endl;
RETURN(0);
}
free(pGolden);
}
int TI_tst(int argc, char *argv[]){
struct timespec t0, t1;
int num_errors = 0;
int input_numcompunits = 0;
if (argc > 1) input_numcompunits = atoi(argv[1]); // valid: 1, 2, 4, 8
Context context (CL_DEVICE_TYPE_ACCELERATOR);
std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
CommandQueue Q(context, devices[0], CL_QUEUE_PROFILING_ENABLE);
int NUMCOMPUNITS;
cl_ulong LOCALMEMSIZE;
devices[0].getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &NUMCOMPUNITS);
devices[0].getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &LOCALMEMSIZE);
if (input_numcompunits > 0 && NUMCOMPUNITS > input_numcompunits)
NUMCOMPUNITS = input_numcompunits;
printf("++NUMCOMPUNITS=%d,\n",NUMCOMPUNITS);
Program::Binaries binary(1, make_pair(ti_kernels_dsp_bin,
sizeof(ti_kernels_dsp_bin)));
Program program = Program(context, devices, binary);
program.build(devices);
// The OpenCL runtime will lazily load the device program upon the first
// enqueue of a kernel from the program, so the elapsed time overall from
// the first enqueue will be longer to account for the loading of the
// program. To remove program loading overhead from kernel performance,
// enqueue a null kernel before running other kernels.
Kernel kernel(program, "null");
KernelFunctor null = kernel.bind(Q, NDRange(1), NDRange(1));
clock_gettime(CLOCK_MONOTONIC, &t0);
null().wait();
clock_gettime(CLOCK_MONOTONIC, &t1);
printf("Elapsed (loading program): %d usecs\n", us_diff(t0, t1));
// Prepare testing data, compute golden output on host
int bufSize = COLS * ROWS * sizeof(float);
Buffer bufInput( context, CL_MEM_READ_ONLY, bufSize);
Buffer bufOutput(context, CL_MEM_WRITE_ONLY, bufSize);
Buffer bufFilter(context, CL_MEM_READ_ONLY, FILTERSIZE * sizeof(float));
#ifndef _TI_RTOS
float *pGolden = (float *) malloc(bufSize);
#else
float *pGolden = (float *) __malloc_ddr(bufSize);
#endif
if (pGolden == NULL)
{
printf("Failed to allocate memory for golden results\n");
exit(0);
}
float *pInput = (float *) Q.enqueueMapBuffer(bufInput, CL_TRUE,
CL_MAP_WRITE, 0, bufSize);
int VALRANGE = (1 << COLORDEPTH);
for (int i = 0; i < COLS * ROWS; i++)
pInput[i] = (float) (rand() % VALRANGE);
srand(time(NULL));
clock_gettime(CLOCK_MONOTONIC, &t0);
for (int j = 0; j < ROWS; j++)
for (int i = 0; i < COLS; i++)
{
float left_2 = pInput[j * COLS + (i-2 < 0 ? 0 : i-2)];
float left_1 = pInput[j * COLS + (i-1 < 0 ? 0 : i-1)];
float self = pInput[j * COLS + i];
float right_1 = pInput[j * COLS + (i+1 >= COLS ? COLS-1 : i+1)];
float right_2 = pInput[j * COLS + (i+2 >= COLS ? COLS-1 : i+2)];
pGolden[j*COLS + i] = left_2 * FILTER[0] + left_1 * FILTER[1]
+ self * FILTER[2]
+ right_1 * FILTER[3] + right_2 * FILTER[4];
}
clock_gettime(CLOCK_MONOTONIC, &t1);
printf("Elapsed (host_compute): %d usecs\n", us_diff(t0, t1));
Q.enqueueUnmapMemObject(bufInput, pInput);
Q.finish();
Q.enqueueWriteBuffer(bufFilter, CL_TRUE, 0, FILTERSIZE * sizeof(float),
FILTER);
try
{
while(1){
Event ev;
// straight-forward kernel (unoptimized, baseline)
Kernel k_base(program, "k_conv1d_5x1");
k_base.setArg(0, bufInput);
k_base.setArg(1, bufOutput);
k_base.setArg(2, bufFilter);
k_base.setArg(3, COLS);
k_base.setArg(4, ROWS);
k_base.setArg(5, COLS);
k_base.setArg(6, COLS);
clock_gettime(CLOCK_MONOTONIC, &t0);
Q.enqueueNDRangeKernel(k_base, NullRange, NDRange(COLS, ROWS),
NDRange(COLS, ROWS/NUMCOMPUNITS), NULL, &ev);
ev.wait();
clock_gettime(CLOCK_MONOTONIC, &t1);
printf("Elapsed (k_baseline): %d usecs\n", us_diff(t0, t1));
num_errors += VerifyResults(Q, bufOutput, pGolden, COLS, ROWS);
Kernel k_loop(program, "k_loop");
k_loop.setArg(0, bufInput);
k_loop.setArg(1, bufOutput);
k_loop.setArg(2, bufFilter);
k_loop.setArg(3, COLS);
k_loop.setArg(4, ROWS);
k_loop.setArg(5, COLS);
k_loop.setArg(6, COLS);
clock_gettime(CLOCK_MONOTONIC, &t0);
Q.enqueueNDRangeKernel(k_loop, NullRange, NDRange(ROWS),
NDRange(ROWS/NUMCOMPUNITS), NULL, &ev);
ev.wait();
clock_gettime(CLOCK_MONOTONIC, &t1);
printf("Elapsed (k_loop): %d usecs\n", us_diff(t0, t1));
num_errors += VerifyResults(Q, bufOutput, pGolden, COLS, ROWS);
Kernel k_loop_simd(program, "k_loop_simd");
k_loop_simd.setArg(0, bufInput);
k_loop_simd.setArg(1, bufOutput);
k_loop_simd.setArg(2, bufFilter);
k_loop_simd.setArg(3, COLS);
k_loop_simd.setArg(4, ROWS);
k_loop_simd.setArg(5, COLS);
k_loop_simd.setArg(6, COLS);
clock_gettime(CLOCK_MONOTONIC, &t0);
Q.enqueueNDRangeKernel(k_loop_simd, NullRange, NDRange(ROWS),
NDRange(ROWS/NUMCOMPUNITS), NULL, &ev);
ev.wait();
clock_gettime(CLOCK_MONOTONIC, &t1);
printf("Elapsed (k_loop_simd): %d usecs\n", us_diff(t0, t1));
num_errors += VerifyResults(Q, bufOutput, pGolden, COLS, ROWS);
int BLOCK_HEIGHT = LOCALMEMSIZE / (2 * 2 * COLS * sizeof(float));
// set the double buffer pipeline to at least 8 times
if (BLOCK_HEIGHT > (ROWS / NUMCOMPUNITS / 8))
BLOCK_HEIGHT = (ROWS / NUMCOMPUNITS / 8) + 1;
if (BLOCK_HEIGHT > 0)
{
Kernel k_loop_db(program, "k_loop_db");
k_loop_db.setArg(0, bufInput);
k_loop_db.setArg(1, bufOutput);
k_loop_db.setArg(2, bufFilter);
k_loop_db.setArg(3, COLS);
k_loop_db.setArg(4, ROWS);
k_loop_db.setArg(5, COLS);
k_loop_db.setArg(6, COLS);
k_loop_db.setArg(7, BLOCK_HEIGHT);
k_loop_db.setArg(8, __local(BLOCK_HEIGHT*2*COLS*sizeof(float)));
k_loop_db.setArg(9, __local(BLOCK_HEIGHT*2*COLS*sizeof(float)));
clock_gettime(CLOCK_MONOTONIC, &t0);
Q.enqueueNDRangeKernel(k_loop_db, NullRange,
NDRange(NUMCOMPUNITS), NDRange(1), NULL, &ev);
ev.wait();
clock_gettime(CLOCK_MONOTONIC, &t1);
printf("Elapsed (k_loop_db): %d usecs\n", us_diff(t0, t1));
num_errors += VerifyResults(Q, bufOutput, pGolden, COLS, ROWS);
Kernel k_loop_simd_db(program, "k_loop_simd_db");
k_loop_simd_db.setArg(0, bufInput);
k_loop_simd_db.setArg(1, bufOutput);
k_loop_simd_db.setArg(2, bufFilter);
k_loop_simd_db.setArg(3, COLS);
k_loop_simd_db.setArg(4, ROWS);
k_loop_simd_db.setArg(5, COLS);
k_loop_simd_db.setArg(6, COLS);
k_loop_simd_db.setArg(7, BLOCK_HEIGHT);
k_loop_simd_db.setArg(8, __local(BLOCK_HEIGHT*2*COLS*sizeof(float)));
k_loop_simd_db.setArg(9, __local(BLOCK_HEIGHT*2*COLS*sizeof(float)));
clock_gettime(CLOCK_MONOTONIC, &t0);
Q.enqueueNDRangeKernel(k_loop_simd_db, NullRange,
NDRange(NUMCOMPUNITS), NDRange(1), NULL, &ev);
ev.wait();
clock_gettime(CLOCK_MONOTONIC, &t1);
printf("Elapsed (k_loop_simd_db): %d usecs\n", us_diff(t0, t1));
num_errors += VerifyResults(Q, bufOutput, pGolden, COLS, ROWS);
Kernel k_loop_simd_db_extc(program, "k_loop_simd_db_extc");
k_loop_simd_db_extc.setArg(0, bufInput);
k_loop_simd_db_extc.setArg(1, bufOutput);
k_loop_simd_db_extc.setArg(2, bufFilter);
k_loop_simd_db_extc.setArg(3, COLS);
k_loop_simd_db_extc.setArg(4, ROWS);
k_loop_simd_db_extc.setArg(5, COLS);
k_loop_simd_db_extc.setArg(6, COLS);
k_loop_simd_db_extc.setArg(7, BLOCK_HEIGHT);
k_loop_simd_db_extc.setArg(8, __local(BLOCK_HEIGHT*2*COLS*sizeof(float)));
k_loop_simd_db_extc.setArg(9, __local(BLOCK_HEIGHT*2*COLS*sizeof(float)));
clock_gettime(CLOCK_MONOTONIC, &t0);
Q.enqueueNDRangeKernel(k_loop_simd_db_extc, NullRange,
NDRange(NUMCOMPUNITS), NDRange(1), NULL, &ev);
ev.wait();
clock_gettime(CLOCK_MONOTONIC, &t1);
printf("Elapsed (k_loop_simd_db_extc): %d usecs\n", us_diff(t0, t1));
num_errors += VerifyResults(Q, bufOutput, pGolden, COLS, ROWS);
}
mi_1 = mallinfo();
printf("\n mallinfo :%d \n",mi_1.uordblks); //print memory overflow
}
#ifndef _TI_RTOS
free(pGolden);
#else
__free_ddr(pGolden);
#endif
}
catch (Error err)
{
cerr << "ERROR: " << err.what() << "(" << err.err() << ", "
<< ocl_decode_error(err.err()) << ")" << endl;
}
if (num_errors != 0)
{
cout << "Failed with " << num_errors << " errors" << endl;
RETURN(-1);
} else
{
cout << "Pass!" << endl;
RETURN(0);
}
}
/*
int bufSize = COLS * ROWS * sizeof(float);
Buffer bufInput( context, CL_MEM_READ_ONLY, bufSize);
Buffer bufOutput(context, CL_MEM_WRITE_ONLY, bufSize);
Buffer bufFilter(context, CL_MEM_READ_ONLY, FILTERSIZE * sizeof(float));
float *pInput = (float *) Q.enqueueMapBuffer(bufInput, CL_TRUE,
CL_MAP_WRITE, 0, bufSize);
for (int i = 0; i < COLS * ROWS; i++)
pInput[i] = (float) (rand() % VALRANGE);
Q.enqueueUnmapMemObject(bufInput, pInput);
Q.finish();
float *pOutput = (float *) Q.enqueueMapBuffer(bufOutput, CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0, cols*rows*sizeof(float));
Event ev;
memset(pOutput, 0, cols*rows*sizeof(float));
Q.enqueueUnmapMemObject(bufOutput, pOutput, NULL, &ev);
ev.wait();
*/
/*
Q.enqueueNDRangeKernel(k_base, NullRange,
NDRange(COLS, ROWS), NDRange(COLS, ROWS/NUMCOMPUNITS), NULL, &ev);
__kernel void k_conv1d_5x1(),
Q.enqueueNDRangeKernel(k_loop, NullRange, NDRange(ROWS),
NDRange(ROWS/NUMCOMPUNITS), NULL, &ev);
__kernel void k_loop(),
Q.enqueueNDRangeKernel(k_loop_simd, NullRange, NDRange(ROWS),
NDRange(ROWS/NUMCOMPUNITS), NULL, &ev);
__kernel void k_loop_simd(),
Q.enqueueNDRangeKernel(k_loop_db, NullRange,
NDRange(NUMCOMPUNITS), NDRange(1), NULL, &ev);
__kernel __attribute__((reqd_work_group_size(1,1,1)))
void k_loop_db();
Q.enqueueNDRangeKernel(k_loop_simd_db, NullRange,
NDRange(NUMCOMPUNITS), NDRange(1), NULL, &ev);
__kernel __attribute__((reqd_work_group_size(1,1,1)))
void k_loop_simd_db(),
Q.enqueueNDRangeKernel(k_loop_simd_db_extc, NullRange,
NDRange(NUMCOMPUNITS), NDRange(1), NULL, &ev);
__kernel __attribute__((reqd_work_group_size(1,1,1)))
void k_loop_simd_db_extc()
*/
/*
__kernel void null() { }
// 1D convolution applied to each row of an image, filter size 5x1
//k_base
__kernel void k_conv1d_5x1(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch
)
{
int col = get_global_id(0);
int row = get_global_id(1);
float left_2 = input[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = input[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = input[row * inPitch + col];
float right_1 = input[row * inPitch + (col+1 >= COLS ? COLS-1 : col+1)];
float right_2 = input[row * inPitch + (col+2 >= COLS ? COLS-1 : col+2)];
// if symmetric filter, 2 multiplications can be optimized away
output[row * outPitch + col] = left_2 * filter[0] + left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3] + right_2 * filter[4];
}
//k_loop
__kernel void k_loop(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch
)
{
int row = get_global_id(0);
int col;
// no boundary checks: col: [2, COLS-3]
for (col = 2; col < COLS-2; col++)
{
float left_2 = input[row * inPitch + col-2];
float left_1 = input[row * inPitch + col-1];
float self = input[row * inPitch + col];
float right_1 = input[row * inPitch + col+1];
float right_2 = input[row * inPitch + col+2];
output[row * outPitch + col] = left_2 * filter[0] + left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3] + right_2 * filter[4];
}
// boundary conditions
// Alternatively, user can choose to pad the input data
int boundaries[4] = { 0, 1, COLS-2, COLS-1 };
for (int i = 0; i < 4; i++)
{
col = boundaries[i];
float left_2 = input[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = input[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = input[row * inPitch + col];
float right_1 = input[row * inPitch + (col+1 >= COLS ? COLS-1 : col+1)];
float right_2 = input[row * inPitch + (col+2 >= COLS ? COLS-1 : col+2)];
output[row * outPitch + col] = left_2 * filter[0] + left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3] + right_2 * filter[4];
}
}
//k_loop_simd
__kernel void k_loop_simd(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch
)
{
int row = get_global_id(0);
int col;
// _nassert(input % 8 == 0);
// _nassert(output % 8 == 0);
// _nassert(inPitch % 2 == 0);
// _nassert(outPitch % 2 == 0);
#if 0
// first version: SIMDize the computation
for (col = 2; col < COLS-2-1; col+=2)
{
float2 v2_left_2 = *((float2 *) &input[row * inPitch + col-2]);
float2 v2_self = *((float2 *) &input[row * inPitch + col]);
float2 v2_right_2 = *((float2 *) &input[row * inPitch + col+2]);
float2 v2_left_1 = (float2) (v2_left_2.s1, v2_self.s0);
float2 v2_right_1 = (float2) (v2_self.s1, v2_right_2.s0);
* ((float2 *) &output[row * outPitch + col]) =
(v2_left_2 + v2_right_2) * filter[0]
+ (v2_left_1 + v2_right_1) * filter[1]
+ v2_self * filter[2];
}
#else
// second version: pipeline the memory loads
float2 v2_left_2 = *((float2 *) &input[row * inPitch + 2-2]);
float2 v2_self = *((float2 *) &input[row * inPitch + 2]);
float2 v2_left_1 = (float2) (v2_left_2.s1, v2_self.s0);
for (col = 2; col < COLS-2-1; col+=2)
{
float2 v2_right_2 = *((float2 *) &input[row * inPitch + col+2]);
float2 v2_right_1 = (float2) (v2_self.s1, v2_right_2.s0);
* ((float2 *) &output[row * outPitch + col]) =
v2_left_2 * filter[0] + v2_left_1 * filter[1]
+ v2_self * filter[2]
+ v2_right_1 * filter[3] + v2_right_2 * filter[4];
v2_left_2 = v2_self;
v2_left_1 = v2_right_1;
v2_self = v2_right_2;
}
#endif
// boundary conditions
// Alternatively, user can choose to pad the input data
int extra_col = (col == COLS-3 ? 1 : 0);
int boundaries[5] = { 0, 1, COLS-2, COLS-1, COLS-3 };
for (int i = 0; i < 4 + extra_col; i++)
{
col = boundaries[i];
float left_2 = input[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = input[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = input[row * inPitch + col];
float right_1 = input[row * inPitch + (col+1 >= COLS ? COLS-1 : col+1)];
float right_2 = input[row * inPitch + (col+2 >= COLS ? COLS-1 : col+2)];
output[row * outPitch + col] = left_2 * filter[0] + left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3] + right_2 * filter[4];
}
}
//k_loop_db
__kernel __attribute__((reqd_work_group_size(1,1,1)))
void k_loop_db(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
__local float *restrict lInput, // double buffer
__local float *restrict lOutput // double buffer
)
{
// get the rows that this workgroup needs to process
int gid = get_global_id(0);
int gsz = get_global_size(0);
int row_begin = gid * (ROWS / gsz);
int row_end = (gid + 1) * (ROWS / gsz); // exclusive
if (gid == gsz - 1) row_end = ROWS; // exclusive
// partition rows into chunks, prefect next chunk, compute this chunk
bool first_block, last_block;
int b_row, block_height, next_b_row, next_block_height;
local float *in_buf0 = lInput;
local float *in_buf1 = lInput + BLOCK_HEIGHT * inPitch;
local float *out_buf0 = lOutput;
local float *out_buf1 = lOutput + BLOCK_HEIGHT * outPitch;
event_t ev_in0, ev_in1, ev_out0, ev_out1;
// fetch first block
block_height = (row_begin + BLOCK_HEIGHT >= row_end) ? row_end - row_begin
: BLOCK_HEIGHT;
ev_in0 = async_work_group_copy(in_buf0, &input[row_begin * inPitch],
block_height * inPitch, 0);
// for each block
for (b_row = row_begin; b_row < row_end; b_row += BLOCK_HEIGHT)
{
first_block = (b_row == row_begin);
last_block = (b_row + BLOCK_HEIGHT >= row_end);
block_height = (b_row + BLOCK_HEIGHT >= row_end) ? row_end - b_row
: BLOCK_HEIGHT;
next_b_row = b_row + block_height;
next_block_height = (next_b_row + BLOCK_HEIGHT > row_end)
? row_end - next_b_row : BLOCK_HEIGHT;
// prefetch next block
if (! last_block)
ev_in1 = async_work_group_copy(in_buf1, input + next_b_row * inPitch,
next_block_height * inPitch, 0);
// wait for prefecthed block to finish
wait_group_events(1, &ev_in0);
ev_in0 = ev_in1;
// for each row in the block: compute
for (int row = 0; row < block_height; row++)
{
int col;
// no boundary checks: col: [2, COLS-3]
for (col = 2; col < COLS-2; col++)
{
float left_2 = in_buf0[row * inPitch + col-2];
float left_1 = in_buf0[row * inPitch + col-1];
float self = in_buf0[row * inPitch + col];
float right_1 = in_buf0[row * inPitch + col+1];
float right_2 = in_buf0[row * inPitch + col+2];
out_buf0[row * outPitch + col] = left_2 * filter[0]
+ left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3]
+ right_2 * filter[4];
}
// boundary conditions
// Alternatively, user can choose to pad the input data
int boundaries[4] = { 0, 1, COLS-2, COLS-1 };
for (int i = 0; i < 4; i++)
{
col = boundaries[i];
float left_2 = in_buf0[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = in_buf0[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = in_buf0[row * inPitch + col];
float right_1 = in_buf0[row * inPitch + (col+1 >= COLS ? COLS-1:col+1)];
float right_2 = in_buf0[row * inPitch + (col+2 >= COLS ? COLS-1:col+2)];
out_buf0[row * outPitch + col] = left_2 * filter[0]
+ left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3]
+ right_2 * filter[4];
}
}
// store block output back to output image
ev_out1 = async_work_group_copy(output + b_row * outPitch, out_buf0,
block_height * outPitch, 0);
// wait for previous store to finish
if (! first_block) wait_group_events(1, &ev_out0);
ev_out0 = ev_out1;
// swap buffers for next block
if (! last_block)
{
local float *tmp = in_buf0; in_buf0 = in_buf1; in_buf1 = tmp;
tmp = out_buf0; out_buf0 = out_buf1; out_buf1 = tmp;
}
}
// wait for last block store to finish
wait_group_events(1, &ev_out0);
}
//k_loop_simd_db
__kernel __attribute__((reqd_work_group_size(1,1,1)))
void k_loop_simd_db(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
__local float *restrict lInput, // double buffer
__local float *restrict lOutput // double buffer
)
{
// get the rows that this workgroup needs to process
int gid = get_global_id(0);
int gsz = get_global_size(0);
int row_begin = gid * (ROWS / gsz);
int row_end = (gid + 1) * (ROWS / gsz); // exclusive
if (gid == gsz - 1) row_end = ROWS; // exclusive
// partition rows into chunks, prefect next chunk, compute this chunk
bool first_block, last_block;
int b_row, block_height, next_b_row, next_block_height;
local float *in_buf0 = lInput;
local float *in_buf1 = lInput + BLOCK_HEIGHT * inPitch;
local float *out_buf0 = lOutput;
local float *out_buf1 = lOutput + BLOCK_HEIGHT * outPitch;
event_t ev_in0, ev_in1, ev_out0, ev_out1;
// fetch first block
block_height = (row_begin + BLOCK_HEIGHT >= row_end) ? row_end - row_begin
: BLOCK_HEIGHT;
ev_in0 = async_work_group_copy(in_buf0, &input[row_begin * inPitch],
block_height * inPitch, 0);
// for each block
for (b_row = row_begin; b_row < row_end; b_row += BLOCK_HEIGHT)
{
first_block = (b_row == row_begin);
last_block = (b_row + BLOCK_HEIGHT >= row_end);
block_height = (b_row + BLOCK_HEIGHT >= row_end) ? row_end - b_row
: BLOCK_HEIGHT;
next_b_row = b_row + block_height;
next_block_height = (next_b_row + BLOCK_HEIGHT > row_end)
? row_end - next_b_row : BLOCK_HEIGHT;
// prefetch next block
if (! last_block)
ev_in1 = async_work_group_copy(in_buf1, input + next_b_row * inPitch,
next_block_height * inPitch, 0);
// wait for prefecthed block to finish
wait_group_events(1, &ev_in0);
ev_in0 = ev_in1;
// for each row in the block: compute
for (int row = 0; row < block_height; row++)
{
int col;
float2 v2_left_2 = *((float2 *) &in_buf0[row * inPitch + 2-2]);
float2 v2_self = *((float2 *) &in_buf0[row * inPitch + 2]);
float2 v2_left_1 = (float2) (v2_left_2.s1, v2_self.s0);
for (col = 2; col < COLS-2-1; col+=2)
{
float2 v2_right_2 = *((float2 *) &in_buf0[row * inPitch + col+2]);
float2 v2_right_1 = (float2) (v2_self.s1, v2_right_2.s0);
* ((float2 *) &out_buf0[row * outPitch + col]) =
v2_left_2 * filter[0]
+ v2_left_1 * filter[1]
+ v2_self * filter[2]
+ v2_right_1 * filter[3]
+ v2_right_2 * filter[4];
v2_left_2 = v2_self;
v2_left_1 = v2_right_1;
v2_self = v2_right_2;
}
// boundary conditions
// Alternatively, user can choose to pad the input data
int extra_col = (col == COLS-3 ? 1 : 0);
int boundaries[5] = { 0, 1, COLS-2, COLS-1, COLS-3 };
for (int i = 0; i < 4 + extra_col; i++)
{
col = boundaries[i];
float left_2 = in_buf0[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = in_buf0[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = in_buf0[row * inPitch + col];
float right_1 = in_buf0[row * inPitch + (col+1 >= COLS ? COLS-1:col+1)];
float right_2 = in_buf0[row * inPitch + (col+2 >= COLS ? COLS-1:col+2)];
out_buf0[row * outPitch + col] = left_2 * filter[0]
+ left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3]
+ right_2 * filter[4];
}
}
// store block output back to output image
ev_out1 = async_work_group_copy(output + b_row * outPitch, out_buf0,
block_height * outPitch, 0);
// wait for previous store to finish
if (! first_block) wait_group_events(1, &ev_out0);
ev_out0 = ev_out1;
// swap buffers for next block
if (! last_block)
{
local float *tmp = in_buf0; in_buf0 = in_buf1; in_buf1 = tmp;
tmp = out_buf0; out_buf0 = out_buf1; out_buf1 = tmp;
}
}
// wait for last block store to finish
wait_group_events(1, &ev_out0);
}
extern void c_loop_simd_db_extc(int row_begin, int row_end,
__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
__local float *restrict lInput, // double buffer
__local float *restrict lOutput // double buffer
);
//k_loop_simd_db_extc
__kernel __attribute__((reqd_work_group_size(1,1,1)))
void k_loop_simd_db_extc(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
__local float *restrict lInput, // double buffer
__local float *restrict lOutput // double buffer
)
{
// get the rows that this workgroup needs to process
int gid = get_global_id(0);
int gsz = get_global_size(0);
int row_begin = gid * (ROWS / gsz);
int row_end = (gid + 1) * (ROWS / gsz); // exclusive
if (gid == gsz - 1) row_end = ROWS; // exclusive
c_loop_simd_db_extc(row_begin, row_end,
input, output, filter,
COLS, ROWS, inPitch, outPitch,
BLOCK_HEIGHT, lInput, lOutput);
}
*/
kernel.cl:
/******************************************************************************
* Copyright (c) 2017, Texas Instruments Incorporated - http://www.ti.com/
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Texas Instruments Incorporated nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
//#define TI_VERSION
#define testdsptime
//#define testdsptime_1
#ifdef testdsptime
typedef struct dspRuntime_
{
uint64_t start_time_;
uint64_t end_time_;
uint64_t overhead_;
uint64_t cyclecount_;
}dspRuntime;
#endif
#ifndef TI_VERSION
__kernel void null() { }
// 1D convolution applied to each row of an image, filter size 5x1
//k_base
__kernel void k_conv1d_5x1(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
__global float *restrict dspruntime_us
)
{
#ifdef testdsptime
/*
dspRuntime dspRuntimeObj={0,0,0,0};
init_portion_tst(&dspRuntimeObj);
entry_hook_tst(&dspRuntimeObj);
*/
#endif
int col = get_global_id(0);
int row = get_global_id(1);
float left_2 = input[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = input[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = input[row * inPitch + col];
float right_1 = input[row * inPitch + (col+1 >= COLS ? COLS-1 : col+1)];
float right_2 = input[row * inPitch + (col+2 >= COLS ? COLS-1 : col+2)];
// if symmetric filter, 2 multiplications can be optimized away
output[row * outPitch + col] = left_2 * filter[0] + left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3] + right_2 * filter[4];
#ifdef testdsptime
float time_us=0;
/*
uint64_t Cyclecount=0;
exit_hook_tst(&time_us,&Cyclecount,&dspRuntimeObj);
//if(col==0)
// printf("++kernel time_us=%llf us \n",time_us);
//printf("++kernel The code section took: %lld CPU cycles\n", Cyclecount);
*/
dspruntime_us[0]=time_us;
#endif
}
//k_loop
__kernel void k_loop(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
__global float *restrict dspruntime_us
)
{
#ifdef testdsptime
/*
dspRuntime dspRuntimeObj={0,0,0,0};
init_portion_tst(&dspRuntimeObj);
entry_hook_tst(&dspRuntimeObj);
*/
#endif
int row = get_global_id(0);
int col;
// no boundary checks: col: [2, COLS-3]
for (col = 2; col < COLS-2; col++)
{
float left_2 = input[row * inPitch + col-2];
float left_1 = input[row * inPitch + col-1];
float self = input[row * inPitch + col];
float right_1 = input[row * inPitch + col+1];
float right_2 = input[row * inPitch + col+2];
output[row * outPitch + col] = left_2 * filter[0] + left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3] + right_2 * filter[4];
}
// boundary conditions
// Alternatively, user can choose to pad the input data
int boundaries[4] = { 0, 1, COLS-2, COLS-1 };
for (int i = 0; i < 4; i++)
{
col = boundaries[i];
float left_2 = input[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = input[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = input[row * inPitch + col];
float right_1 = input[row * inPitch + (col+1 >= COLS ? COLS-1 : col+1)];
float right_2 = input[row * inPitch + (col+2 >= COLS ? COLS-1 : col+2)];
output[row * outPitch + col] = left_2 * filter[0] + left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3] + right_2 * filter[4];
}
#ifdef testdsptime
float time_us=0;
/*
uint64_t Cyclecount=0;
exit_hook_tst(&time_us,&Cyclecount,&dspRuntimeObj);
//printf("++kernel time_us=%llf us \n",time_us);
//printf("++kernel The code section took: %lld CPU cycles\n", Cyclecount);
*/
dspruntime_us[0]=time_us;
#endif
}
//k_loop_simd
__kernel void k_loop_simd(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
__global float *restrict dspruntime_us
)
{
#ifdef testdsptime
/*
dspRuntime dspRuntimeObj={0,0,0,0};
init_portion_tst(&dspRuntimeObj);
entry_hook_tst(&dspRuntimeObj);
*/
#endif
int row = get_global_id(0);
int col;
// _nassert(input % 8 == 0);
// _nassert(output % 8 == 0);
// _nassert(inPitch % 2 == 0);
// _nassert(outPitch % 2 == 0);
#if 0
// first version: SIMDize the computation
for (col = 2; col < COLS-2-1; col+=2)
{
float2 v2_left_2 = *((float2 *) &input[row * inPitch + col-2]);
float2 v2_self = *((float2 *) &input[row * inPitch + col]);
float2 v2_right_2 = *((float2 *) &input[row * inPitch + col+2]);
float2 v2_left_1 = (float2) (v2_left_2.s1, v2_self.s0);
float2 v2_right_1 = (float2) (v2_self.s1, v2_right_2.s0);
* ((float2 *) &output[row * outPitch + col]) =
(v2_left_2 + v2_right_2) * filter[0]
+ (v2_left_1 + v2_right_1) * filter[1]
+ v2_self * filter[2];
}
#else
// second version: pipeline the memory loads
float2 v2_left_2 = *((float2 *) &input[row * inPitch + 2-2]);
float2 v2_self = *((float2 *) &input[row * inPitch + 2]);
float2 v2_left_1 = (float2) (v2_left_2.s1, v2_self.s0);
for (col = 2; col < COLS-2-1; col+=2)
{
float2 v2_right_2 = *((float2 *) &input[row * inPitch + col+2]);
float2 v2_right_1 = (float2) (v2_self.s1, v2_right_2.s0);
* ((float2 *) &output[row * outPitch + col]) =
v2_left_2 * filter[0] + v2_left_1 * filter[1]
+ v2_self * filter[2]
+ v2_right_1 * filter[3] + v2_right_2 * filter[4];
v2_left_2 = v2_self;
v2_left_1 = v2_right_1;
v2_self = v2_right_2;
}
#endif
// boundary conditions
// Alternatively, user can choose to pad the input data
int extra_col = (col == COLS-3 ? 1 : 0);
int boundaries[5] = { 0, 1, COLS-2, COLS-1, COLS-3 };
for (int i = 0; i < 4 + extra_col; i++)
{
col = boundaries[i];
float left_2 = input[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = input[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = input[row * inPitch + col];
float right_1 = input[row * inPitch + (col+1 >= COLS ? COLS-1 : col+1)];
float right_2 = input[row * inPitch + (col+2 >= COLS ? COLS-1 : col+2)];
output[row * outPitch + col] = left_2 * filter[0] + left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3] + right_2 * filter[4];
}
#ifdef testdsptime
float time_us=0;
/*
uint64_t Cyclecount=0;
exit_hook_tst(&time_us,&Cyclecount,&dspRuntimeObj);
//printf("++kernel time_us=%llf us \n",time_us);
//printf("++kernel The code section took: %lld CPU cycles\n", Cyclecount);
*/
dspruntime_us[0]=time_us;
#endif
}
//k_loop_db
__kernel __attribute__((reqd_work_group_size(1,1,1)))
void k_loop_db(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
__local float *restrict lInput, // double buffer
__local float *restrict lOutput, // double buffer
__global float *restrict dspruntime_us
)
{
#ifdef testdsptime
dspRuntime dspRuntimeObj={0,0,0,0};
init_portion_tst(&dspRuntimeObj);
entry_hook_tst(&dspRuntimeObj);
#endif
// get the rows that this workgroup needs to process
int gid = get_global_id(0);
int gsz = get_global_size(0);
int row_begin = gid * (ROWS / gsz);
int row_end = (gid + 1) * (ROWS / gsz); // exclusive
if (gid == gsz - 1) row_end = ROWS; // exclusive
// partition rows into chunks, prefect next chunk, compute this chunk
bool first_block, last_block;
int b_row, block_height, next_b_row, next_block_height;
local float *in_buf0 = lInput;
local float *in_buf1 = lInput + BLOCK_HEIGHT * inPitch;
local float *out_buf0 = lOutput;
local float *out_buf1 = lOutput + BLOCK_HEIGHT * outPitch;
event_t ev_in0, ev_in1, ev_out0, ev_out1;
// fetch first block
block_height = (row_begin + BLOCK_HEIGHT >= row_end) ? row_end - row_begin
: BLOCK_HEIGHT;
ev_in0 = async_work_group_copy(in_buf0, &input[row_begin * inPitch],
block_height * inPitch, 0);
// for each block
for (b_row = row_begin; b_row < row_end; b_row += BLOCK_HEIGHT)
{
first_block = (b_row == row_begin);
last_block = (b_row + BLOCK_HEIGHT >= row_end);
block_height = (b_row + BLOCK_HEIGHT >= row_end) ? row_end - b_row
: BLOCK_HEIGHT;
next_b_row = b_row + block_height;
next_block_height = (next_b_row + BLOCK_HEIGHT > row_end)
? row_end - next_b_row : BLOCK_HEIGHT;
// prefetch next block
if (! last_block)
ev_in1 = async_work_group_copy(in_buf1, input + next_b_row * inPitch,
next_block_height * inPitch, 0);
// wait for prefecthed block to finish
wait_group_events(1, &ev_in0);
ev_in0 = ev_in1;
// for each row in the block: compute
for (int row = 0; row < block_height; row++)
{
int col;
// no boundary checks: col: [2, COLS-3]
for (col = 2; col < COLS-2; col++)
{
float left_2 = in_buf0[row * inPitch + col-2];
float left_1 = in_buf0[row * inPitch + col-1];
float self = in_buf0[row * inPitch + col];
float right_1 = in_buf0[row * inPitch + col+1];
float right_2 = in_buf0[row * inPitch + col+2];
out_buf0[row * outPitch + col] = left_2 * filter[0]
+ left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3]
+ right_2 * filter[4];
}
// boundary conditions
// Alternatively, user can choose to pad the input data
int boundaries[4] = { 0, 1, COLS-2, COLS-1 };
for (int i = 0; i < 4; i++)
{
col = boundaries[i];
float left_2 = in_buf0[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = in_buf0[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = in_buf0[row * inPitch + col];
float right_1 = in_buf0[row * inPitch + (col+1 >= COLS ? COLS-1:col+1)];
float right_2 = in_buf0[row * inPitch + (col+2 >= COLS ? COLS-1:col+2)];
out_buf0[row * outPitch + col] = left_2 * filter[0]
+ left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3]
+ right_2 * filter[4];
}
}
// store block output back to output image
ev_out1 = async_work_group_copy(output + b_row * outPitch, out_buf0,
block_height * outPitch, 0);
// wait for previous store to finish
if (! first_block) wait_group_events(1, &ev_out0);
ev_out0 = ev_out1;
// swap buffers for next block
if (! last_block)
{
local float *tmp = in_buf0; in_buf0 = in_buf1; in_buf1 = tmp;
tmp = out_buf0; out_buf0 = out_buf1; out_buf1 = tmp;
}
}
// wait for last block store to finish
wait_group_events(1, &ev_out0);
#ifdef testdsptime
float time_us=0;
uint64_t Cyclecount=0;
exit_hook_tst(&time_us,&Cyclecount,&dspRuntimeObj);
//printf("++kernel time_us=%llf us \n",time_us);
//printf("++kernel The code section took: %lld CPU cycles\n", Cyclecount);
dspruntime_us[0]=time_us;
#endif
}
//k_loop_simd_db
__kernel __attribute__((reqd_work_group_size(1,1,1)))
void k_loop_simd_db(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
__local float *restrict lInput, // double buffer
__local float *restrict lOutput, // double buffer
__global float *restrict dspruntime_us
)
{
#ifdef testdsptime
dspRuntime dspRuntimeObj={0,0,0,0};
init_portion_tst(&dspRuntimeObj);
entry_hook_tst(&dspRuntimeObj);
#endif
// get the rows that this workgroup needs to process
int gid = get_global_id(0);
int gsz = get_global_size(0);
int row_begin = gid * (ROWS / gsz);
int row_end = (gid + 1) * (ROWS / gsz); // exclusive
if (gid == gsz - 1) row_end = ROWS; // exclusive
// partition rows into chunks, prefect next chunk, compute this chunk
bool first_block, last_block;
int b_row, block_height, next_b_row, next_block_height;
local float *in_buf0 = lInput;
local float *in_buf1 = lInput + BLOCK_HEIGHT * inPitch;
local float *out_buf0 = lOutput;
local float *out_buf1 = lOutput + BLOCK_HEIGHT * outPitch;
event_t ev_in0, ev_in1, ev_out0, ev_out1;
// fetch first block
block_height = (row_begin + BLOCK_HEIGHT >= row_end) ? row_end - row_begin
: BLOCK_HEIGHT;
ev_in0 = async_work_group_copy(in_buf0, &input[row_begin * inPitch],
block_height * inPitch, 0);
// for each block
for (b_row = row_begin; b_row < row_end; b_row += BLOCK_HEIGHT)
{
first_block = (b_row == row_begin);
last_block = (b_row + BLOCK_HEIGHT >= row_end);
block_height = (b_row + BLOCK_HEIGHT >= row_end) ? row_end - b_row
: BLOCK_HEIGHT;
next_b_row = b_row + block_height;
next_block_height = (next_b_row + BLOCK_HEIGHT > row_end)
? row_end - next_b_row : BLOCK_HEIGHT;
// prefetch next block
if (! last_block)
ev_in1 = async_work_group_copy(in_buf1, input + next_b_row * inPitch,
next_block_height * inPitch, 0);
// wait for prefecthed block to finish
wait_group_events(1, &ev_in0);
ev_in0 = ev_in1;
// for each row in the block: compute
for (int row = 0; row < block_height; row++)
{
int col;
float2 v2_left_2 = *((float2 *) &in_buf0[row * inPitch + 2-2]);
float2 v2_self = *((float2 *) &in_buf0[row * inPitch + 2]);
float2 v2_left_1 = (float2) (v2_left_2.s1, v2_self.s0);
for (col = 2; col < COLS-2-1; col+=2)
{
float2 v2_right_2 = *((float2 *) &in_buf0[row * inPitch + col+2]);
float2 v2_right_1 = (float2) (v2_self.s1, v2_right_2.s0);
* ((float2 *) &out_buf0[row * outPitch + col]) =
v2_left_2 * filter[0]
+ v2_left_1 * filter[1]
+ v2_self * filter[2]
+ v2_right_1 * filter[3]
+ v2_right_2 * filter[4];
v2_left_2 = v2_self;
v2_left_1 = v2_right_1;
v2_self = v2_right_2;
}
// boundary conditions
// Alternatively, user can choose to pad the input data
int extra_col = (col == COLS-3 ? 1 : 0);
int boundaries[5] = { 0, 1, COLS-2, COLS-1, COLS-3 };
for (int i = 0; i < 4 + extra_col; i++)
{
col = boundaries[i];
float left_2 = in_buf0[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = in_buf0[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = in_buf0[row * inPitch + col];
float right_1 = in_buf0[row * inPitch + (col+1 >= COLS ? COLS-1:col+1)];
float right_2 = in_buf0[row * inPitch + (col+2 >= COLS ? COLS-1:col+2)];
out_buf0[row * outPitch + col] = left_2 * filter[0]
+ left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3]
+ right_2 * filter[4];
}
}
// store block output back to output image
ev_out1 = async_work_group_copy(output + b_row * outPitch, out_buf0,
block_height * outPitch, 0);
// wait for previous store to finish
if (! first_block) wait_group_events(1, &ev_out0);
ev_out0 = ev_out1;
// swap buffers for next block
if (! last_block)
{
local float *tmp = in_buf0; in_buf0 = in_buf1; in_buf1 = tmp;
tmp = out_buf0; out_buf0 = out_buf1; out_buf1 = tmp;
}
}
// wait for last block store to finish
wait_group_events(1, &ev_out0);
#ifdef testdsptime
float time_us=0;
uint64_t Cyclecount=0;
exit_hook_tst(&time_us,&Cyclecount,&dspRuntimeObj);
//printf("++kernel time_us=%llf us \n",time_us);
//printf("++kernel The code section took: %lld CPU cycles\n", Cyclecount);
dspruntime_us[0]=time_us;
#endif
}
extern void c_loop_simd_db_extc(int row_begin, int row_end,
__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
__local float *restrict lInput, // double buffer
__local float *restrict lOutput // double buffer
);
//k_loop_simd_db_extc
__kernel __attribute__((reqd_work_group_size(1,1,1)))
void k_loop_simd_db_extc(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
__local float *restrict lInput, // double buffer
__local float *restrict lOutput, // double buffer
__global float *restrict dspruntime_us
)
{
#ifdef testdsptime
dspRuntime dspRuntimeObj={0,0,0,0};
init_portion_tst(&dspRuntimeObj);
entry_hook_tst(&dspRuntimeObj);
#endif
// get the rows that this workgroup needs to process
int gid = get_global_id(0);
int gsz = get_global_size(0);
int row_begin = gid * (ROWS / gsz);
int row_end = (gid + 1) * (ROWS / gsz); // exclusive
if (gid == gsz - 1) row_end = ROWS; // exclusive
c_loop_simd_db_extc(row_begin, row_end,
input, output, filter,
COLS, ROWS, inPitch, outPitch,
BLOCK_HEIGHT, lInput, lOutput);
#ifdef testdsptime
float time_us=0;
uint64_t Cyclecount=0;
exit_hook_tst(&time_us,&Cyclecount,&dspRuntimeObj);
//printf("++kernel time_us=%llf us \n",time_us);
//printf("++kernel The code section took: %lld CPU cycles\n", Cyclecount);
dspruntime_us[0]=time_us;
#endif
}
#else
__kernel void null() { }
// 1D convolution applied to each row of an image, filter size 5x1
__kernel void k_conv1d_5x1(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch
)
{
int col = get_global_id(0);
int row = get_global_id(1);
float left_2 = input[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = input[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = input[row * inPitch + col];
float right_1 = input[row * inPitch + (col+1 >= COLS ? COLS-1 : col+1)];
float right_2 = input[row * inPitch + (col+2 >= COLS ? COLS-1 : col+2)];
// if symmetric filter, 2 multiplications can be optimized away
output[row * outPitch + col] = left_2 * filter[0] + left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3] + right_2 * filter[4];
}
__kernel void k_loop(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch
)
{
int row = get_global_id(0);
int col;
// no boundary checks: col: [2, COLS-3]
for (col = 2; col < COLS-2; col++)
{
float left_2 = input[row * inPitch + col-2];
float left_1 = input[row * inPitch + col-1];
float self = input[row * inPitch + col];
float right_1 = input[row * inPitch + col+1];
float right_2 = input[row * inPitch + col+2];
output[row * outPitch + col] = left_2 * filter[0] + left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3] + right_2 * filter[4];
}
// boundary conditions
// Alternatively, user can choose to pad the input data
int boundaries[4] = { 0, 1, COLS-2, COLS-1 };
for (int i = 0; i < 4; i++)
{
col = boundaries[i];
float left_2 = input[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = input[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = input[row * inPitch + col];
float right_1 = input[row * inPitch + (col+1 >= COLS ? COLS-1 : col+1)];
float right_2 = input[row * inPitch + (col+2 >= COLS ? COLS-1 : col+2)];
output[row * outPitch + col] = left_2 * filter[0] + left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3] + right_2 * filter[4];
}
}
__kernel void k_loop_simd(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch
)
{
int row = get_global_id(0);
int col;
// _nassert(input % 8 == 0);
// _nassert(output % 8 == 0);
// _nassert(inPitch % 2 == 0);
// _nassert(outPitch % 2 == 0);
#if 0
// first version: SIMDize the computation
for (col = 2; col < COLS-2-1; col+=2)
{
float2 v2_left_2 = *((float2 *) &input[row * inPitch + col-2]);
float2 v2_self = *((float2 *) &input[row * inPitch + col]);
float2 v2_right_2 = *((float2 *) &input[row * inPitch + col+2]);
float2 v2_left_1 = (float2) (v2_left_2.s1, v2_self.s0);
float2 v2_right_1 = (float2) (v2_self.s1, v2_right_2.s0);
* ((float2 *) &output[row * outPitch + col]) =
(v2_left_2 + v2_right_2) * filter[0]
+ (v2_left_1 + v2_right_1) * filter[1]
+ v2_self * filter[2];
}
#else
// second version: pipeline the memory loads
float2 v2_left_2 = *((float2 *) &input[row * inPitch + 2-2]);
float2 v2_self = *((float2 *) &input[row * inPitch + 2]);
float2 v2_left_1 = (float2) (v2_left_2.s1, v2_self.s0);
for (col = 2; col < COLS-2-1; col+=2)
{
float2 v2_right_2 = *((float2 *) &input[row * inPitch + col+2]);
float2 v2_right_1 = (float2) (v2_self.s1, v2_right_2.s0);
* ((float2 *) &output[row * outPitch + col]) =
v2_left_2 * filter[0] + v2_left_1 * filter[1]
+ v2_self * filter[2]
+ v2_right_1 * filter[3] + v2_right_2 * filter[4];
v2_left_2 = v2_self;
v2_left_1 = v2_right_1;
v2_self = v2_right_2;
}
#endif
// boundary conditions
// Alternatively, user can choose to pad the input data
int extra_col = (col == COLS-3 ? 1 : 0);
int boundaries[5] = { 0, 1, COLS-2, COLS-1, COLS-3 };
for (int i = 0; i < 4 + extra_col; i++)
{
col = boundaries[i];
float left_2 = input[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = input[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = input[row * inPitch + col];
float right_1 = input[row * inPitch + (col+1 >= COLS ? COLS-1 : col+1)];
float right_2 = input[row * inPitch + (col+2 >= COLS ? COLS-1 : col+2)];
output[row * outPitch + col] = left_2 * filter[0] + left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3] + right_2 * filter[4];
}
}
__kernel __attribute__((reqd_work_group_size(1,1,1)))
void k_loop_db(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
__local float *restrict lInput, // double buffer
__local float *restrict lOutput // double buffer
)
{
// get the rows that this workgroup needs to process
int gid = get_global_id(0);
int gsz = get_global_size(0);
int row_begin = gid * (ROWS / gsz);
int row_end = (gid + 1) * (ROWS / gsz); // exclusive
if (gid == gsz - 1) row_end = ROWS; // exclusive
// partition rows into chunks, prefect next chunk, compute this chunk
bool first_block, last_block;
int b_row, block_height, next_b_row, next_block_height;
local float *in_buf0 = lInput;
local float *in_buf1 = lInput + BLOCK_HEIGHT * inPitch;
local float *out_buf0 = lOutput;
local float *out_buf1 = lOutput + BLOCK_HEIGHT * outPitch;
event_t ev_in0, ev_in1, ev_out0, ev_out1;
// fetch first block
block_height = (row_begin + BLOCK_HEIGHT >= row_end) ? row_end - row_begin
: BLOCK_HEIGHT;
ev_in0 = async_work_group_copy(in_buf0, &input[row_begin * inPitch],
block_height * inPitch, 0);
// for each block
for (b_row = row_begin; b_row < row_end; b_row += BLOCK_HEIGHT)
{
first_block = (b_row == row_begin);
last_block = (b_row + BLOCK_HEIGHT >= row_end);
block_height = (b_row + BLOCK_HEIGHT >= row_end) ? row_end - b_row
: BLOCK_HEIGHT;
next_b_row = b_row + block_height;
next_block_height = (next_b_row + BLOCK_HEIGHT > row_end)
? row_end - next_b_row : BLOCK_HEIGHT;
// prefetch next block
if (! last_block)
ev_in1 = async_work_group_copy(in_buf1, input + next_b_row * inPitch,
next_block_height * inPitch, 0);
// wait for prefecthed block to finish
wait_group_events(1, &ev_in0);
ev_in0 = ev_in1;
// for each row in the block: compute
for (int row = 0; row < block_height; row++)
{
int col;
// no boundary checks: col: [2, COLS-3]
for (col = 2; col < COLS-2; col++)
{
float left_2 = in_buf0[row * inPitch + col-2];
float left_1 = in_buf0[row * inPitch + col-1];
float self = in_buf0[row * inPitch + col];
float right_1 = in_buf0[row * inPitch + col+1];
float right_2 = in_buf0[row * inPitch + col+2];
out_buf0[row * outPitch + col] = left_2 * filter[0]
+ left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3]
+ right_2 * filter[4];
}
// boundary conditions
// Alternatively, user can choose to pad the input data
int boundaries[4] = { 0, 1, COLS-2, COLS-1 };
for (int i = 0; i < 4; i++)
{
col = boundaries[i];
float left_2 = in_buf0[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = in_buf0[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = in_buf0[row * inPitch + col];
float right_1 = in_buf0[row * inPitch + (col+1 >= COLS ? COLS-1:col+1)];
float right_2 = in_buf0[row * inPitch + (col+2 >= COLS ? COLS-1:col+2)];
out_buf0[row * outPitch + col] = left_2 * filter[0]
+ left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3]
+ right_2 * filter[4];
}
}
// store block output back to output image
ev_out1 = async_work_group_copy(output + b_row * outPitch, out_buf0,
block_height * outPitch, 0);
// wait for previous store to finish
if (! first_block) wait_group_events(1, &ev_out0);
ev_out0 = ev_out1;
// swap buffers for next block
if (! last_block)
{
local float *tmp = in_buf0; in_buf0 = in_buf1; in_buf1 = tmp;
tmp = out_buf0; out_buf0 = out_buf1; out_buf1 = tmp;
}
}
// wait for last block store to finish
wait_group_events(1, &ev_out0);
}
__kernel __attribute__((reqd_work_group_size(1,1,1)))
void k_loop_simd_db(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
__local float *restrict lInput, // double buffer
__local float *restrict lOutput // double buffer
)
{
// get the rows that this workgroup needs to process
int gid = get_global_id(0);
int gsz = get_global_size(0);
int row_begin = gid * (ROWS / gsz);
int row_end = (gid + 1) * (ROWS / gsz); // exclusive
if (gid == gsz - 1) row_end = ROWS; // exclusive
// partition rows into chunks, prefect next chunk, compute this chunk
bool first_block, last_block;
int b_row, block_height, next_b_row, next_block_height;
local float *in_buf0 = lInput;
local float *in_buf1 = lInput + BLOCK_HEIGHT * inPitch;
local float *out_buf0 = lOutput;
local float *out_buf1 = lOutput + BLOCK_HEIGHT * outPitch;
event_t ev_in0, ev_in1, ev_out0, ev_out1;
// fetch first block
block_height = (row_begin + BLOCK_HEIGHT >= row_end) ? row_end - row_begin
: BLOCK_HEIGHT;
ev_in0 = async_work_group_copy(in_buf0, &input[row_begin * inPitch],
block_height * inPitch, 0);
// for each block
for (b_row = row_begin; b_row < row_end; b_row += BLOCK_HEIGHT)
{
first_block = (b_row == row_begin);
last_block = (b_row + BLOCK_HEIGHT >= row_end);
block_height = (b_row + BLOCK_HEIGHT >= row_end) ? row_end - b_row
: BLOCK_HEIGHT;
next_b_row = b_row + block_height;
next_block_height = (next_b_row + BLOCK_HEIGHT > row_end)
? row_end - next_b_row : BLOCK_HEIGHT;
// prefetch next block
if (! last_block)
ev_in1 = async_work_group_copy(in_buf1, input + next_b_row * inPitch,
next_block_height * inPitch, 0);
// wait for prefecthed block to finish
wait_group_events(1, &ev_in0);
ev_in0 = ev_in1;
// for each row in the block: compute
for (int row = 0; row < block_height; row++)
{
int col;
float2 v2_left_2 = *((float2 *) &in_buf0[row * inPitch + 2-2]);
float2 v2_self = *((float2 *) &in_buf0[row * inPitch + 2]);
float2 v2_left_1 = (float2) (v2_left_2.s1, v2_self.s0);
for (col = 2; col < COLS-2-1; col+=2)
{
float2 v2_right_2 = *((float2 *) &in_buf0[row * inPitch + col+2]);
float2 v2_right_1 = (float2) (v2_self.s1, v2_right_2.s0);
* ((float2 *) &out_buf0[row * outPitch + col]) =
v2_left_2 * filter[0]
+ v2_left_1 * filter[1]
+ v2_self * filter[2]
+ v2_right_1 * filter[3]
+ v2_right_2 * filter[4];
v2_left_2 = v2_self;
v2_left_1 = v2_right_1;
v2_self = v2_right_2;
}
// boundary conditions
// Alternatively, user can choose to pad the input data
int extra_col = (col == COLS-3 ? 1 : 0);
int boundaries[5] = { 0, 1, COLS-2, COLS-1, COLS-3 };
for (int i = 0; i < 4 + extra_col; i++)
{
col = boundaries[i];
float left_2 = in_buf0[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = in_buf0[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = in_buf0[row * inPitch + col];
float right_1 = in_buf0[row * inPitch + (col+1 >= COLS ? COLS-1:col+1)];
float right_2 = in_buf0[row * inPitch + (col+2 >= COLS ? COLS-1:col+2)];
out_buf0[row * outPitch + col] = left_2 * filter[0]
+ left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3]
+ right_2 * filter[4];
}
}
// store block output back to output image
ev_out1 = async_work_group_copy(output + b_row * outPitch, out_buf0,
block_height * outPitch, 0);
// wait for previous store to finish
if (! first_block) wait_group_events(1, &ev_out0);
ev_out0 = ev_out1;
// swap buffers for next block
if (! last_block)
{
local float *tmp = in_buf0; in_buf0 = in_buf1; in_buf1 = tmp;
tmp = out_buf0; out_buf0 = out_buf1; out_buf1 = tmp;
}
}
// wait for last block store to finish
wait_group_events(1, &ev_out0);
}
extern void c_loop_simd_db_extc(int row_begin, int row_end,
__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
__local float *restrict lInput, // double buffer
__local float *restrict lOutput // double buffer
);
__kernel __attribute__((reqd_work_group_size(1,1,1)))
void k_loop_simd_db_extc(__global float *restrict input,
__global float *restrict output,
__global float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
__local float *restrict lInput, // double buffer
__local float *restrict lOutput // double buffer
)
{
// get the rows that this workgroup needs to process
int gid = get_global_id(0);
int gsz = get_global_size(0);
int row_begin = gid * (ROWS / gsz);
int row_end = (gid + 1) * (ROWS / gsz); // exclusive
if (gid == gsz - 1) row_end = ROWS; // exclusive
c_loop_simd_db_extc(row_begin, row_end,
input, output, filter,
COLS, ROWS, inPitch, outPitch,
BLOCK_HEIGHT, lInput, lOutput);
}
#endif
k_extc.c
/******************************************************************************
* Copyright (c) 2017, Texas Instruments Incorporated - http://www.ti.com/
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Texas Instruments Incorporated nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdbool.h>
#include <c6x.h>
#include "dsp_c.h"
#include "dsp_edmamgr.h"
void c_loop_simd_db_extc(int row_begin, int row_end,
float *restrict input,
float *restrict output,
float *restrict filter,
int COLS,
int ROWS,
int inPitch,
int outPitch,
int BLOCK_HEIGHT,
float *restrict lInput, // double buffer
float *restrict lOutput // double buffer
)
{
// partition rows into chunks, prefect next chunk, compute this chunk
bool first_block, last_block;
int row, b_row, block_height, next_b_row, next_block_height;
float *in_buf0 = lInput;
float *in_buf1 = lInput + BLOCK_HEIGHT * inPitch;
float *out_buf0 = lOutput;
float *out_buf1 = lOutput + BLOCK_HEIGHT * outPitch;
EdmaMgr_Handle chan_in, chan_out;
chan_in = __ocl_EdmaMgr_alloc_intrakernel(1);
chan_out = __ocl_EdmaMgr_alloc_intrakernel(1);
if (! chan_in || ! chan_out)
{
printf("Failed to allocate edma handle.\n");
return;
}
// fetch first block
block_height = (row_begin + BLOCK_HEIGHT >= row_end) ? row_end - row_begin
: BLOCK_HEIGHT;
EdmaMgr_copy2D1D(chan_in, &input[row_begin * inPitch], in_buf0,
COLS * sizeof(float), block_height,
inPitch *sizeof(float));
// for each block
for (b_row = row_begin; b_row < row_end; b_row += BLOCK_HEIGHT)
{
first_block = (b_row == row_begin);
last_block = (b_row + BLOCK_HEIGHT >= row_end);
block_height = (b_row + BLOCK_HEIGHT >= row_end) ? row_end - b_row
: BLOCK_HEIGHT;
next_b_row = b_row + block_height;
next_block_height = (next_b_row + BLOCK_HEIGHT > row_end)
? row_end - next_b_row : BLOCK_HEIGHT;
// wait for prefecthed block to finish
EdmaMgr_wait(chan_in);
// prefetch next block
if (! last_block)
EdmaMgr_copy2D1D(chan_in, input + next_b_row * inPitch, in_buf1,
COLS * sizeof(float), next_block_height,
inPitch *sizeof(float));
// for each row in the block: compute
for (row = 0; row < block_height; row++)
{
int col;
__float2_t v2_left_2 = _mem8_f2((void *) &in_buf0[row * inPitch + 2-2]);
__float2_t v2_self = _mem8_f2((void *) &in_buf0[row * inPitch + 2]);
__float2_t v2_left_1 = _ftof2(_lof2(v2_self), _hif2(v2_left_2));
__float2_t vf0 = _ftof2(filter[0], filter[0]);
__float2_t vf1 = _ftof2(filter[1], filter[1]);
__float2_t vf2 = _ftof2(filter[2], filter[2]);
__float2_t vf3 = _ftof2(filter[3], filter[3]);
__float2_t vf4 = _ftof2(filter[4], filter[4]);
for (col = 2; col < COLS-2-1; col+=2)
{
__float2_t v2_right_2 = _mem8_f2((void *) &in_buf0[row*inPitch+col+2]);
__float2_t v2_right_1 = _ftof2(_lof2(v2_right_2), _hif2(v2_self));
_amem8_f2((void *) &out_buf0[row * outPitch + col]) =
_daddsp(
_daddsp(
_daddsp(
_daddsp(_dmpysp(v2_left_2, vf0),
_dmpysp(v2_left_1, vf1)),
_dmpysp(v2_self, vf2)),
_dmpysp(v2_right_1, vf3)),
_dmpysp(v2_right_2, vf4));
v2_left_2 = v2_self;
v2_left_1 = v2_right_1;
v2_self = v2_right_2;
}
// boundary conditions
// Alternatively, user can choose to pad the input data
int extra_col = (col == COLS-3 ? 1 : 0);
int boundaries[5] = { 0, 1, COLS-2, COLS-1, COLS-3 };
int i;
for (i = 0; i < 4 + extra_col; i++)
{
col = boundaries[i];
float left_2 = in_buf0[row * inPitch + (col-2 < 0 ? 0 : col-2)];
float left_1 = in_buf0[row * inPitch + (col-1 < 0 ? 0 : col-1)];
float self = in_buf0[row * inPitch + col];
float right_1 = in_buf0[row * inPitch + (col+1 >= COLS ? COLS-1:col+1)];
float right_2 = in_buf0[row * inPitch + (col+2 >= COLS ? COLS-1:col+2)];
out_buf0[row * outPitch + col] = left_2 * filter[0]
+ left_1 * filter[1]
+ self * filter[2]
+ right_1 * filter[3]
+ right_2 * filter[4];
}
}
// wait for previous store to finish
if (! first_block) EdmaMgr_wait(chan_out);
// store block output back to output image
EdmaMgr_copy1D2D(chan_out, out_buf0, output + b_row * outPitch,
COLS * sizeof(float), block_height,
outPitch * sizeof(float));
// swap buffers for next block
if (! last_block)
{
float *tmp = in_buf0; in_buf0 = in_buf1; in_buf1 = tmp;
tmp = out_buf0; out_buf0 = out_buf1; out_buf1 = tmp;
}
}
// wait for last block store to finish
EdmaMgr_wait(chan_out);
EdmaMgr_free(chan_in);
EdmaMgr_free(chan_out);
}
dsp_cycles.h
#ifndef DSP_CYCLES_H_
#define DSP_CYCLES_H_
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h> // defines uint64_t
#include <c6x.h> // defines _itoll, TSCH, TSCL
#define DSP_CORECLOCK (750*1000000)
// In the variable declaration portion of the code:
uint64_t start_time, end_time, overhead, cyclecount;
typedef struct dspRuntime_
{
uint64_t start_time_;
uint64_t end_time_;
uint64_t overhead_;
uint64_t cyclecount_;
}dspRuntime;
// Code to be profiled
void test_fuctioncode();
// In the initialization portion of the code:
void init_portion();
void entry_hook();
void exit_hook(float *time_us,uint64_t *Cyclecount);
void init_portion_tst(dspRuntime *dspRuntimeObj);
void entry_hook_tst(dspRuntime *dspRuntimeObj);
void exit_hook_tst(float *time_us,uint64_t *Cyclecount,dspRuntime *dspRuntimeObj);
#endif
dsp_cycles.c
#include <stdio.h>
#include <stdint.h> // defines uint64_t
#include <c6x.h> // defines _itoll, TSCH, TSCL
#include "DSP_cycles.h"
// In the variable declaration portion of the code:
extern uint64_t start_time, end_time, overhead, cyclecount;
// Code to be profiled
void test_fuctioncode()
{
start_time = _itoll(TSCH, TSCL);
//function_or_code_here();
end_time = _itoll(TSCH, TSCL);
cyclecount = end_time-start_time-overhead;
printf("The code section took: %lld CPU cycles\n", cyclecount);
}
// In the initialization portion of the code:
void init_portion()
{
TSCL = 0; //enable TSC
start_time = _itoll(TSCH, TSCL);
end_time = _itoll(TSCH, TSCL);
overhead = end_time-start_time; //Calculating the overhead of the method.
}
void entry_hook()
{
start_time = _itoll(TSCH, TSCL);
}
void exit_hook(float *time_us,uint64_t *Cyclecount)
{
end_time = _itoll(TSCH, TSCL);
cyclecount = end_time-start_time-overhead;
//printf("The code section took: %lld CPU cycles\n", cyclecount);
//float ms=(float)cyclecount/DSP_CORECLOCK*1000.0;
//float us=(float)cyclecount/(DSP_CORECLOCK/1000000.0);
time_us[0]=(float)cyclecount/(DSP_CORECLOCK/1000000.0);
Cyclecount[0]=cyclecount;
//printf("The code section took: %11f us\n", us);
}
// In the initialization portion of the code:
void init_portion_tst(dspRuntime *dspRuntimeObj)
{
TSCL = 0; //enable TSC
dspRuntimeObj->start_time_ = _itoll(TSCH, TSCL);
dspRuntimeObj->end_time_ = _itoll(TSCH, TSCL);
dspRuntimeObj->overhead_ = dspRuntimeObj->end_time_- dspRuntimeObj->start_time_; //Calculating the overhead of the method.
}
void entry_hook_tst(dspRuntime *dspRuntimeObj)
{
dspRuntimeObj->start_time_ = _itoll(TSCH, TSCL);
}
void exit_hook_tst(float *time_us,uint64_t *Cyclecount,dspRuntime *dspRuntimeObj)
{
dspRuntimeObj->end_time_ = _itoll(TSCH, TSCL);
dspRuntimeObj->cyclecount_ = dspRuntimeObj->end_time_- dspRuntimeObj->start_time_
-dspRuntimeObj->overhead_;
//printf("The code section took: %lld CPU cycles\n", dspRuntimeObj->cyclecount_);
//float ms=(float)dspRuntimeObj->cyclecount_/DSP_CORECLOCK*1000.0;
//float us=(float)dspRuntimeObj->cyclecount_/(DSP_CORECLOCK/1000000.0);
time_us[0]=(float)dspRuntimeObj->cyclecount_/(DSP_CORECLOCK/1000000.0);
Cyclecount[0]=dspRuntimeObj->cyclecount_;
//printf("The code section took: %11f us\n", us);
}
Makefile
EXE = conv1d_tst
CXXFLAGS = -O3
CL6XFLAGS = -o3 -mw --symdebug:none
include ../make.inc
$(EXE): main.o
@$(CXX) $(CXXFLAGS) main.o $(LDFLAGS) $(LIBS) -lrt -o $@
main.o: ti_kernels.dsp_h
ti_kernels.dsp_h: DSP_cycles.obj k_extc.obj
log
************************
************************
DSP running log:
NUMCOMPUNITS=2,timeout_us=20000 us, COLS=1920 ,ROWS=1080,
kernelName=k_base ,QueuetoSubmit=1 us, SubmittoStart=8389 us, StarttoEnd=18634 us, dspruntime_us=0.000000 us, max_time_out=27024 us, timeout_us=20000 us, timeus_linux=27225 us, cnt_time_out=1, mallinfo=230096,
kernelName=k_base ,QueuetoSubmit=1 us, SubmittoStart=21 us, StarttoEnd=20956 us, dspruntime_us=0.000000 us, max_time_out=20978 us, timeout_us=20000 us, timeus_linux=22053 us, cnt_time_out=2, mallinfo=231568,