Linux/PROCESSOR-SDK-AM57X: OpenCL+DSP acceleration using EdmaMgr has no effect

user6010539

Tool/software: Linux

Hi TI,

I am doing OpenCV+DSP acceleration on AM5718 board，SDK version is ti-processor-sdk-linux-rt-am57xx-evm-04.01.00.06.

According to the docs in http://software-dl.ti.com/processor-sdk-linux/esd/docs/latest/linux/Foundational_Components_OpenCV.html#creating-opencl-c-kernel-optimized-for-c66-core

__constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f, -0.812999725f, 1.5959997177f };

__kernel void yuv422_to_rgb_opencl_1WorkItem(__global uchar* srcptr, __global uchar* dstptr)
{
__global uchar* src;
__global uchar* dst;

int clk_start, clk_end;
clk_start = __clock();

for(int x=0; x < 480; x++){
for(int y=0; y < 320; y++ ){
src = srcptr + (x*640*2+(y << 2));
dst = dstptr + (x*640*3 + ((y << 1)*3));

__constant float* coeffs = c_YUV2RGBCoeffs_420;

int load_src = *((int*) src);
float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};
float U = vec_src[1] - 128;
float V = vec_src[3] - 128;
float y00 = max(0.f, vec_src[0] - 16.f) * coeffs[0];
float y01 = max(0.f, vec_src[2] - 16.f) * coeffs[0];
float ruv = ((coeffs[4]*V)+0.5f);
float guv = (coeffs[3]*V)+((coeffs[2]*U)+0.5f);
float buv = ((coeffs[1]*U)+0.5f);

dst[2] = convert_uchar_sat(y00 + ruv);
dst[1] = convert_uchar_sat(y00 + guv);
dst[0] = convert_uchar_sat(y00 + buv);
dst[5] = convert_uchar_sat(y01 + ruv);
dst[4] = convert_uchar_sat(y01 + guv);
dst[3] = convert_uchar_sat(y01 + buv);
}
}
clk_end = __clock();
printf ("TIDSP clockdiff=%d\n", clk_end - clk_start);
/////////////////66549941
}

2. whit EDMA opencl kernel:

__kernel void yuv422_to_rgb_opencl_1WorkItem_EDMA(__global uchar* srcptr, __global uchar* dstptr)
{
int cols = 640;
int rows = 480;

uchar * y_ptr[LINES_CACHED];
uchar *ycurr_ptr, *dest_ptr, *dst, *src;
int rd_idx, start_rd_idx, fetch_rd_idx;

int r, g, b, cr, cg, cb, y1, y2;

EdmaMgr_Handle evIN = EdmaMgr_alloc(LINES_CACHED);
local uchar img_lines[LINES_CACHED+1][MAX_LINE_SIZE];

int i, kk;

int clk_start, clk_end;
clk_start = __clock();

if (!evIN) { printf("Failed to alloc edmaIN1 handle.\n"); return; }

dest_ptr = (uchar *)dstptr;

for(i = 0; i < (LINES_CACHED + 1); i ++){
memset ((void *)img_lines[i], 0, MAX_LINE_SIZE);
}

for(i = 1; i < LINES_CACHED; i++){
EdmaMgr_copy1D1D(evIN, (void *)(srcptr), (void *)(img_lines[i]), cols*2);
}

fetch_rd_idx = cols*2;
start_rd_idx = 0;

for (int x = 0; x < rows; x++){
EdmaMgr_wait(evIN);
rd_idx = start_rd_idx;
for(kk = 0; kk < LINES_CACHED; kk ++){
y_ptr[kk] = (uchar *)img_lines[rd_idx];
rd_idx = (rd_idx + 1) & LINES_CACHED;
}

start_rd_idx = (start_rd_idx + 1) & LINES_CACHED;
EdmaMgr_copyFast(evIN, (void*)(srcptr + fetch_rd_idx), (void*)(img_lines[rd_idx]));
fetch_rd_idx += cols*2;

ycurr_ptr = (uchar *)y_ptr[1];

for(int y=0; y<cols/2; y++){
src = ycurr_ptr + (y << 2);
dst = dest_ptr + (x*640*3 + ((y << 1)*3));

__constant float* coeffs = c_YUV2RGBCoeffs_420;

clk_end = __clock();
printf ("TIDSP clockdiff=%d\n", clk_end - clk_start);
///////////////////66420487
}

3. The result show both of their execution time are 66420487.

4. My question is: why EDMA does not have a better performance for DSP acceleration?

Thanks.

over 6 years ago

0 Rex Chang over 6 years ago

TI__Guru 50170 points

Hi,

This thread seems to be a duplicate of e2e.ti.com/.../798808.
I'll close this one and let's work on the other thread.

Rex

Processors

Processors forum

Linux/PROCESSOR-SDK-AM57X: OpenCL+DSP acceleration using EdmaMgr has no effect