Tool/software: Linux
Hi TI,
I am doing OpenCV+DSP acceleration on AM5718 board,SDK version is ti-processor-sdk-linux-rt-am57xx-evm-04.01.00.06.
According to the docs in http://software-dl.ti.com/processor-sdk-linux/esd/docs/latest/linux/Foundational_Components_OpenCV.html#creating-opencl-c-kernel-optimized-for-c66-core
__constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f, -0.812999725f, 1.5959997177f };
__kernel void yuv422_to_rgb_opencl_1WorkItem(__global uchar* srcptr, __global uchar* dstptr)
{
__global uchar* src;
__global uchar* dst;
int clk_start, clk_end;
clk_start = __clock();
for(int x=0; x < 480; x++){
for(int y=0; y < 320; y++ ){
src = srcptr + (x*640*2+(y << 2));
dst = dstptr + (x*640*3 + ((y << 1)*3));
__constant float* coeffs = c_YUV2RGBCoeffs_420;
int load_src = *((int*) src);
float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};
float U = vec_src[1] - 128;
float V = vec_src[3] - 128;
float y00 = max(0.f, vec_src[0] - 16.f) * coeffs[0];
float y01 = max(0.f, vec_src[2] - 16.f) * coeffs[0];
float ruv = ((coeffs[4]*V)+0.5f);
float guv = (coeffs[3]*V)+((coeffs[2]*U)+0.5f);
float buv = ((coeffs[1]*U)+0.5f);
dst[2] = convert_uchar_sat(y00 + ruv);
dst[1] = convert_uchar_sat(y00 + guv);
dst[0] = convert_uchar_sat(y00 + buv);
dst[5] = convert_uchar_sat(y01 + ruv);
dst[4] = convert_uchar_sat(y01 + guv);
dst[3] = convert_uchar_sat(y01 + buv);
}
}
clk_end = __clock();
printf ("TIDSP clockdiff=%d\n", clk_end - clk_start);
/////////////////66549941
}
2. whit EDMA opencl kernel:
__kernel void yuv422_to_rgb_opencl_1WorkItem_EDMA(__global uchar* srcptr, __global uchar* dstptr)
{
int cols = 640;
int rows = 480;
uchar * y_ptr[LINES_CACHED];
uchar *ycurr_ptr, *dest_ptr, *dst, *src;
int rd_idx, start_rd_idx, fetch_rd_idx;
int r, g, b, cr, cg, cb, y1, y2;
EdmaMgr_Handle evIN = EdmaMgr_alloc(LINES_CACHED);
local uchar img_lines[LINES_CACHED+1][MAX_LINE_SIZE];
int i, kk;
int clk_start, clk_end;
clk_start = __clock();
if (!evIN) { printf("Failed to alloc edmaIN1 handle.\n"); return; }
dest_ptr = (uchar *)dstptr;
for(i = 0; i < (LINES_CACHED + 1); i ++){
memset ((void *)img_lines[i], 0, MAX_LINE_SIZE);
}
for(i = 1; i < LINES_CACHED; i++){
EdmaMgr_copy1D1D(evIN, (void *)(srcptr), (void *)(img_lines[i]), cols*2);
}
fetch_rd_idx = cols*2;
start_rd_idx = 0;
for (int x = 0; x < rows; x++){
EdmaMgr_wait(evIN);
rd_idx = start_rd_idx;
for(kk = 0; kk < LINES_CACHED; kk ++){
y_ptr[kk] = (uchar *)img_lines[rd_idx];
rd_idx = (rd_idx + 1) & LINES_CACHED;
}
start_rd_idx = (start_rd_idx + 1) & LINES_CACHED;
EdmaMgr_copyFast(evIN, (void*)(srcptr + fetch_rd_idx), (void*)(img_lines[rd_idx]));
fetch_rd_idx += cols*2;
ycurr_ptr = (uchar *)y_ptr[1];
for(int y=0; y<cols/2; y++){
src = ycurr_ptr + (y << 2);
dst = dest_ptr + (x*640*3 + ((y << 1)*3));
__constant float* coeffs = c_YUV2RGBCoeffs_420;
int load_src = *((int*) src);
float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};
float U = vec_src[1] - 128;
float V = vec_src[3] - 128;
float y00 = max(0.f, vec_src[0] - 16.f) * coeffs[0];
float y01 = max(0.f, vec_src[2] - 16.f) * coeffs[0];
float ruv = ((coeffs[4]*V)+0.5f);
float guv = (coeffs[3]*V)+((coeffs[2]*U)+0.5f);
float buv = ((coeffs[1]*U)+0.5f);
dst[2] = convert_uchar_sat(y00 + ruv);
dst[1] = convert_uchar_sat(y00 + guv);
dst[0] = convert_uchar_sat(y00 + buv);
dst[5] = convert_uchar_sat(y01 + ruv);
dst[4] = convert_uchar_sat(y01 + guv);
dst[3] = convert_uchar_sat(y01 + buv);
}
}
EdmaMgr_wait(evIN);
EdmaMgr_free(evIN);
clk_end = __clock();
printf ("TIDSP clockdiff=%d\n", clk_end - clk_start);
///////////////////66420487
}
3. The result show both of their execution time are 66420487.
4. My question is: why EDMA does not have a better performance for DSP acceleration?
Thanks.