This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

Linux/PROCESSOR-SDK-AM57X: OpenCL+DSP acceleration using EdmaMgr has no effect(2)

Part Number: PROCESSOR-SDK-AM57X
Other Parts Discussed in Thread: AM5718

Tool/software: Linux

Hi TI,

I am doing OpenCV+DSP acceleration on AM5718 board,SDK version is ti-processor-sdk-linux-rt-am57xx-evm-04.01.00.06.

According to the docs , I use EDMAmgr for my opencl kernel.

I found the EDMAmgr has no effect.

Below is my opencl kernel code.

1. no EDMA opencl kernel:

__constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f, -0.812999725f, 1.5959997177f };


__kernel void yuv422_to_rgb_opencl_1WorkItem(__global uchar* srcptr, __global uchar* dstptr)
{
         __global uchar* src;
         __global uchar* dst;

int clk_start, clk_end;
clk_start = __clock();

for(int x=0; x < 480; x++){
         for(int y=0; y < 320; y++ ){
                  src = srcptr + (x*640*2+(y << 2));
                  dst = dstptr + (x*640*3 + ((y << 1)*3));

__constant float* coeffs = c_YUV2RGBCoeffs_420;

int load_src = *((int*) src);
float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};
float U = vec_src[1] - 128;
float V = vec_src[3] - 128;
float y00 = max(0.f, vec_src[0] - 16.f) * coeffs[0];
float y01 = max(0.f, vec_src[2] - 16.f) * coeffs[0];
float ruv = ((coeffs[4]*V)+0.5f);
float guv = (coeffs[3]*V)+((coeffs[2]*U)+0.5f);
float buv = ((coeffs[1]*U)+0.5f);

                           dst[2] = convert_uchar_sat(y00 + ruv);
                           dst[1] = convert_uchar_sat(y00 + guv);
                           dst[0] = convert_uchar_sat(y00 + buv);
                           dst[5] = convert_uchar_sat(y01 + ruv);
                           dst[4] = convert_uchar_sat(y01 + guv);
                           dst[3] = convert_uchar_sat(y01 + buv);
                  }
         }
         clk_end = __clock();
         printf ("TIDSP clockdiff=%d\n", clk_end - clk_start);
         /////////////////66549941
}

2. whit EDMA opencl kernel:

__kernel void yuv422_to_rgb_opencl_1WorkItem_EDMA(__global uchar* srcptr, __global uchar* dstptr)
{
         int cols = 640;
         int rows = 480;

uchar * y_ptr[LINES_CACHED];
uchar *ycurr_ptr, *dest_ptr, *dst, *src;
int rd_idx, start_rd_idx, fetch_rd_idx;

int r, g, b, cr, cg, cb, y1, y2;

EdmaMgr_Handle evIN = EdmaMgr_alloc(LINES_CACHED);
local uchar img_lines[LINES_CACHED+1][MAX_LINE_SIZE];

int i, kk;

int clk_start, clk_end;
clk_start = __clock();

if (!evIN) { printf("Failed to alloc edmaIN1 handle.\n"); return; }

dest_ptr = (uchar *)dstptr;

for(i = 0; i < (LINES_CACHED + 1); i ++){
         memset ((void *)img_lines[i], 0, MAX_LINE_SIZE);
}

for(i = 1; i < LINES_CACHED; i++){
         EdmaMgr_copy1D1D(evIN, (void *)(srcptr), (void *)(img_lines[i]), cols*2);
}

fetch_rd_idx = cols*2;
start_rd_idx = 0;

for (int x = 0; x < rows; x++){
         EdmaMgr_wait(evIN);
         rd_idx = start_rd_idx;
         for(kk = 0; kk < LINES_CACHED; kk ++){
                  y_ptr[kk] = (uchar *)img_lines[rd_idx];
                  rd_idx = (rd_idx + 1) & LINES_CACHED;
         }

start_rd_idx = (start_rd_idx + 1) & LINES_CACHED;
EdmaMgr_copyFast(evIN, (void*)(srcptr + fetch_rd_idx), (void*)(img_lines[rd_idx]));
fetch_rd_idx += cols*2;

ycurr_ptr = (uchar *)y_ptr[1];

for(int y=0; y<cols/2; y++){
         src = ycurr_ptr + (y << 2);
         dst = dest_ptr + (x*640*3 + ((y << 1)*3));

         __constant float* coeffs = c_YUV2RGBCoeffs_420;

int load_src = *((int*) src);
float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};
float U = vec_src[1] - 128;
float V = vec_src[3] - 128;
float y00 = max(0.f, vec_src[0] - 16.f) * coeffs[0];
float y01 = max(0.f, vec_src[2] - 16.f) * coeffs[0];
float ruv = ((coeffs[4]*V)+0.5f);
float guv = (coeffs[3]*V)+((coeffs[2]*U)+0.5f);
float buv = ((coeffs[1]*U)+0.5f);

                  dst[2] = convert_uchar_sat(y00 + ruv);
                  dst[1] = convert_uchar_sat(y00 + guv);
                  dst[0] = convert_uchar_sat(y00 + buv);
                  dst[5] = convert_uchar_sat(y01 + ruv);
                  dst[4] = convert_uchar_sat(y01 + guv);
                  dst[3] = convert_uchar_sat(y01 + buv);
         }

  }
EdmaMgr_wait(evIN);
EdmaMgr_free(evIN);

         clk_end = __clock();
         printf ("TIDSP clockdiff=%d\n", clk_end - clk_start);
         ///////////////////66420487
}

3. The result show both of their execution time are 66420487.


4. My question is: why EDMA does not have a better performance for DSP acceleration?

Thanks.