Hi,
SDK version: RTOS_J721E_EVM_08_02_00_05
I created a function that transposes a column to a row using UDMA (with DRU enabled) in order to accelerate the data transfer. However, I'm getting worse results when compared to a software version of the column-to-row copy.
Is this expected behaviour for the UDMA when copying small chunks of data?
Below are the results when executed on the C7X core, with two float matrices of size <rows: 110592, cols: 1> and <rows: 1, cols: 110592>:
UdmaNdCol2Row | 2847 us |
Col2Row | 1026 us |
Here are the functions used for both the UDMA and the SW column-to-row implementations:
bool Col2Row(const uint32_t colIdx, const uint32_t rowIdx, const uint32_t inRows, const uint32_t inCols, const float* in, const uint32_t outRows, float* out) { bool ret = true; if ((NULL == in) || (NULL == out)) { ret = false; } else if ((colIdx >= inCols) || (rowIdx >= outRows)) { ret = false; } else { const uint32_t outCols = inRows; const float (* src)[inCols] = (const float(*)[inCols]) in; float (* dst)[outCols] = (float(*)[outCols]) out; for (uint32_t col = 0; col < outCols; col++) { dst[rowIdx][col] = src[col][colIdx]; } } return ret; } bool UdmaNdCpyTriggerAndWait(app_udma_copy_nd_prms_t *prms_nd) { bool ret = false; if (0 == appUdmaCopyNDInit(udmaDefaultCh, prms_nd)) { if (0 == appUdmaCopyNDTrigger(udmaDefaultCh)) { if (0 == appUdmaCopyNDWait(udmaDefaultCh)) { ret = true; } } appUdmaCopyNDDeinit(udmaDefaultCh); } return ret; } bool UdmaNdCpy(const UdmaNdPrms* parameters) { app_udma_copy_nd_prms_t prms_nd; bool ret = true; /* Set Up the Copy */ appUdmaCopyNDPrms_Init(&prms_nd); prms_nd.copy_mode = 0; prms_nd.eltype = 0; /* Source Params */ prms_nd.icnt0 = parameters->src.icnt0; prms_nd.icnt1 = parameters->src.icnt1; prms_nd.icnt2 = parameters->src.icnt2; prms_nd.icnt3 = parameters->src.icnt3; prms_nd.dim1 = parameters->src.dim1; prms_nd.dim2 = parameters->src.dim2; prms_nd.dim3 = parameters->src.dim3; prms_nd.src_addr = (parameters->src.addr + parameters->src.offset); /* Destination Params */ prms_nd.dicnt0 = parameters->dst.icnt0; prms_nd.dicnt1 = parameters->dst.icnt1; prms_nd.dicnt2 = parameters->dst.icnt2; prms_nd.dicnt3 = parameters->dst.icnt3; prms_nd.ddim1 = parameters->dst.dim1; prms_nd.ddim2 = parameters->dst.dim2; prms_nd.ddim3 = parameters->dst.dim3; prms_nd.dest_addr = (parameters->dst.addr + parameters->dst.offset); /* Start the copy op */ if (false == UdmaNdCpyTriggerAndWait(&prms_nd)) { VX_PRINT(VX_ZONE_ERROR, "UdmaNdCpy: UdmaNdCpyTriggerAndWait failed\n"); ret = false; } return ret; } bool UdmaNdCol2Row(size_t element_size, uint32_t col_idx, 
uint32_t row_idx, uint32_t in_rows, uint32_t in_cols, const void * in, uint32_t out_rows, void * out) { UdmaNdPrms prms; uint32_t out_cols = in_rows; uint16_t rows, remain_rows, blocks; bool ret = true; if ((col_idx >= in_cols) || (row_idx >= out_rows) || (element_size > UINT16_MAX)) { ret = false; } if (true == ret) { /* Set Up the Copy */ if (in_rows <= UINT16_MAX) { rows = in_rows; blocks = 1; remain_rows = 0; } else { rows = UINT16_MAX; blocks = in_rows / UINT16_MAX; remain_rows = in_rows - (rows * blocks); } // Source parameters prms.src.addr = (uint64_t)in; prms.src.offset = element_size * col_idx; prms.src.icnt0 = element_size; prms.src.icnt1 = rows; prms.src.icnt2 = blocks; prms.src.icnt3 = 1; prms.src.dim1 = element_size * in_cols; prms.src.dim2 = element_size * in_cols * rows; prms.src.dim3 = 0; // Destination parameters prms.dst.addr = (uint64_t)out; prms.dst.offset = element_size * out_cols * row_idx; prms.dst.icnt0 = element_size; prms.dst.icnt1 = rows; prms.dst.icnt2 = blocks; prms.dst.icnt3 = 1; prms.dst.dim1 = element_size; prms.dst.dim2 = element_size * rows; prms.dst.dim3 = 0; // Execute Copy ret = UdmaNdCpy(&prms); // copy remaining rows if any if ((true == ret) && (0 < remain_rows)) { // Source parameters prms.src.offset = prms.src.offset + (element_size * in_cols * rows * blocks); prms.src.icnt0 = element_size; prms.src.icnt1 = remain_rows; prms.src.icnt2 = 1; prms.src.icnt3 = 1; prms.src.dim1 = element_size * in_cols; prms.src.dim2 = 0; prms.src.dim3 = 0; // Destination parameters prms.dst.offset = prms.dst.offset + (element_size * rows * blocks); prms.dst.icnt0 = element_size; prms.dst.icnt1 = remain_rows; prms.dst.icnt2 = 1; prms.dst.icnt3 = 1; prms.dst.dim1 = element_size; prms.dst.dim2 = 0; prms.dst.dim3 = 0; // Execute Copy ret = UdmaNdCpy(&prms); } } // Invalid the cache if (true == ret) { appMemCacheInv(out, out_rows * in_rows * element_size); } return ret; }