TDA4VM: (C7X) Transposing a column of a 2D matrix to a row using UDMA is very slow compared to a software column to row function

Amine Hamidi

Part Number: TDA4VM

Hi,

SDK version: RTOS_J721E_EVM_08_02_00_05

I created a function that transposes a column to a row using UDMA (with DRU enabled) in order to accelerate the data transfer. However, I'm getting poor performance compared to a software version of the column-to-row copy.

Is this expected behaviour for the UDMA when copying small chunks of data?

Below are the results when executed on the C7X core, with two float matrices of shape <rows: 110592, cols = 1> and <rows: 1, cols = 110592>

UdmaNdCol2Row	2847 us
Col2Row	1026 us

Here are the functions used for both the UDMA and the SW column to row functions:

/*
 * Software column-to-row copy for float matrices stored row-major.
 *
 * Copies column `colIdx` of the input matrix (inRows x inCols) into row
 * `rowIdx` of the output matrix (outRows x inRows).
 *
 * Returns false when either pointer is NULL or when colIdx / rowIdx lies
 * outside the input columns / output rows; returns true otherwise.
 */
bool Col2Row(const uint32_t colIdx,
             const uint32_t rowIdx,
             const uint32_t inRows,
             const uint32_t inCols,
             const float* in,
             const uint32_t outRows,
             float* out)
{
    if ((NULL == in) || (NULL == out))
    {
        return false;
    }
    if ((colIdx >= inCols) || (rowIdx >= outRows))
    {
        return false;
    }

    /* One output row holds one element per input row. */
    const size_t rowLen = inRows;
    float *dstRow = out + ((size_t)rowIdx * rowLen);

    for (size_t k = 0; k < rowLen; k++)
    {
        /* Element (k, colIdx) of the input lands at (rowIdx, k) of the output. */
        dstRow[k] = in[(k * inCols) + colIdx];
    }

    return true;
}

/*
 * Runs one N-D copy on udmaDefaultCh: init, trigger, wait, de-init.
 *
 * Returns true only when init, trigger and wait all return 0. The channel
 * is de-initialised whenever init succeeded, regardless of how trigger or
 * wait turned out; the de-init result is not examined.
 */
bool UdmaNdCpyTriggerAndWait(app_udma_copy_nd_prms_t *prms_nd)
{
    if (0 != appUdmaCopyNDInit(udmaDefaultCh, prms_nd))
    {
        /* Nothing was set up, so there is nothing to tear down. */
        return false;
    }

    /* Short-circuit: the wait is only issued when the trigger succeeded. */
    const bool completed = (0 == appUdmaCopyNDTrigger(udmaDefaultCh)) &&
                           (0 == appUdmaCopyNDWait(udmaDefaultCh));

    appUdmaCopyNDDeinit(udmaDefaultCh);

    return completed;
}

/*
 * Translates a UdmaNdPrms description into app_udma_copy_nd_prms_t and runs
 * the copy through UdmaNdCpyTriggerAndWait().
 *
 * The effective source / destination address is addr + offset of the
 * respective side. copy_mode and eltype are both set to 0.
 *
 * Returns true on success; logs an error and returns false when the
 * trigger-and-wait helper fails.
 */
bool UdmaNdCpy(const UdmaNdPrms* parameters)
{
    app_udma_copy_nd_prms_t nd;

    appUdmaCopyNDPrms_Init(&nd);
    nd.copy_mode = 0;
    nd.eltype = 0;

    /* Addresses */
    nd.src_addr  = (parameters->src.addr + parameters->src.offset);
    nd.dest_addr = (parameters->dst.addr + parameters->dst.offset);

    /* Source iteration counts and strides */
    nd.icnt0 = parameters->src.icnt0;
    nd.icnt1 = parameters->src.icnt1;
    nd.icnt2 = parameters->src.icnt2;
    nd.icnt3 = parameters->src.icnt3;
    nd.dim1  = parameters->src.dim1;
    nd.dim2  = parameters->src.dim2;
    nd.dim3  = parameters->src.dim3;

    /* Destination iteration counts and strides */
    nd.dicnt0 = parameters->dst.icnt0;
    nd.dicnt1 = parameters->dst.icnt1;
    nd.dicnt2 = parameters->dst.icnt2;
    nd.dicnt3 = parameters->dst.icnt3;
    nd.ddim1  = parameters->dst.dim1;
    nd.ddim2  = parameters->dst.dim2;
    nd.ddim3  = parameters->dst.dim3;

    if (UdmaNdCpyTriggerAndWait(&nd))
    {
        return true;
    }

    VX_PRINT(VX_ZONE_ERROR, "UdmaNdCpy: UdmaNdCpyTriggerAndWait failed\n");
    return false;
}

/*
 * Copies column `col_idx` of a row-major input matrix (in_rows x in_cols)
 * into row `row_idx` of a row-major output matrix (out_rows x in_rows)
 * using UdmaNdCpy(), one element of `element_size` bytes per inner transfer.
 *
 * The rows are described to the DMA as `blocks` groups of `rows` elements
 * (both limited to UINT16_MAX); rows that do not fill a whole group are
 * moved by a second transfer. On success the whole output matrix range is
 * cache-invalidated.
 *
 * Returns false when:
 *   - in or out is NULL,
 *   - col_idx / row_idx is out of range,
 *   - element_size exceeds UINT16_MAX,
 *   - in_rows needs more than UINT16_MAX groups,
 *   - a transfer fails.
 * Returns true otherwise (including in_rows == 0, where nothing is moved).
 */
bool UdmaNdCol2Row(size_t element_size,
                    uint32_t col_idx, uint32_t row_idx, 
                    uint32_t in_rows, uint32_t in_cols, const void * in,
                    uint32_t out_rows, void * out)
{
    UdmaNdPrms prms;
    const uint32_t out_cols = in_rows;
    /* Kept in 32 bits so that rows * blocks is computed unsigned and
       cannot overflow a promoted signed int. */
    uint32_t rows = 0U;
    uint32_t blocks = 0U;
    uint32_t remain_rows = 0U;
    bool ret = true;

    if ((NULL == in) || (NULL == out))
    {
        ret = false;
    }
    else if ((col_idx >= in_cols) ||
             (row_idx >= out_rows) ||
             (element_size > UINT16_MAX))
    {
        ret = false;
    }
    else if (in_rows <= UINT16_MAX)
    {
        rows = in_rows;
        blocks = 1U;
        remain_rows = 0U;
    }
    else
    {
        rows = UINT16_MAX;
        blocks = in_rows / UINT16_MAX;
        remain_rows = in_rows - (rows * blocks);

        /* The group count must itself fit the 16-bit limit used above;
           refuse instead of silently truncating it. */
        if (blocks > UINT16_MAX)
        {
            ret = false;
        }
    }

    /* With zero rows there is nothing to transfer, so no request is issued. */
    if ((true == ret) && (0U < in_rows))
    {
        // Source parameters: step one input row (in_cols elements) per element
        prms.src.addr = (uint64_t)in;
        prms.src.offset = element_size * col_idx;
        prms.src.icnt0 = element_size;
        prms.src.icnt1 = rows;
        prms.src.icnt2 = blocks;
        prms.src.icnt3 = 1;
        prms.src.dim1 = element_size * in_cols;
        prms.src.dim2 = element_size * in_cols * rows;
        prms.src.dim3 = 0;

        // Destination parameters: elements are packed back to back
        prms.dst.addr = (uint64_t)out;
        prms.dst.offset = element_size * out_cols * row_idx;
        prms.dst.icnt0 = element_size;
        prms.dst.icnt1 = rows;
        prms.dst.icnt2 = blocks;
        prms.dst.icnt3 = 1;
        prms.dst.dim1 = element_size;
        prms.dst.dim2 = element_size * rows;
        prms.dst.dim3 = 0;

        // Execute Copy
        ret = UdmaNdCpy(&prms);

        // copy remaining rows if any
        if ((true == ret) && (0U < remain_rows))
        {
            // Source parameters: continue right after the rows already moved
            prms.src.offset = prms.src.offset + (element_size * in_cols * rows * blocks);
            prms.src.icnt0 = element_size;
            prms.src.icnt1 = remain_rows;
            prms.src.icnt2 = 1;
            prms.src.icnt3 = 1;
            prms.src.dim1 = element_size * in_cols;
            prms.src.dim2 = 0;
            prms.src.dim3 = 0;

            // Destination parameters
            prms.dst.offset = prms.dst.offset + (element_size * rows * blocks);
            prms.dst.icnt0 = element_size;
            prms.dst.icnt1 = remain_rows;
            prms.dst.icnt2 = 1;
            prms.dst.icnt3 = 1;
            prms.dst.dim1 = element_size;
            prms.dst.dim2 = 0;
            prms.dst.dim3 = 0;

            // Execute Copy
            ret = UdmaNdCpy(&prms);
        }
    }

    // Invalidate the cache over the output matrix
    if (true == ret)
    {
        /* Widen before multiplying so the product of the two 32-bit counts
           cannot wrap. */
        appMemCacheInv(out, (size_t)out_rows * in_rows * element_size);
    }

    return ret;
}

over 2 years ago

0 Amine Hamidi over 2 years ago

Prodigy 145 points

Hi,

This is a follow-up to my previous comment.

SDK version: SDK-RTOS-J721E-EVM-08_05_00_11
We are using the app_udma driver provided with the sdk.
We are using one DRU channel (channel_idx >= (APP_UDMA_ND_CHANNELS_MAX / 2))
We are trying to perform a transpose operation (column to row).

We added a couple of new functions to the app_udma driver that allow us to use the transpose feature on a DRU channel.

/*
 * Fills the channel's TRPD memory with a transpose transfer request built
 * from the user-supplied N-D parameters.
 *
 * Layout used here: the TR record sits one sizeof(CSL_UdmapTR15) past the
 * start of trpd_mem, and the TR response word two such sizes past it.
 * The element-size flag is selected from prms_nd->eltype (0 and 1 both map
 * to one byte; 2, 3 and 4 map to that many bytes; any other value leaves
 * the flag untouched), and the DFMT field is set to transpose.
 * The TRPD memory is written back from cache before returning.
 */
static void appUdmaTrpdSetTransposeND(
    app_udma_ch_obj_t *ch_obj,
    const app_udma_copy_nd_prms_t *prms_nd)
{
    uint8_t            *trpdBase;
    CSL_UdmapTR15      *tr;
    uint32_t           *trResp;

    /* One-time TRPD setup (header etc.) must happen before the TR is filled */
    appUdmaTrpdInit(ch_obj, prms_nd->copy_mode);

    trpdBase = (uint8_t *)ch_obj->trpd_mem;
    tr       = (CSL_UdmapTR15 *)(trpdBase + sizeof(CSL_UdmapTR15));
    trResp   = (uint32_t *)(trpdBase + (sizeof(CSL_UdmapTR15) * 2U));

    /* Source side */
    tr->addr   = prms_nd->src_addr;
    tr->icnt0  = prms_nd->icnt0;
    tr->icnt1  = prms_nd->icnt1;
    tr->icnt2  = prms_nd->icnt2;
    tr->icnt3  = prms_nd->icnt3;
    tr->dim1   = prms_nd->dim1;
    tr->dim2   = prms_nd->dim2;
    tr->dim3   = prms_nd->dim3;

    /* Destination side */
    tr->daddr   = prms_nd->dest_addr;
    tr->dicnt0  = prms_nd->dicnt0;
    tr->dicnt1  = prms_nd->dicnt1;
    tr->dicnt2  = prms_nd->dicnt2;
    tr->dicnt3  = prms_nd->dicnt3;
    tr->ddim1   = prms_nd->ddim1;
    tr->ddim2   = prms_nd->ddim2;
    tr->ddim3   = prms_nd->ddim3;

    /* Element size: bytes per element */
    switch (prms_nd->eltype)
    {
        case 0:
        case 1:
            /* 8-bit data */
            tr->fmtflags |= CSL_FMK(UDMAP_TR_FMTFLAGS_ELYPE, CSL_UDMAP_TR_FMTFLAGS_ELYPE_1);
            break;
        case 2:
            /* 16-bit data */
            tr->fmtflags |= CSL_FMK(UDMAP_TR_FMTFLAGS_ELYPE, CSL_UDMAP_TR_FMTFLAGS_ELYPE_2);
            break;
        case 3:
            /* 24-bit data */
            tr->fmtflags |= CSL_FMK(UDMAP_TR_FMTFLAGS_ELYPE, CSL_UDMAP_TR_FMTFLAGS_ELYPE_3);
            break;
        case 4:
            /* 32-bit data */
            tr->fmtflags |= CSL_FMK(UDMAP_TR_FMTFLAGS_ELYPE, CSL_UDMAP_TR_FMTFLAGS_ELYPE_4);
            break;
        default:
            /* Unknown eltype: element-size flag is left as initialised */
            break;
    }

    /* Select transpose as the data format */
    tr->fmtflags |= CSL_FMK(UDMAP_TR_FMTFLAGS_DFMT, CSL_UDMAP_TR_FMTFLAGS_DFMT_TRANSPOSE);

    /* Reset the TR response word */
    *trResp = 0xFFFFFFFFU;

    /* Push the TRPD contents out of the cache */
    appUdmaCacheWb(ch_obj->trpd_mem, APP_UDMA_TRPD_SIZE_ALIGN);
}

/*
 * Builds a transpose TR from prms_nd and submits it on the given channel.
 *
 * When the channel was created without a ring (create_prms.use_ring == 0)
 * the TR is submitted directly through Udma_chDruSubmitTr(); otherwise the
 * TRPD physical address is queued on the channel's free-queue ring.
 *
 * Returns UDMA_SOK on success, UDMA_EFAIL when ch_handle or prms_nd is NULL,
 * or the error returned by Udma_ringQueueRaw().
 */
int32_t appUdmaTransposeNDInit(
    app_udma_ch_handle_t ch_handle,
    const app_udma_copy_nd_prms_t *prms_nd)
{
    int32_t             retVal = UDMA_SOK;

    if(NULL == ch_handle)
    {
        appLogPrintf("UDMA : ERROR: ch_handle NULL Pointer!!!\n");
        retVal = UDMA_EFAIL;
    }

    if(NULL == prms_nd)
    {
        /* Name the argument that is actually NULL so the log is not misleading */
        appLogPrintf("UDMA : ERROR: prms_nd NULL Pointer!!!\n");
        retVal = UDMA_EFAIL;
    }


    if(UDMA_SOK == retVal)
    {
        app_udma_ch_obj_t  *ch_obj = (app_udma_ch_obj_t *)ch_handle;

        /* Set the user provided transfer params */
        appUdmaTrpdSetTransposeND(ch_obj, prms_nd);

        if (0U == ch_obj->create_prms.use_ring)
        {
            /* Ring-less channel: hand the TR record (located one
               CSL_UdmapTR15 past the TRPD start) straight to the driver */
            Udma_chDruSubmitTr(ch_obj->drv_ch_handle, (CSL_UdmapTR *)((uint8_t *)ch_obj->trpd_mem + sizeof(CSL_UdmapTR15)));
        }
        else
        {
            /* Submit request */
            retVal = Udma_ringQueueRaw(
                     Udma_chGetFqRingHandle(ch_obj->drv_ch_handle), ch_obj->trpd_mem_phy);
            if(UDMA_SOK != retVal)
            {
                appLogPrintf("UDMA : ERROR: Channel queue failed!!\n");
            }
        }
    }

    return (retVal);
}

Below is the code used for our UDMA Col2Row transpose operation for float type (4 bytes).

/*
 * Issues one transpose transfer of 4-byte elements on the default N-D
 * channel and waits for it to finish.
 *
 * srcAdd / dstAdd : addresses handed to the DMA as source / destination.
 * rows            : elements per group (icnt1).
 * blocks          : number of groups (icnt2).
 * inCols          : number of columns of the source matrix; consecutive
 *                   source elements are 4 * inCols bytes apart.
 *
 * Returns true only when init, trigger AND wait all report success (0).
 * The channel is de-initialised whenever init succeeded.
 */
static bool DmaCol2RowRun(uint64_t srcAdd, uint64_t dstAdd, uint16_t rows, uint16_t blocks, uint32_t inCols)
{
    const uint32_t elemBytes = 4U; /* float */
    app_udma_copy_nd_prms_t prms_nd;
    bool ret = false;

    appUdmaCopyNDPrms_Init(&prms_nd);
    prms_nd.copy_mode = 0;
    prms_nd.eltype = 4; // 32 bits transpose for the DRU channel

    /* Source Params: walk down one column, one element per input row */
    prms_nd.icnt0 = 1;
    prms_nd.icnt1 = rows;
    prms_nd.icnt2 = blocks;
    prms_nd.icnt3 = 1;
    prms_nd.dim1 = elemBytes * inCols;
    prms_nd.dim2 = elemBytes * inCols * rows;
    prms_nd.dim3 = 0;
    prms_nd.src_addr = srcAdd;

    /* Destination Params: elements are packed contiguously */
    prms_nd.dicnt0 = 1;
    prms_nd.dicnt1 = rows;
    prms_nd.dicnt2 = blocks;
    prms_nd.dicnt3 = 1;
    prms_nd.ddim1 = elemBytes;
    prms_nd.ddim2 = elemBytes * rows;
    prms_nd.ddim3 = 0;
    prms_nd.dest_addr = dstAdd;

    if (0 == appUdmaTransposeNDInit(udmaNdChannels[DMA_CH_IDX_DEFAULT].handle, &prms_nd))
    {
        if (0 == appUdmaCopyNDTrigger(udmaNdChannels[DMA_CH_IDX_DEFAULT].handle))
        {
            /* A failed wait means the data may not have landed; report it
               instead of returning success. */
            if (0 == appUdmaCopyNDWait(udmaNdChannels[DMA_CH_IDX_DEFAULT].handle))
            {
                ret = true;
            }
        }
        appUdmaCopyNDDeinit(udmaNdChannels[DMA_CH_IDX_DEFAULT].handle);
    }

    return ret;
}

/*
 * Transposes column `colIdx` of a row-major float matrix (inRows x inCols,
 * virtual address inAdd) into row `rowIdx` of a row-major float matrix
 * (outRows x inRows, virtual address outAdd) using DmaCol2RowRun().
 *
 * The column is moved in groups of 32 elements; rows that do not fill a
 * whole group are moved by a final, shorter transfer. On success the whole
 * output matrix range is cache-invalidated.
 *
 * Returns false when an address is 0, when colIdx is negative or not less
 * than inCols, when rowIdx is not less than outRows, or when a transfer
 * fails. Returns true otherwise (with inRows == 0 no transfer is issued).
 */
bool DmaCol2Row(int32_t colIdx,
                uint32_t rowIdx,
                uint32_t inRows,
                uint32_t inCols,
                uint64_t inAdd,
                uint32_t outRows,
                uint64_t outAdd)
{
    /* DRU can only transpose 32 elements of size 32 bits at once.
       Refer to chapter 10.4.3.3.1 of the TDA4 reference manual. */
    const uint32_t groupRows = 32U;
    const uint64_t elemBytes = 4U; /* 64-bit so every offset below is computed without 32-bit wrap */
    const uint32_t outCols = inRows;
    bool ret = true;

    if ((0U == inAdd) || (0U == outAdd))
    {
        ret = false;
    }
    else if ((colIdx < 0) || ((uint32_t)colIdx >= inCols) || (rowIdx >= outRows))
    {
        /* Same range rules as the software Col2Row(); also rejects a
           negative colIdx, which would otherwise yield a wild address. */
        ret = false;
    }
    else
    {
        uint64_t srcPadd = appMemGetVirt2PhyBufPtr(inAdd, 0) + (elemBytes * (uint64_t)colIdx);
        uint64_t dstPadd = appMemGetVirt2PhyBufPtr(outAdd, 0) + (elemBytes * outCols * rowIdx);
        uint32_t groupsLeft = inRows / groupRows;
        const uint16_t tailRows = (uint16_t)(inRows % groupRows);

        /* DmaCol2RowRun() takes a 16-bit group count, so issue the full
           groups in chunks of at most UINT16_MAX instead of truncating. */
        while ((true == ret) && (0U < groupsLeft))
        {
            const uint16_t groups =
                (groupsLeft > UINT16_MAX) ? (uint16_t)UINT16_MAX : (uint16_t)groupsLeft;

            ret = DmaCol2RowRun(srcPadd, dstPadd, (uint16_t)groupRows, groups, inCols);

            srcPadd += elemBytes * inCols * groupRows * groups;
            dstPadd += elemBytes * groupRows * groups;
            groupsLeft -= groups;
        }

        /* Rows that do not fill a complete group */
        if ((true == ret) && (0U < tailRows))
        {
            ret = DmaCol2RowRun(srcPadd, dstPadd, tailRows, 1U, inCols);
        }
    }

    if (true == ret)
    {
        appMemCacheInv((void *)(uintptr_t)outAdd, outRows * outCols * 4);
    }

    return ret;
}

And this is our Col2Row sw implementation for float type (4 bytes).

/*
 * Reference (CPU) implementation of the column-to-row copy for floats.
 *
 * Reads column `colIdx` of the row-major input (inRows x inCols) and writes
 * it as row `rowIdx` of the row-major output (outRows x inRows).
 *
 * The result is false, and nothing is written, when `in` or `out` is NULL,
 * when colIdx >= inCols, or when rowIdx >= outRows; otherwise it is true.
 */
bool Col2Row(const uint32_t colIdx,
             const uint32_t rowIdx,
             const uint32_t inRows,
             const uint32_t inCols,
             const float* in,
             const uint32_t outRows,
             float* out)
{
    const bool argsOk = (NULL != in) && (NULL != out) &&
                        (colIdx < inCols) && (rowIdx < outRows);

    if (argsOk)
    {
        /* Start of the destination row; each output row has inRows entries. */
        float *dstRow = &out[(size_t)rowIdx * inRows];

        for (uint32_t r = 0; r < inRows; r++)
        {
            /* Input row r contributes its colIdx-th entry. */
            dstRow[r] = in[((size_t)r * inCols) + colIdx];
        }
    }

    return argsOk;
}

So in order to compare the execution time of Col2Row and DmaCol2Row, we executed the following test.

We transpose the first column of srcMat into the first row of dstMat, where srcMat has 50K rows and dstMat has 50K columns.

We executed this test for:

srcMat (50K , n=1), dstMat (n=1 , 50K)
srcMat (50K , n=2), dstMat (n=2 , 50K)
srcMat (50K , n=3), dstMat (n=3 , 50K)
...
srcMat (50K , n=8), dstMat (n=8 , 50K)
srcMat (50K , n=9), dstMat (n=9 , 50K)
srcMat (50K , n=10), dstMat (n=10 , 50K)

We got the following results for execution time.

	n=1	n=2	n=3	n=4	n=5	n=6	n=7	n=8	n=9	n=10
Col2Row	400(us)	654(us)	903(us)	1155(us)	1406(us)	1644(us)	1918(us)	2179(us)	2440(us)	2698(us)
DmaCol2Row	1167(us)	1215(us)	1082(us)	1041(us)	935(us)	907(us)	832(us)	799(us)	679(us)	606(us)

While we're still transposing the same amount of data (50K * sizeof(float)), the DMA transpose execution time improves each time we increase the number of columns of srcMat. Is this expected behaviour for the DRU channel, or are we missing something in our code?

Regards

0 Brijesh Jadav over 2 years ago in reply to Amine Hamidi

TI__Guru**** 481095 points

Unlocking this ticket.

Processors

Processors forum

TDA4VM: (C7X) Transposing a column of a 2D matrix to a row using UDMA is very slow compared to a software column to row function