AM6421: Help with DMA configuration...

Part Number: AM6421

Tool/software:

Hello.

Need some help with adjusted example code with slightly different functionality.


I'm trying to configure DMA to transfer packets from a queue in TCM to DDR or MSRAM by using DMA.
Started from the swtrigger example, with the difference that total space available is different for source/destination queues.

To get into context and make things easier here are some definitions (test values):

- entrySize = 32 Bytes

- chunkSize = 28160 Bytes

- srcSize = 56320 Bytes

- destSize = 506880

The idea is to transfer a chunk with every sw_trigger event at a time... (hence src/dest sizes are multiple of chunkSize which also multiple of entrySize). The total chunks are not fixed/known and data is to be filled in src queue by an external driven event (GPIO interrupt).

The A53 is running Linux, and this code is running in R5FSS0_0 (single core mode).

Here is the code that is not working:

// TCM (by linker)
volatile BOOL bDMADone                  = TRUE;
uint32_t triggerMask                    = 0;
volatile uint32_t* ch0SwTriggerReg      = NULL;

Udma_EventObject gCh0TrEventObj         __attribute__((section("MSRAM"))) = { 0 };
Udma_EventHandle gCh0TrEventHandle      __attribute__((section("MSRAM"))) = NULL;
volatile Udma_EventPrms gCh0TrEventPrms __attribute__((section("MSRAM"))) = { 0 };
Udma_ChHandle ch0Handle                 __attribute__((section("MSRAM"))) = NULL;

uint8_t gUdmaTestTrpdMem[UDMA_TRPD_SIZE] __attribute__((aligned(UDMA_CACHELINE_ALIGNMENT), section("MSRAM")));

// Debug counters
volatile uint32_t uiEnqueues = 0;
volatile uint32_t uiEnqueueCBs = 0;

void App_udmaEventCb(Udma_EventHandle eventHandle, uint32_t eventType, void* appData)
{
    uiEnqueueCBs++;
    bDMADone = TRUE;
}

static void App_udmaTrpdInit(Udma_ChHandle chHandle, uint32_t chIdx, uint8_t* trpdMem, const void* destBuf, const void* srcBuf, uint32_t destSize, uint32_t srcSize, uint32_t chunkSize, uint32_t entrySize)
{
    CSL_UdmapTR15* pTr;
    uint32_t        cqRingNum = Udma_chGetCqRingNum(chHandle);

    /* Make TRPD with TR15 TR type */
    UdmaUtils_makeTrpdTr15(trpdMem, 1U, cqRingNum);

    /* Setup TR */
    pTr = UdmaUtils_getTrpdTr15Pointer(trpdMem, 0U);
    pTr->flags = CSL_FMK(UDMAP_TR_FLAGS_TYPE, CSL_UDMAP_TR_FLAGS_TYPE_4D_BLOCK_MOVE_REPACKING_INDIRECTION);
    //pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_WAIT, 1U);
    pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_STATIC, 0U);
    pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_EOL, CSL_UDMAP_TR_FLAGS_EOL_ICNT0_ICNT1);
    pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_EVENT_SIZE, CSL_UDMAP_TR_FLAGS_EVENT_SIZE_ICNT2_DEC);
    pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_TRIGGER0, CSL_UDMAP_TR_FLAGS_TRIGGER_GLOBAL0);
    pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_TRIGGER0_TYPE, CSL_UDMAP_TR_FLAGS_TRIGGER_TYPE_ICNT2_DEC);
    pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_TRIGGER1, CSL_UDMAP_TR_FLAGS_TRIGGER_NONE);
    pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_TRIGGER1_TYPE, CSL_UDMAP_TR_FLAGS_TRIGGER_TYPE_ALL);
    pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_CMD_ID, 0x25U);  /* This will come back in TR response */
    pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_SA_INDIRECT, 0U);
    pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_DA_INDIRECT, 0U);
    pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_EOP, 1U);

    pTr->addr = (uint64_t)Udma_defaultVirtToPhyFxn(srcBuf, 0U, NULL);
    pTr->icnt0 = entrySize;                 // 32B
    pTr->icnt1 = chunkSize / entrySize;     // 28160 / 32 = 880
    pTr->icnt2 = srcSize / chunkSize;       // 56320 / 28160 = 2
    pTr->icnt3 = (uint16_t)-1;              // ???? should be infinite ????
    pTr->dim1 = entrySize;                  // 32
    pTr->dim2 = chunkSize;                  // 28160
    pTr->dim3 = 0;

    pTr->daddr = (uint64_t)Udma_defaultVirtToPhyFxn(destBuf, 0U, NULL);
    pTr->dicnt0 = entrySize;                // 32B
    pTr->dicnt1 = chunkSize / entrySize;    // 28160 / 32 = 880
    pTr->dicnt2 = destSize / chunkSize;     // 506880 / 28160 = 18
    pTr->dicnt3 = (uint16_t)-1;             // ???? should be infinite ????
    pTr->ddim1 = entrySize;                 // 32
    pTr->ddim2 = chunkSize;                 // 28160
    pTr->ddim3 = 0;

    pTr->fmtflags = 0x00000000U;    /* Linear addressing, 1 byte per elem */

    /* Perform cache writeback */
    CacheP_wb(trpdMem, UDMA_TRPD_SIZE, CacheP_TYPE_ALLD);

    return;
}

static void App_udmaTriggerInit(void)
{
    int32_t retVal;
    Udma_DrvHandle  drvHandle = &gUdmaDrvObj[CONFIG_UDMA0];

    gCh0TrEventHandle = &gCh0TrEventObj;
    UdmaEventPrms_init((Udma_EventPrms*)& gCh0TrEventPrms);
    gCh0TrEventPrms.eventType = UDMA_EVENT_TYPE_TR;
    gCh0TrEventPrms.eventMode = UDMA_EVENT_MODE_SHARED;
    gCh0TrEventPrms.chHandle = ch0Handle;
    gCh0TrEventPrms.controllerEventHandle = NULL;
    gCh0TrEventPrms.eventCb = App_udmaEventCb;
    gCh0TrEventPrms.appData = NULL;
    retVal = Udma_eventRegister(drvHandle, gCh0TrEventHandle, (Udma_EventPrms*)&gCh0TrEventPrms);
    if (UDMA_SOK != retVal)
        DebugMsg("\nWarning! - Udma_eventRegister ch0 = %d", retVal);

    retVal = Udma_chEnable(ch0Handle);
    if (UDMA_SOK != retVal)
        DebugMsg("\nWarning! - Udma_chEnable ch0 = %d", retVal);

    return;
}

STATUS dma_init(VOID* destBuf, VOID* srcBuf, uint32_t destSize, uint32_t srcSize, uint32_t chunkSize, uint32_t entrySize)
{
    int32_t retVal = UDMA_SOK;
    uint8_t* trpdMem;
    uint64_t trpdMemPhy;

    srcBuf = (VOID*)TCM_ADDR_TO_PHY((uint32_t)srcBuf);

    ch0Handle = gConfigUdma0BlkCopyChHandle[0];  /* Has to be done after driver open */

    App_udmaTrpdInit(ch0Handle, 0, &gUdmaTestTrpdMem[0], destBuf, srcBuf, destSize, srcSize, chunkSize, entrySize);

    App_udmaTriggerInit();

    triggerMask = ((uint32_t)1U << (CSL_UDMAP_TR_FLAGS_TRIGGER_GLOBAL0 - 1U));
    ch0SwTriggerReg = (volatile uint32_t*)Udma_chGetSwTriggerRegister(ch0Handle);

    trpdMem = &gUdmaTestTrpdMem[0];
    trpdMemPhy = (uint64_t)Udma_defaultVirtToPhyFxn(trpdMem, 0U, NULL);
    retVal = Udma_ringQueueRaw(Udma_chGetFqRingHandle(ch0Handle), trpdMemPhy);
    if (UDMA_SOK != retVal)
        DebugMsg("\nWarning! - Udma_ringQueueRaw ch0 = %d", retVal);

    return ERROR_SUCCESS;
}

VOID dma_enqueue(VOID)
{
    // Trigger DMA
    bDMADone = FALSE;
    CSL_REG32_WR(ch0SwTriggerReg, triggerMask);
    CSL_REG32_WR(ch0SwTriggerReg, 0U);
    uiEnqueues++;
}

BOOL dma_done(VOID)
{
    return bDMADone;
}

STATUS dma_end(VOID)
{
    int32_t retVal;
    uint64_t pDesc;

    retVal = Udma_chDisable(ch0Handle, UDMA_DEFAULT_CH_DISABLE_TIMEOUT);
    if (UDMA_SOK != retVal)
        DebugMsg("\nWarning! - Udma_chDisable ch0 = %d", retVal);

    retVal = Udma_ringFlushRaw(Udma_chGetCqRingHandle(ch0Handle), &pDesc);
    if (UDMA_SOK != retVal)
        DebugMsg("\nWarning! - Udma_ringDequeueRaw ch0 = %d", retVal);

    retVal = Udma_eventUnRegister(gCh0TrEventHandle);
    if (UDMA_SOK != retVal)
        DebugMsg("\nWarning! - Udma_eventUnRegister ch0 = %d", retVal);

    return ERROR_SUCCESS;
}

The calling order will be:

dma_init(xxxxx);
while (not_done) {
    dma_enqueue();
    if (dma_done())
        xxxxxxx; // dequeue src AND enqueue dest
}
dma_end();

Any help will be appreciated.

Thanks.

  • The query has been assigned to the expert. Please expect a response within a day or two.

    Thanks!

  • Hello Carlos Lega,

    You need to transfer data from TCM memory to MSRAM or DDR.

    So, here the Source address should be defined in the TCM memory.

    The TCM memory address is supposed to use the global address of TCM memory rather than R5F local view TCM memory for DMA operations.

    I am not sure how you defined Source memory in the Linker cmd file.

    The below example guides you  how to define the source memory in TCM memory. If you use the new MCU+SDk version, the user can't update the linker cmd file and instead of this, the user has to update the syscfg to define the source address in TCM.

    Next, the icnt3 and dicnt3 values should be set to 1, not to the maximum value.

    Please look at the chapter below for more details about icnt and dicnt values.

    /cfs-file/__key/communityserver-discussions-components-files/791/Testcase_2D00_5-_2800_1_2900_.zip

    The FAQ below helps you if you want to trigger DMA based on the GPIO.

    https://e2e.ti.com/support/processors-group/processors/f/processors-forum/1378150/faq-how-to-trigger-dma-with-the-help-of-gpio-on-am64x-am243-and-am62x-devices

    Regards,

    Anil.

  • Hello Anil, thank you for your response...

    My problem is not the transfer itself, but the sequence to transfer it all. Let me explain some more details about it:

    1- Double checked and confirmed the addresses are as follows: srcBuf = 0x41012280 (which gets modified by TCM_ADDR_TO_PHY macro to be 0x78102280), destBuf = 0x70100080

    2- I already saw and studied the algorithm in your first image (TRM 11.1.3.3.2.1). According to that, on every cycle there's a test for TR_TRIG0_TYPE (also 1) to be asserted to continue. That makes me think that if it's not asserted, then the algorithm pauses until it's asserted again. Right? (BTW, I think that algorithm code has the last visible line wrong, and also missing a couple more lines.)

    3- The total amount of data to be transferred is not known, but certainly bigger than: (icnt0*icnt1*icnt2) as well as bigger than: (dicnt0*dicnt1*dicnt2) - assuming icnt3=1, but as it's not known and I need it to run past that, the logical value for icnt3 would be the biggest.

    The problem I'm having is that once I setup DMA transfer details and trigger it, it doesn't stop (or pause). It does call back the CB function though, but doesn't pause... so it transfers "empty data space".


    Please see the below pseudo code and comments on how I would expect it to behave:
    NOTE: For simplicity I define srcBuffer and destBuffer as arrays with single position with chunkSize size.

  • void* srcBuf = 0x41012280;     // TCM
    void* destBuf = 0x70100080; // MSRAM
    uint32_t entrySize = 32;     // Bytes
    uint32_t chunkSize = 28160; // Bytes
    uint32_t srcSize = 56320;     // Bytes
    uint32_t destSize = 506880; // Bytes

    typedef struct _CHUNK_SIZE_T
    {
        uint8_t chunkData[28160];
    } CHUNK_SIZE_T;

    CHUNK_SIZE_T srcBuffer[2] __attribute((location(0x41012280))) = {0};
    CHUNK_SIZE_T destBuffer[3] __attribute((location(0x70100080))) = {0};

    dma_init(destBuf, srcBuf, destSize, srcSize, chunkSize, entrySize);

    while(not_done) // manual stop event
    {
        while(ready_to_dma_chunk_size)    // Manual trigger event
            dma_enqueue();    // Please see comments below on the expected data flow
    }
    dma_end();


     
    Please look to array indexes!, and keep in mind the "array" sizes might change (depends on other factors, not fixed)
    Expected flow would be like:

    not_done DEASSERTED
    .
    .some time
    .
    ready_to_dma_chunk_size ASSERTED
                                        srcBuffer[0] -> destBuffer[0]        
    .
    .some time
    .
    ready_to_dma_chunk_size ASSERTED
                                        srcBuffer[1] -> destBuffer[1]
    .
    .some time
    .
    ready_to_dma_chunk_size ASSERTED
                                        srcBuffer[0] -> destBuffer[2]
    .
    .some time
    .
    ready_to_dma_chunk_size ASSERTED
                                        srcBuffer[1] -> destBuffer[0]
    .
    .some time
    .
    ready_to_dma_chunk_size ASSERTED
                                        srcBuffer[0] -> destBuffer[1]
    .
    .some time
    .
    ready_to_dma_chunk_size ASSERTED
                                        srcBuffer[1] -> destBuffer[2]
    .
    .some time
    .
    ready_to_dma_chunk_size ASSERTED
                                        srcBuffer[0] -> destBuffer[0]
    .
    .some time
    .
    ready_to_dma_chunk_size ASSERTED
                                        srcBuffer[1] -> destBuffer[1]
    .
    .some time
    .
    not_done ASSERTED

    P.S. Had problem embedding this as code. DKW

  • So, here the Source address should be defined in the TCM memory.

    The TCM memory address is supposed to use the global address of TCM memory rather than R5F local view TCM memory for DMA operations.

    Hello Carlos Lega,

    Can you please confirm if you are using the TCM address space as 0x4100xxxx, then try the global address space rather than the R5F TCM memory view ?

    If you look at the above attached example, you can get an idea since we are using TCM global address memory for DMA applications.

    Since, DMA does not know the R5F memory view.

    The total amount of data to be transferred is not known, but certainly bigger than: (icnt0*icnt1*icnt2) as well bigger than: (dicnt0*dicnt1*dicnt3) - assuming icnt3=1, but as not known and need it to run past than that, the logical value for icnt3 would be the biggest.

    Actually, you need to transfer data from TCMB to destination memory. In this case, you can transfer a maximum of 64KB in a single transfer, and no more than that.

    Regards,

    Anil.

  • Hello Anil, thanks again for your fast answer...

    Can you please confirm if you are using the TCM address space as 0x4100xxxx, then try the global address space rather than the R5F TCM memory view ?

    ...I'm sure I'm using the global address for TCM, that's what this line does:

    srcBuf = (VOID*)TCM_ADDR_TO_PHY((uint32_t)srcBuf);

    Actually, you need to transfer data from TCMB to destination memory. In this case, you can transfer maximum 64KB and not more than you can't transfer.

    I understand that on a single transfer, but my intention is to transfer around 32KB at a time many times.

    Also each "chunk" of ~32KB needs to be transferred when certain event has happened.

    My precise question/problem is:

    Given this TR configuration:

        pTr = UdmaUtils_getTrpdTr15Pointer(trpdMem, 0U);
        pTr->flags = CSL_FMK(UDMAP_TR_FLAGS_TYPE, CSL_UDMAP_TR_FLAGS_TYPE_4D_BLOCK_MOVE_REPACKING_INDIRECTION);
        pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_STATIC, 0U);
        pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_EOL, CSL_UDMAP_TR_FLAGS_EOL_ICNT0_ICNT1);
        pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_EVENT_SIZE, CSL_UDMAP_TR_FLAGS_EVENT_SIZE_ICNT2_DEC);
        pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_TRIGGER0, CSL_UDMAP_TR_FLAGS_TRIGGER_GLOBAL0);
        pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_TRIGGER0_TYPE, CSL_UDMAP_TR_FLAGS_TRIGGER_TYPE_ICNT2_DEC);
        pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_TRIGGER1, CSL_UDMAP_TR_FLAGS_TRIGGER_NONE);
        pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_TRIGGER1_TYPE, CSL_UDMAP_TR_FLAGS_TRIGGER_TYPE_ALL);
        pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_CMD_ID, 0x25U);  /* This will come back in TR response */
        pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_SA_INDIRECT, 0U);
        pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_DA_INDIRECT, 0U);
        pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_EOP, 1U);
    

    Should the DMA algorithm pause the transfer when already transferred icont0*icnt1 bytes?

  • Hello ,

    Yes, now I have got your problem.

    You can tell me what is the time interval you need to trigger DMA.

    Actually, the DMA only is triggered one time when you call the udma_queraw API .

    After completion of DMA, then again DMA will not start until you call the udma_queraw API.

    So, the idea is that if you want to trigger DMA an unbounded number of times, then you need to call the udma_queraw API for every DMA start.

    The attached example is helpful for you. Here, my use case is that to transfer data from DDR to TCM for  4KB of data for different R5F cores.

    I have created a timer interrupt for every 1msec and on the 1msec interrupt I triggered the DMA start. You can look at the A53 example , you will get an idea .

    Coming to the TR, you can simply configure icnt0 = 32KB and icnt1, icnt2 and icnt3 = 1, and similarly configure the dicnt values in your application.

    Please let me know if you need any help.

    DMA_Broadcast_Periodic.zip

    Regards,

    Anil.

  • Hello Anil, thank you for your last example.

    You can tell me what is the time interval you need to trigger DMA.

    My specific use case is:

    RF50_0 is receiving data (20~40B packets) at ~1us (or faster if able to manage to flush it out - that's why I need it the fastest!).

    That data gets pushed into a TCM queue with a max size of less than 32KB. It's located in TCM to push as fast as possible (it has some calculations in the middle, so no DMA is possible here as of yet).

    Then transfer/sync with another bigger queue (~512KB) in DDR/MSRAM using DMA so A53 can access/use it.

    The problem is that the R5 needs to poll data at less than 1us, so any operation (including DMA queue/trigger) must be really fast. That's why I'm trying to trigger it only with the SW trigger.

    But then, need to also be able to stop it without completing the DMA enqueue TR data for many reasons.

    So my next question is, can I set icnt3 to infinite ((uint16_t)-1) and reliably/correctly stop the DMA transfer and be able to reuse the DMA channel without requiring to reboot the system? if so, how?

    Once again, thanks for your help.

    Regards, Carlos.

  • Hello Carlos,

    Let's assume that the R5F core fills 32 bytes for every 1usec in TCM . So, to fill 32KB bytes TCM then R5F takes almost ~1024usec.

    I expect the DMA will be able to transfer 32KB well within 1024usec, so that you can initiate the next transfer.

    But make sure that for every transaction you need to check if the previous transaction is completed or not.

    Otherwise, there might be a problem with TCM buffer corruption, since the R5F fills the TCM memory every 1usec.

    And, if you configure icnt3 = 65535, then once you start the DMA, the DMA completion event fires only after 65535*32*1024 bytes have been transferred.

    You don't need to configure icnt3 infinite  times in your application. If previous DMA transaction is not completed then do DMA channel teardown process.

    Then transfer/sync with another bigger queue (~512KB) in DDR/MSRAM using DMA so A53 can access/use it.

    You wanted to transfer DDR/MSRAM data to A53 core based on the TCM transactions ?

    I am not sure did you see above example or not ?

    Regards,

    Anil.

  • Hello Anil,

    I hope DMA will be able to transfer DMA in 1024usec for 32KB. So, that you can, initiate the next transfer.

    Yes it does transfer faster than R5F is filling the queue.

    Otherwise, there might be a problem with TCM buffer corruption. Since, for every 1use R5F filles the TCM memory

    Correct. That's why I'm using a ping-pong buffer (double DMA transfer before getting that queue full).

    But make sure that for every transaction you need to check if the previous transaction is completed or not.

    That's part of the problem. To check and re-trigger a new transfer takes more time than the R5F has, and that disrupts the R5F loop timing.

    And, if you configure icnt3 = 65535, then once start the DMA, then you DMA completion event is completed after the 65535*32*1024 bytes transfer.

    You don't need to configure icnt3 infinite  times in your application. If previous DMA transaction is not completed then do DMA channel teardown process.

    The problem is that the total data to be transferred might change. Initially, the user will trigger a (maybe) known amount of data but also might be able to stop the data stream. So the general flow is: the user starts a data flow, consisting of packets of ~20-40B every 1us and that data flow can be as long as the user wanted initially, OR can be stopped (by the user) at any time.

    This means: the R5F needs to be free to fully receive every packet on time, do some DSP process to it (it takes around 900ns to do this), and then enqueue in TCM. Here is when R5F doesn't have much time to deal with DMA check/re-trigger/configure.

    From TCM those packets need to be moved to a bigger queue in another location (DDR/MSRAM) to then be processed by A53. That's where DMA is really useful, but I also would like this DMA conf/triggering to be done by R5F instead of A53.

    Please confirm if there's a way to pause/stop/reconfigure DMA before it has finished the transfer and how to do it.

    Thanks. Carlos.