Hi, everyone!
I am wondering why a UDMA transfer is not faster than a memcpy transfer.
I use the example "udma_memcpy_interrupt_am64x-evm_a53ss0-0_freertos_gcc-aarch64" in mcu_plus_sdk_am64x_08_02_00_31.
I modified udma_memcpy_interrupt.c to add a memcpy transfer and cycle counter / timing code (see the attached file).
/* * Copyright (C) 2021 Texas Instruments Incorporated * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. * * Neither the name of Texas Instruments Incorporated nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <string.h> #include <kernel/dpl/CacheP.h> #include <kernel/dpl/DebugP.h> #include <kernel/dpl/SemaphoreP.h> #include <kernel/dpl/CycleCounterP.h> #include "ti_drivers_config.h" #include "ti_drivers_open_close.h" #include "ti_board_open_close.h" /* * This example performs UDMA block copy transfer using Type 15 Transfer Record (TR15) * using Transfer Record Packet Descriptor (TRPD) in interrupt mode. 
* * The application opens and configures a BCDMA channel using SysConfig. * It also configures the interrupt mode of operation through the SysConfig * which ensures that all required interrupt configuration are done. * The callback function App_udmaEventCb is registered via SysConfig. * * Then the application prepares a TRPD for a 1D transfer from source to * destination buffer, submits the request to DMA, waits for the DMA to complete * by waiting on a semaphore which is posted in the callback function. * * Once the transfer it completes, it does cache operation for data coherency * and compares the source and destination buffers for any data mismatch. * */ /* Number of bytes to do memcpy */ #define UDMA_TEST_NUM_BYTES (32768U) /* UDMA TR packet descriptor memory size - with one TR */ #define UDMA_TEST_TRPD_SIZE (UDMA_GET_TRPD_TR15_SIZE(1U)) /* UDMA TRPD Memory */ uint8_t gUdmaTestTrpdMem[UDMA_TEST_TRPD_SIZE] __attribute__((aligned(UDMA_CACHELINE_ALIGNMENT))); /* Application Buffers */ uint8_t gUdmaTestSrcBuf[UDMA_ALIGN_SIZE(UDMA_TEST_NUM_BYTES)] __attribute__((aligned(UDMA_CACHELINE_ALIGNMENT))); uint8_t gUdmaTestDestBuf[UDMA_ALIGN_SIZE(UDMA_TEST_NUM_BYTES)] __attribute__((aligned(UDMA_CACHELINE_ALIGNMENT))); uint8_t gMcpyTestSrcBuf[UDMA_ALIGN_SIZE(UDMA_TEST_NUM_BYTES)] __attribute__((aligned(UDMA_CACHELINE_ALIGNMENT))); uint8_t gMcpyTestDestBuf[UDMA_ALIGN_SIZE(UDMA_TEST_NUM_BYTES)] __attribute__((aligned(UDMA_CACHELINE_ALIGNMENT))); /* Semaphore to indicate transfer completion */ static SemaphoreP_Object gUdmaTestDoneSem; void App_udmaEventCb(Udma_EventHandle eventHandle, uint32_t eventType, void *appData); static void App_udmaTrpdInit(Udma_ChHandle chHandle, uint8_t *trpdMem, const void *destBuf, const void *srcBuf, uint32_t length); static void App_udmaInitBuf(uint8_t *srcBuf, uint8_t *destBuf, uint32_t length); static void App_udmaCompareBuf(uint8_t *srcBuf, uint8_t *destBuf, uint32_t length); void *udma_memcpy_interrupt_main(void *args) { int32_t retVal = 
UDMA_SOK, status; Udma_ChHandle chHandle; uint8_t *srcBuf = &gUdmaTestSrcBuf[0U]; uint8_t *destBuf = &gUdmaTestDestBuf[0U]; uint32_t length = UDMA_TEST_NUM_BYTES; uint64_t pDesc; uint32_t trRespStatus; uint8_t *trpdMem = &gUdmaTestTrpdMem[0U]; uint64_t trpdMemPhy = (uint64_t) Udma_defaultVirtToPhyFxn(trpdMem, 0U, NULL); /* */ uint32_t cycle; uint64_t curTime, avgTime = 0, maxTime = 0, minTime = 0; /* Open drivers to open the UART driver for console */ Drivers_open(); Board_driversOpen(); chHandle = gConfigUdma0BlkCopyChHandle[0]; /* Has to be done after driver open */ DebugP_log("[UDMA] Memcpy %d bytes application started ...\r\n", UDMA_TEST_NUM_BYTES); status = SemaphoreP_constructBinary(&gUdmaTestDoneSem, 0); DebugP_assert(SystemP_SUCCESS == status); /* */ CycleCounterP_reset(); /* enable and reset CPU cycle counter */ avgTime = 0, minTime = 1000, maxTime = 0; for (cycle = 1; cycle <= 1000; cycle++) { App_udmaInitBuf(&gMcpyTestSrcBuf[0U], &gMcpyTestDestBuf[0U], length); curTime = ClockP_getTimeUsec(); /* get time as measured by timer associated with ClockP module */ memcpy(&gMcpyTestDestBuf[0U], &gMcpyTestSrcBuf[0U], length); curTime = ClockP_getTimeUsec() - curTime; /* get time and calculate diff, ClockP returns 64b value so there wont be overflow here */ /* Compare data */ App_udmaCompareBuf(srcBuf, destBuf, length); avgTime += curTime; maxTime = (maxTime < curTime)? curTime : maxTime; minTime = (minTime > curTime)? curTime : minTime; DebugP_log("[DPL] memcpy %d... 
DONE (Measured avg: %d/ max: %d/ min: %d/ usecs) !\r", cycle , (uint32_t)(avgTime/cycle), (uint32_t)maxTime, (uint32_t)minTime); } DebugP_log("\n"); /* */ /* Channel enable */ retVal = Udma_chEnable(chHandle); DebugP_assert(UDMA_SOK == retVal); avgTime = 0, minTime = 1000, maxTime = 0; for (cycle = 1; cycle <= 1000; cycle++) { /* Init buffers and TR packet descriptor */ App_udmaInitBuf(srcBuf, destBuf, length); App_udmaTrpdInit(chHandle, trpdMem, destBuf, srcBuf, length); /* Submit TRPD to channel */ retVal = Udma_ringQueueRaw(Udma_chGetFqRingHandle(chHandle), trpdMemPhy); DebugP_assert(UDMA_SOK == retVal); /* Wait for return descriptor in completion ring - this marks transfer completion */ curTime = ClockP_getTimeUsec(); /* get time as measured by timer associated with ClockP module */ SemaphoreP_pend(&gUdmaTestDoneSem, SystemP_WAIT_FOREVER); curTime = ClockP_getTimeUsec() - curTime; /* get time and calculate diff, ClockP returns 64b value so there wont be overflow here */ retVal = Udma_ringDequeueRaw(Udma_chGetCqRingHandle(chHandle), &pDesc); DebugP_assert(UDMA_SOK == retVal); /* Check TR response status */ CacheP_inv(trpdMem, UDMA_TEST_TRPD_SIZE, CacheP_TYPE_ALLD); trRespStatus = UdmaUtils_getTrpdTr15Response(trpdMem, 1U, 0U); DebugP_assert(CSL_UDMAP_TR_RESPONSE_STATUS_COMPLETE == trRespStatus); /* Compare data */ App_udmaCompareBuf(srcBuf, destBuf, length); avgTime += curTime; maxTime = (maxTime < curTime)? curTime : maxTime; minTime = (minTime > curTime)? curTime : minTime; DebugP_log("[DPL] UDMA %d... 
DONE (Measured avg: %d/ max: %d/ min: %d/ usecs) !\r", cycle , (uint32_t)(avgTime/cycle), (uint32_t)maxTime, (uint32_t)minTime); } DebugP_log("\n"); /* Channel disable */ retVal = Udma_chDisable(chHandle, UDMA_DEFAULT_CH_DISABLE_TIMEOUT); DebugP_assert(UDMA_SOK == retVal); SemaphoreP_destruct(&gUdmaTestDoneSem); DebugP_log("All tests have passed!!\r\n"); Board_driversClose(); Drivers_close(); return NULL; } void App_udmaEventCb(Udma_EventHandle eventHandle, uint32_t eventType, void *appData) { if(UDMA_EVENT_TYPE_DMA_COMPLETION == eventType) { SemaphoreP_post(&gUdmaTestDoneSem); } } static void App_udmaTrpdInit(Udma_ChHandle chHandle, uint8_t *trpdMem, const void *destBuf, const void *srcBuf, uint32_t length) { CSL_UdmapTR15 *pTr; uint32_t cqRingNum = Udma_chGetCqRingNum(chHandle); /* Make TRPD with TR15 TR type */ UdmaUtils_makeTrpdTr15(trpdMem, 1U, cqRingNum); /* Setup TR */ pTr = UdmaUtils_getTrpdTr15Pointer(trpdMem, 0U); pTr->flags = CSL_FMK(UDMAP_TR_FLAGS_TYPE, CSL_UDMAP_TR_FLAGS_TYPE_4D_BLOCK_MOVE_REPACKING_INDIRECTION); pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_STATIC, 0U); pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_EOL, CSL_UDMAP_TR_FLAGS_EOL_MATCH_SOL_EOL); pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_EVENT_SIZE, CSL_UDMAP_TR_FLAGS_EVENT_SIZE_COMPLETION); pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_TRIGGER0, CSL_UDMAP_TR_FLAGS_TRIGGER_NONE); pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_TRIGGER0_TYPE, CSL_UDMAP_TR_FLAGS_TRIGGER_TYPE_ALL); pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_TRIGGER1, CSL_UDMAP_TR_FLAGS_TRIGGER_NONE); pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_TRIGGER1_TYPE, CSL_UDMAP_TR_FLAGS_TRIGGER_TYPE_ALL); pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_CMD_ID, 0x25U); /* This will come back in TR response */ pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_SA_INDIRECT, 0U); pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_DA_INDIRECT, 0U); pTr->flags |= CSL_FMK(UDMAP_TR_FLAGS_EOP, 1U); pTr->icnt0 = length; pTr->icnt1 = 1U; pTr->icnt2 = 1U; pTr->icnt3 = 1U; pTr->dim1 = pTr->icnt0; pTr->dim2 = (pTr->icnt0 * pTr->icnt1); 
pTr->dim3 = (pTr->icnt0 * pTr->icnt1 * pTr->icnt2); pTr->addr = (uint64_t) Udma_defaultVirtToPhyFxn(srcBuf, 0U, NULL); pTr->fmtflags = 0x00000000U; /* Linear addressing, 1 byte per elem */ pTr->dicnt0 = length; pTr->dicnt1 = 1U; pTr->dicnt2 = 1U; pTr->dicnt3 = 1U; pTr->ddim1 = pTr->dicnt0; pTr->ddim2 = (pTr->dicnt0 * pTr->dicnt1); pTr->ddim3 = (pTr->dicnt0 * pTr->dicnt1 * pTr->dicnt2); pTr->daddr = (uint64_t) Udma_defaultVirtToPhyFxn(destBuf, 0U, NULL); /* Perform cache writeback */ CacheP_wb(trpdMem, UDMA_TEST_TRPD_SIZE, CacheP_TYPE_ALLD); return; } static void App_udmaInitBuf(uint8_t *srcBuf, uint8_t *destBuf, uint32_t length) { uint32_t i; for(i = 0U; i < length; i++) { srcBuf[i] = i; destBuf[i] = 0xA5U; } /* Writeback source and destination buffer */ CacheP_wb(srcBuf, length, CacheP_TYPE_ALLD); CacheP_wb(destBuf, length, CacheP_TYPE_ALLD); return; } static void App_udmaCompareBuf(uint8_t *srcBuf, uint8_t *destBuf, uint32_t length) { uint32_t i; /* Invalidate destination buffer */ CacheP_inv(destBuf, length, CacheP_TYPE_ALLD); for(i = 0U; i < length; i++) { if(srcBuf[i] != destBuf[i]) { DebugP_logError("Data mismatch !!!\r\n"); DebugP_assert(FALSE); } } return; }
The results are as follows:
[UDMA] Memcpy 32768 bytes application started ...
[DPL] memcpy 1000... DONE (Measured avg: 10/ max: 13/ min: 9/ usecs) !
[DPL] UDMA 1000... DONE (Measured avg: 71/ max: 75/ min: 69/ usecs) !
All tests have passed!!
On average, memcpy takes 10 usec whereas UDMA takes 71 usec!
Can anyone explain the reason for this?
Regards,