Part Number: PROCESSOR-SDK-AM64X
Hi, everyone!
I am wondering why a UDMA transfer is not faster than a memcpy transfer.
I use the example "udma_memcpy_interrupt_am64x-evm_a53ss0-0_freertos_gcc-aarch64" in mcu_plus_sdk_am64x_08_02_00_31.
I modified udma_memcpy_interrupt.c, adding a memcpy test and time-measurement code. (attached file)
/*
* Copyright (C) 2021 Texas Instruments Incorporated
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
*
* Neither the name of Texas Instruments Incorporated nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <string.h>
#include <kernel/dpl/CacheP.h>
#include <kernel/dpl/DebugP.h>
#include <kernel/dpl/SemaphoreP.h>
#include <kernel/dpl/CycleCounterP.h>
#include "ti_drivers_config.h"
#include "ti_drivers_open_close.h"
#include "ti_board_open_close.h"
/*
* This example performs UDMA block copy transfer using Type 15 Transfer Record (TR15)
* using Transfer Record Packet Descriptor (TRPD) in interrupt mode.
*
* The application opens and configures a BCDMA channel using SysConfig.
* It also configures the interrupt mode of operation through the SysConfig
* which ensures that all required interrupt configuration are done.
* The callback function App_udmaEventCb is registered via SysConfig.
*
* Then the application prepares a TRPD for a 1D transfer from source to
* destination buffer, submits the request to DMA, waits for the DMA to complete
* by waiting on a semaphore which is posted in the callback function.
*
* Once the transfer completes, it does cache operation for data coherency
* and compares the source and destination buffers for any data mismatch.
*
*/
/* Number of bytes copied per iteration, by both the memcpy test and the UDMA test */
#define UDMA_TEST_NUM_BYTES (32768U)
/* UDMA TR packet descriptor memory size - with one TR */
#define UDMA_TEST_TRPD_SIZE (UDMA_GET_TRPD_TR15_SIZE(1U))
/* UDMA TRPD memory - aligned to UDMA_CACHELINE_ALIGNMENT; the application
 * performs cache writeback/invalidate operations on this memory */
uint8_t gUdmaTestTrpdMem[UDMA_TEST_TRPD_SIZE] __attribute__((aligned(UDMA_CACHELINE_ALIGNMENT)));
/* Application buffers - source and destination of the UDMA block copy */
uint8_t gUdmaTestSrcBuf[UDMA_ALIGN_SIZE(UDMA_TEST_NUM_BYTES)] __attribute__((aligned(UDMA_CACHELINE_ALIGNMENT)));
uint8_t gUdmaTestDestBuf[UDMA_ALIGN_SIZE(UDMA_TEST_NUM_BYTES)] __attribute__((aligned(UDMA_CACHELINE_ALIGNMENT)));
/* Application buffers - source and destination of the CPU memcpy() reference copy,
 * declared with the same size and alignment as the UDMA buffers */
uint8_t gMcpyTestSrcBuf[UDMA_ALIGN_SIZE(UDMA_TEST_NUM_BYTES)] __attribute__((aligned(UDMA_CACHELINE_ALIGNMENT)));
uint8_t gMcpyTestDestBuf[UDMA_ALIGN_SIZE(UDMA_TEST_NUM_BYTES)] __attribute__((aligned(UDMA_CACHELINE_ALIGNMENT)));
/* Semaphore to indicate transfer completion: posted by App_udmaEventCb and
 * pended on by udma_memcpy_interrupt_main */
static SemaphoreP_Object gUdmaTestDoneSem;
/* UDMA event callback - posts gUdmaTestDoneSem on a DMA completion event */
void App_udmaEventCb(Udma_EventHandle eventHandle, uint32_t eventType, void *appData);
/* Builds a single-TR TR15 packet descriptor in trpdMem that copies length
 * bytes from srcBuf to destBuf, then writes the descriptor back from cache */
static void App_udmaTrpdInit(Udma_ChHandle chHandle,
uint8_t *trpdMem,
const void *destBuf,
const void *srcBuf,
uint32_t length);
/* Fills srcBuf with an incrementing byte pattern and destBuf with 0xA5,
 * then writes both buffers back from cache */
static void App_udmaInitBuf(uint8_t *srcBuf, uint8_t *destBuf, uint32_t length);
/* Invalidates destBuf in cache and asserts that it matches srcBuf byte for byte */
static void App_udmaCompareBuf(uint8_t *srcBuf, uint8_t *destBuf, uint32_t length);
void *udma_memcpy_interrupt_main(void *args)
{
int32_t retVal = UDMA_SOK, status;
Udma_ChHandle chHandle;
uint8_t *srcBuf = &gUdmaTestSrcBuf[0U];
uint8_t *destBuf = &gUdmaTestDestBuf[0U];
uint32_t length = UDMA_TEST_NUM_BYTES;
uint64_t pDesc;
uint32_t trRespStatus;
uint8_t *trpdMem = &gUdmaTestTrpdMem[0U];
uint64_t trpdMemPhy = (uint64_t) Udma_defaultVirtToPhyFxn(trpdMem, 0U, NULL);
/* */
uint32_t cycle;
uint64_t curTime, avgTime = 0, maxTime = 0, minTime = 0;
/* Open drivers to open the UART driver for console */
Drivers_open();
Board_driversOpen();
chHandle = gConfigUdma0BlkCopyChHandle[0]; /* Has to be done after driver open */
DebugP_log("[UDMA] Memcpy %d bytes application started ...\r\n", UDMA_TEST_NUM_BYTES);
status = SemaphoreP_constructBinary(&gUdmaTestDoneSem, 0);
DebugP_assert(SystemP_SUCCESS == status);
/* */
CycleCounterP_reset(); /* enable and reset CPU cycle counter */
avgTime = 0, minTime = 1000, maxTime = 0;
for (cycle = 1; cycle <= 1000; cycle++)
{
App_udmaInitBuf(&gMcpyTestSrcBuf[0U], &gMcpyTestDestBuf[0U], length);
curTime = ClockP_getTimeUsec(); /* get time as measured by timer associated with ClockP module */
memcpy(&gMcpyTestDestBuf[0U], &gMcpyTestSrcBuf[0U], length);
curTime = ClockP_getTimeUsec() - curTime; /* get time and calculate diff, ClockP returns 64b value so there wont be overflow here */
/* Compare data */
App_udmaCompareBuf(srcBuf, destBuf, length);
avgTime += curTime;
maxTime = (maxTime < curTime)? curTime : maxTime;
minTime = (minTime > curTime)? curTime : minTime;
DebugP_log("[DPL] memcpy %d... DONE (Measured avg: %d/ max: %d/ min: %d/ usecs) !\r", cycle
, (uint32_t)(avgTime/cycle), (uint32_t)maxTime, (uint32_t)minTime);
}
DebugP_log("\n");
/* */
/* Channel enable */
retVal = Udma_chEnable(chHandle);
DebugP_assert(UDMA_SOK == retVal);
avgTime = 0, minTime = 1000, maxTime = 0;
for (cycle = 1; cycle <= 1000; cycle++)
{
/* Init buffers and TR packet descriptor */
App_udmaInitBuf(srcBuf, destBuf, length);
App_udmaTrpdInit(chHandle, trpdMem, destBuf, srcBuf, length);
/* Submit TRPD to channel */
retVal = Udma_ringQueueRaw(Udma_chGetFqRingHandle(chHandle), trpdMemPhy);
DebugP_assert(UDMA_SOK == retVal);
/* Wait for return descriptor in completion ring - this marks transfer completion */
curTime = ClockP_getTimeUsec(); /* get time as measured by timer associated with ClockP module */
SemaphoreP_pend(&gUdmaTestDoneSem, SystemP_WAIT_FOREVER);
curTime = ClockP_getTimeUsec() - curTime; /* get time and calculate diff, ClockP returns 64b value so there wont be overflow here */
retVal = Udma_ringDequeueRaw(Udma_chGetCqRingHandle(chHandle), &pDesc);
DebugP_assert(UDMA_SOK == retVal);
/* Check TR response status */
CacheP_inv(trpdMem, UDMA_TEST_TRPD_SIZE, CacheP_TYPE_ALLD);
trRespStatus = UdmaUtils_getTrpdTr15Response(trpdMem, 1U, 0U);
DebugP_assert(CSL_UDMAP_TR_RESPONSE_STATUS_COMPLETE == trRespStatus);
/* Compare data */
App_udmaCompareBuf(srcBuf, destBuf, length);
avgTime += curTime;
maxTime = (maxTime < curTime)? curTime : maxTime;
minTime = (minTime > curTime)? curTime : minTime;
DebugP_log("[DPL] UDMA %d... DONE (Measured avg: %d/ max: %d/ min: %d/ usecs) !\r", cycle
, (uint32_t)(avgTime/cycle), (uint32_t)maxTime, (uint32_t)minTime);
}
DebugP_log("\n");
/* Channel disable */
retVal = Udma_chDisable(chHandle, UDMA_DEFAULT_CH_DISABLE_TIMEOUT);
DebugP_assert(UDMA_SOK == retVal);
SemaphoreP_destruct(&gUdmaTestDoneSem);
DebugP_log("All tests have passed!!\r\n");
Board_driversClose();
Drivers_close();
return NULL;
}
/*
 * UDMA event callback. Signals the waiting task by posting gUdmaTestDoneSem
 * when the reported event is a DMA completion; every other event type is
 * ignored. eventHandle and appData are not used.
 */
void App_udmaEventCb(Udma_EventHandle eventHandle, uint32_t eventType, void *appData)
{
    /* Nothing to do for anything other than a completion event */
    if(eventType != UDMA_EVENT_TYPE_DMA_COMPLETION)
    {
        return;
    }

    SemaphoreP_post(&gUdmaTestDoneSem);
}
/*
 * Builds a one-TR TR15 packet descriptor in trpdMem describing a copy of
 * length bytes from srcBuf to destBuf, routed to the completion ring of
 * chHandle, and writes the descriptor back from the data cache.
 *
 * Every icnt/dicnt beyond the first dimension is 1, so the transfer is a
 * single linear run of length elements.
 * NOTE(review): the widths of the icnt0/dicnt0 fields are not visible in this
 * file - confirm that length fits before increasing the transfer size.
 */
static void App_udmaTrpdInit(Udma_ChHandle chHandle,
                             uint8_t *trpdMem,
                             const void *destBuf,
                             const void *srcBuf,
                             uint32_t length)
{
    CSL_UdmapTR15 *tr;

    /* Make TRPD with TR15 TR type, completing on this channel's CQ ring */
    UdmaUtils_makeTrpdTr15(trpdMem, 1U, Udma_chGetCqRingNum(chHandle));

    /* Locate the first (and only) TR inside the descriptor */
    tr = UdmaUtils_getTrpdTr15Pointer(trpdMem, 0U);

    /* TR flags, composed in one expression */
    tr->flags =
          CSL_FMK(UDMAP_TR_FLAGS_TYPE, CSL_UDMAP_TR_FLAGS_TYPE_4D_BLOCK_MOVE_REPACKING_INDIRECTION)
        | CSL_FMK(UDMAP_TR_FLAGS_STATIC, 0U)
        | CSL_FMK(UDMAP_TR_FLAGS_EOL, CSL_UDMAP_TR_FLAGS_EOL_MATCH_SOL_EOL)
        | CSL_FMK(UDMAP_TR_FLAGS_EVENT_SIZE, CSL_UDMAP_TR_FLAGS_EVENT_SIZE_COMPLETION)
        | CSL_FMK(UDMAP_TR_FLAGS_TRIGGER0, CSL_UDMAP_TR_FLAGS_TRIGGER_NONE)
        | CSL_FMK(UDMAP_TR_FLAGS_TRIGGER0_TYPE, CSL_UDMAP_TR_FLAGS_TRIGGER_TYPE_ALL)
        | CSL_FMK(UDMAP_TR_FLAGS_TRIGGER1, CSL_UDMAP_TR_FLAGS_TRIGGER_NONE)
        | CSL_FMK(UDMAP_TR_FLAGS_TRIGGER1_TYPE, CSL_UDMAP_TR_FLAGS_TRIGGER_TYPE_ALL)
        | CSL_FMK(UDMAP_TR_FLAGS_CMD_ID, 0x25U) /* This will come back in TR response */
        | CSL_FMK(UDMAP_TR_FLAGS_SA_INDIRECT, 0U)
        | CSL_FMK(UDMAP_TR_FLAGS_DA_INDIRECT, 0U)
        | CSL_FMK(UDMAP_TR_FLAGS_EOP, 1U);

    /* Source side: counts, strides, address */
    tr->icnt0 = length;
    tr->icnt1 = 1U;
    tr->icnt2 = 1U;
    tr->icnt3 = 1U;
    tr->dim1  = tr->icnt0;
    tr->dim2  = (tr->icnt0 * tr->icnt1);
    tr->dim3  = (tr->icnt0 * tr->icnt1 * tr->icnt2);
    tr->addr  = (uint64_t) Udma_defaultVirtToPhyFxn(srcBuf, 0U, NULL);

    tr->fmtflags = 0x00000000U; /* Linear addressing, 1 byte per elem */

    /* Destination side: counts, strides, address */
    tr->dicnt0 = length;
    tr->dicnt1 = 1U;
    tr->dicnt2 = 1U;
    tr->dicnt3 = 1U;
    tr->ddim1  = tr->dicnt0;
    tr->ddim2  = (tr->dicnt0 * tr->dicnt1);
    tr->ddim3  = (tr->dicnt0 * tr->dicnt1 * tr->dicnt2);
    tr->daddr  = (uint64_t) Udma_defaultVirtToPhyFxn(destBuf, 0U, NULL);

    /* Write the finished descriptor back from cache */
    CacheP_wb(trpdMem, UDMA_TEST_TRPD_SIZE, CacheP_TYPE_ALLD);
}
/*
 * Prepares a source/destination buffer pair for one copy test.
 *
 * srcBuf  - filled with an incrementing pattern (index truncated to 8 bits)
 * destBuf - filled with 0xA5
 * length  - number of bytes to initialise in each buffer
 *
 * Both buffers are written back from the data cache afterwards.
 */
static void App_udmaInitBuf(uint8_t *srcBuf, uint8_t *destBuf, uint32_t length)
{
    uint32_t idx = 0U;

    while(idx < length)
    {
        srcBuf[idx]  = (uint8_t) idx; /* wraps every 256 bytes */
        destBuf[idx] = 0xA5U;
        idx++;
    }

    /* Writeback source and destination buffer */
    CacheP_wb(srcBuf, length, CacheP_TYPE_ALLD);
    CacheP_wb(destBuf, length, CacheP_TYPE_ALLD);
}
/*
 * Verifies that destBuf holds the same length bytes as srcBuf.
 *
 * destBuf is invalidated in the data cache before it is read. Each
 * mismatching byte logs an error and trips DebugP_assert.
 */
static void App_udmaCompareBuf(uint8_t *srcBuf, uint8_t *destBuf, uint32_t length)
{
    uint32_t idx;

    /* Invalidate destination buffer */
    CacheP_inv(destBuf, length, CacheP_TYPE_ALLD);

    for(idx = 0U; idx < length; idx++)
    {
        if(srcBuf[idx] == destBuf[idx])
        {
            continue; /* byte matches, move on */
        }

        DebugP_logError("Data mismatch !!!\r\n");
        DebugP_assert(FALSE);
    }
}
The results are as follows:
[UDMA] Memcpy 32768 bytes application started ...
[DPL] memcpy 1000... DONE (Measured avg: 10/ max: 13/ min: 9/ usecs) !
[DPL] UDMA 1000... DONE (Measured avg: 71/ max: 75/ min: 69/ usecs) !
All tests have passed!!
On average, memcpy takes 10 usec versus 71 usec for UDMA!
Can anyone explain the reason for this?
Regards,