Hi,
I have created an OpenVX kernel that copies one vx_tensor to another. Each core is registered as a target for the kernel (see the registration sketch below); the only difference between the implementations is that the A72 uses a memcpy while the C7X, C66, and R5F use UDMA.
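For reference, this is roughly how each core registers the same process callback, just with a different target name (a minimal sketch; the kernel name string is a placeholder, I'm going from memory on the tivxAddTargetKernelByName() argument order, and the create/delete/control callbacks are omitted):

void tivxAddTargetKernelTensorcpy(const char *target_name)
{
    /* target_name is e.g. TIVX_TARGET_A72_0 on the A72, TIVX_TARGET_DSP1
     * on the C66, TIVX_TARGET_DSP_C7_1 on the C7X, TIVX_TARGET_MCU2_0 on
     * the R5F */
    tivx_target_kernel kernel = tivxAddTargetKernelByName(
        "com.example.tensorcpy",   /* placeholder kernel name */
        (char *)target_name,
        tivxTensorcpyProcess,      /* process callback, shown below */
        NULL, NULL,                /* create/delete callbacks omitted */
        NULL, NULL);               /* control callback, private arg */

    if (NULL == kernel)
    {
        VX_PRINT(VX_ZONE_ERROR, "Failed to add tensorcpy target kernel\n");
    }
}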
Here are the numbers I got for copying a vx_tensor with a size of 864*128*sizeof(float) = 442368 bytes:

| Core | memcpy | appUdmaCopy1D |
| ---- | ------ | ------------- |
| A72  | 670 us   | N/A     |
| C66  | 4889 us  | 2237 us |
| C7X  | 1616 us  | 2003 us |
| R5F  | 18097 us | 5794 us |
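For context, those times correspond to the following effective throughput (442368 bytes per transfer):

- A72 memcpy: 442368 B / 670 us ≈ 660 MB/s
- C7X memcpy: 442368 B / 1616 us ≈ 274 MB/s
- C66 appUdmaCopy1D: 442368 B / 2237 us ≈ 198 MB/s
- R5F appUdmaCopy1D: 442368 B / 5794 us ≈ 76 MB/s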
The numbers I'm getting are quite disappointing: UDMA turns out to be slower than a plain memcpy on the A72, and on the C7X it is even slower than the C7X's own memcpy. Are these numbers normal? I haven't found any throughput spec for UDMA.
Here's my (simplified) kernel code:
bool TensorBlockCpyDma(const size_t nBytes,
                       const tivx_obj_desc_tensor_t* const srcDescriptor,
                       const tivx_obj_desc_tensor_t* const dstDescriptor)
{
    bool ret = false;

    if ( (0U == nBytes)
        || (NULL == srcDescriptor)
        || (NULL == dstDescriptor))
    {
        VX_PRINT(VX_ZONE_ERROR, "Invalid size or descriptor\n");
    }
    else
    {
        /* Translate the shared pointers to physical addresses for the DMA */
        uint64_t srcPhys = tivxMemShared2PhysPtr(srcDescriptor->mem_ptr.shared_ptr, VX_MEMORY_TYPE_HOST);
        uint64_t dstPhys = tivxMemShared2PhysPtr(dstDescriptor->mem_ptr.shared_ptr, VX_MEMORY_TYPE_HOST);

        /* Blocking 1D transfer on the default UDMA channel (NULL handle) */
        app_udma_copy_1d_prms_t prms;
        appUdmaCopy1DPrms_Init(&prms);
        prms.dest_addr = dstPhys;
        prms.src_addr  = srcPhys;
        prms.length    = nBytes;
        if (0 == appUdmaCopy1D(NULL, &prms))
        {
            ret = true;
        }
    }
    return ret;
}
static vx_status VX_CALLBACK tivxTensorcpyProcess(
    tivx_target_kernel_instance kernel,
    tivx_obj_desc_t *obj_desc[],
    uint16_t num_params, void *priv_arg)
{
    vx_status status = (vx_status)VX_SUCCESS;
    const tivx_obj_desc_tensor_t *src_desc;
    const tivx_obj_desc_tensor_t *dst_desc;

    if ( (num_params != TIVX_KERNEL_TENSORCPY_MAX_PARAMS)
        || (NULL == obj_desc[TIVX_KERNEL_TENSORCPY_SRC_IDX])
        || (NULL == obj_desc[TIVX_KERNEL_TENSORCPY_DST_IDX]))
    {
        status = (vx_status)VX_FAILURE;
    }

    if ((vx_status)VX_SUCCESS == status)
    {
        src_desc = (const tivx_obj_desc_tensor_t *)obj_desc[TIVX_KERNEL_TENSORCPY_SRC_IDX];
        dst_desc = (const tivx_obj_desc_tensor_t *)obj_desc[TIVX_KERNEL_TENSORCPY_DST_IDX];
    }

    if ((vx_status)VX_SUCCESS == status)
    {
        void *src_target_ptr;
        void *dst_target_ptr;

        /* Map both buffers so the framework handles cache maintenance */
        src_target_ptr = tivxMemShared2TargetPtr(&src_desc->mem_ptr);
        tivxCheckStatus(&status, tivxMemBufferMap(src_target_ptr,
            src_desc->mem_size, (vx_enum)VX_MEMORY_TYPE_HOST,
            (vx_enum)VX_READ_ONLY));

        dst_target_ptr = tivxMemShared2TargetPtr(&dst_desc->mem_ptr);
        tivxCheckStatus(&status, tivxMemBufferMap(dst_target_ptr,
            dst_desc->mem_size, (vx_enum)VX_MEMORY_TYPE_HOST,
            (vx_enum)VX_WRITE_ONLY));

        {
            /* Call the kernel processing function and time it */
            uint32_t start = tivxPlatformGetTimeInUsecs();
#ifdef A72
            memcpy(dst_target_ptr, src_target_ptr, src_desc->mem_size);
#else
            if (!TensorBlockCpyDma(src_desc->mem_size, src_desc, dst_desc))
            {
                VX_PRINT(VX_ZONE_ERROR, "TensorBlockCpyDma failed\n");
            }
#endif
            uint32_t delta = tivxPlatformGetTimeInUsecs() - start;
            VX_PRINT(VX_ZONE_WARNING, "Copied %u bytes in %u us\n",
                src_desc->mem_size, delta);
            /* kernel processing function complete */
        }

        tivxCheckStatus(&status, tivxMemBufferUnmap(src_target_ptr,
            src_desc->mem_size, (vx_enum)VX_MEMORY_TYPE_HOST,
            (vx_enum)VX_READ_ONLY));
        tivxCheckStatus(&status, tivxMemBufferUnmap(dst_target_ptr,
            dst_desc->mem_size, (vx_enum)VX_MEMORY_TYPE_HOST,
            (vx_enum)VX_WRITE_ONLY));
    }
    return status;
}
The A72 kernel calls memcpy() instead of appUdmaCopy1D().
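To check whether per-call channel setup dominates, I'm planning to average over many back-to-back transfers, roughly like this (a minimal sketch reusing the same app_udma calls and timer as above; ProfileUdma1D and ITERATIONS are my own placeholders):

#define ITERATIONS (100U)

static void ProfileUdma1D(const uint64_t srcPhys, const uint64_t dstPhys,
                          const uint32_t nBytes)
{
    app_udma_copy_1d_prms_t prms;
    appUdmaCopy1DPrms_Init(&prms);
    prms.dest_addr = dstPhys;
    prms.src_addr  = srcPhys;
    prms.length    = nBytes;

    uint32_t start = tivxPlatformGetTimeInUsecs();
    for (uint32_t i = 0U; i < ITERATIONS; i++)
    {
        /* NULL handle: default blocking UDMA channel, same as the kernel */
        (void)appUdmaCopy1D(NULL, &prms);
    }
    uint32_t total = tivxPlatformGetTimeInUsecs() - start;

    VX_PRINT(VX_ZONE_WARNING, "avg %u us per %u-byte copy\n",
        total / ITERATIONS, nBytes);
}

If the average stays near the single-shot numbers above, the cost would seem to be in the transfer itself rather than in per-call setup.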
Thank you,
Fred