Hi,
I have created an openvx kernel that copies a vx_tensor to another. Each core is specified as a target for the kernel, the only difference is that the A72 uses a memcpy and the C7X, C66, R5F use UDMA.
Here are the numbers I got for copying a vx_tensor with a size of 864*128*sizeof(float) = 442368 bytes
Core | memcpy   | appUdmaCopy1D
A72  | 670 us   | N/A
C66  | 4889 us  | 2237 us
C7X  | 1616 us  | 2003 us
R5F  | 18097 us | 5794 us
The numbers I'm getting are quite disappointing: it seems that the UDMA is slower than doing a memcpy on the A72. On the C7x, it's even slower than doing a memcpy on the same core.
Are these numbers normal? I haven't found any spec for UDMA.
Here's my (simplified) kernel code:
bool TensorBlockCpyDma(const size_t nBytes, const tivx_obj_desc_tensor_t* const srcDescriptor, const tivx_obj_desc_tensor_t* const dstDescriptor) { bool ret = false; if ( (0U == nBytes) || (NULL == srcDescriptor) || (NULL == dstDescriptor)) { VX_PRINT(VX_ZONE_ERROR, "Invalid input pointer\n"); } else { uint64_t srcPhys = tivxMemShared2PhysPtr(srcDescriptor->mem_ptr.shared_ptr, VX_MEMORY_TYPE_HOST); uint64_t dstPhys = tivxMemShared2PhysPtr(dstDescriptor->mem_ptr.shared_ptr, VX_MEMORY_TYPE_HOST); app_udma_copy_1d_prms_t prms; appUdmaCopy1DPrms_Init(&prms); prms.dest_addr = dstPhys; prms.src_addr = srcPhys; prms.length = nBytes; if (0 == appUdmaCopy1D(NULL, &prms)) { ret = true; } } return ret; } static vx_status VX_CALLBACK tivxTensorcpyProcess( tivx_target_kernel_instance kernel, tivx_obj_desc_t *obj_desc[], uint16_t num_params, void *priv_arg) { vx_status status = (vx_status)VX_SUCCESS; const tivx_obj_desc_tensor_t *src_desc; const tivx_obj_desc_tensor_t *dst_desc; if ( (num_params != TIVX_KERNEL_TENSORCPY_MAX_PARAMS) || (NULL == obj_desc[TIVX_KERNEL_TENSORCPY_SRC_IDX]) || (NULL == obj_desc[TIVX_KERNEL_TENSORCPY_DST_IDX]) ) { status = (vx_status)VX_FAILURE; } if((vx_status)VX_SUCCESS == status) { src_desc = (const tivx_obj_desc_tensor_t *)obj_desc[TIVX_KERNEL_TENSORCPY_SRC_IDX]; dst_desc = (const tivx_obj_desc_tensor_t *)obj_desc[TIVX_KERNEL_TENSORCPY_DST_IDX]; } if((vx_status)VX_SUCCESS == status) { void *src_target_ptr; void *dst_target_ptr; src_target_ptr = tivxMemShared2TargetPtr(&src_desc->mem_ptr); tivxCheckStatus(&status, tivxMemBufferMap(src_target_ptr, src_desc->mem_size, (vx_enum)VX_MEMORY_TYPE_HOST, (vx_enum)VX_READ_ONLY)); dst_target_ptr = tivxMemShared2TargetPtr(&dst_desc->mem_ptr); tivxCheckStatus(&status, tivxMemBufferMap(dst_target_ptr, dst_desc->mem_size, (vx_enum)VX_MEMORY_TYPE_HOST, (vx_enum)VX_WRITE_ONLY)); { /* call kernel processing function */ uint32_t start = tivxPlatformGetTimeInUsecs(); #ifdef A72 memcpy(dst_target_ptr, src_target_ptr, 
src_desc->mem_size); #else if (!TensorBlockCpyDma(src_desc->mem_size, src_desc, dst_desc)) { VX_PRINT(VX_ZONE_ERROR, "TensorBlockCpyDma failed\n"); } #endif uint32_t delta = tivxPlatformGetTimeInUsecs() - start; VX_PRINT(VX_ZONE_WARNING, "TensorBlockCpyDma copied %u bytes in %u us\n", src_desc->mem_size, delta); /* kernel processing function complete */ } tivxCheckStatus(&status, tivxMemBufferUnmap(src_target_ptr, src_desc->mem_size, (vx_enum)VX_MEMORY_TYPE_HOST, (vx_enum)VX_READ_ONLY)); tivxCheckStatus(&status, tivxMemBufferUnmap(dst_target_ptr, dst_desc->mem_size, (vx_enum)VX_MEMORY_TYPE_HOST, (vx_enum)VX_WRITE_ONLY)); } return status; }
The A72 kernel calls memcpy() instead of appUdmaCopy1D().
Thank you,
Fred