Hi team,
Here's an issue from a customer that may need your help:
/*
 * Fast path for 3-row kernels with unit vertical dilation on uint8_t input,
 * taken only when height / strideHeight is at least 8.
 *
 * Input, weights and bias are copied by UDMA into consecutive regions that
 * start at gPtrL2RAM (printed below; 0x00800000 per the original note), the
 * reference 3x3 kernel is run on those copies, and the accumulator output is
 * copied back to accPtr by UDMA.
 *
 * NOTE(review): each copy is issued and its destination is used right after
 * the call returns. If appUdmaCopy1D/appUdmaCopy2D block until the transfer
 * has finished (confirm against the app_udma API), DMA and compute never
 * overlap, and only the kernel itself is inside the get_tsc() window.
 */
if ((coeffsHeight == 3) && (dilationHeight == 1) &&
    (typeid(Tin) == typeid(uint8_t)) && (height / strideHeight >= 8))
{
    printf("Sy-gPtrL2RAM:%p--------------------------------20220303_1\n", gPtrL2RAM);

    /* Running byte offset of the next free location behind gPtrL2RAM. */
    uint32_t l2Offset = 0;

    /* ---- Input: numInChannels rows of inChPitch elements ---- */
    const size_t inRowBytes   = inChPitch * sizeof(Tin);
    const size_t inTotalBytes = numInChannels * inChPitch * sizeof(Tin);

    Tin *l2Input = (Tin *)((uint32_t)gPtrL2RAM + l2Offset);
    l2Offset += (uint32_t)(inTotalBytes);

    /* Write back both ends before the DMA touches them. */
    appMemCacheWb(pInChannel, inTotalBytes);
    appMemCacheWb(l2Input, inTotalBytes);

    /* Source and destination pitch equal the row width, so the rows are
     * back to back on both sides. */
    app_udma_copy_2d_prms_t inCopy;
    appUdmaCopy2DPrms_Init(&inCopy);
    inCopy.width      = inRowBytes;
    inCopy.height     = numInChannels;
    inCopy.dest_pitch = inRowBytes;
    inCopy.src_pitch  = inRowBytes;
    inCopy.dest_addr  = Udma_appVirtToPhyFxn((void *)l2Input, 1, NULL);
    inCopy.src_addr   = Udma_appVirtToPhyFxn((void *)pInChannel, 1, NULL);
    appUdmaCopy2D(NULL, &inCopy, 1);

    /* Drop any stale cached view of the freshly written destination. */
    appMemCacheInv(l2Input, inTotalBytes);

    /* ---- Weights: one row per (output channel, input channel) pair ---- */
    const size_t wRowBytes   = coeffsHeight * coeffsWidth * sizeof(Tw);
    const size_t wTotalBytes = numOutChannels * numInChannels * coeffsHeight * coeffsWidth * sizeof(Tw);

    Tw *l2Weights = (Tw *)((uint32_t)gPtrL2RAM + l2Offset);
    l2Offset += (uint32_t)(wTotalBytes);

    appMemCacheWb(pCoeffs, wTotalBytes);
    appMemCacheWb(l2Weights, wTotalBytes);

    app_udma_copy_2d_prms_t weightCopy;
    appUdmaCopy2DPrms_Init(&weightCopy);
    weightCopy.width      = wRowBytes;
    weightCopy.height     = numOutChannels * numInChannels;
    weightCopy.dest_pitch = wRowBytes;
    weightCopy.src_pitch  = wRowBytes;
    weightCopy.dest_addr  = Udma_appVirtToPhyFxn((void *)l2Weights, 1, NULL);
    weightCopy.src_addr   = Udma_appVirtToPhyFxn((void *)pCoeffs, 1, NULL);
    appUdmaCopy2D(NULL, &weightCopy, 1);

    appMemCacheInv(l2Weights, wTotalBytes);

    /* ---- Bias: one element per output channel, plain 1D copy ---- */
    const size_t biasBytes = numOutChannels * sizeof(Tb);

    Tb *l2Bias = (Tb *)((uint32_t)gPtrL2RAM + l2Offset);
    l2Offset += (uint32_t)(biasBytes);

    appMemCacheWb(pBias, biasBytes);
    appMemCacheWb(l2Bias, biasBytes);

    app_udma_copy_1d_prms_t biasCopy;
    appUdmaCopy1DPrms_Init(&biasCopy);
    biasCopy.dest_addr = Udma_appVirtToPhyFxn((void *)l2Bias, 1, NULL);
    biasCopy.src_addr  = Udma_appVirtToPhyFxn((void *)pBias, 1, NULL);
    biasCopy.length    = biasBytes;
    appUdmaCopy1D(NULL, &biasCopy);

    appMemCacheInv(l2Bias, biasBytes);

    /* ---- Output accumulators: zeroed by the CPU, filled by the kernel ---- */
    const size_t outRowBytes   = outChPitch * sizeof(Tacc);
    const size_t outTotalBytes = numOutChannels * outChPitch * sizeof(Tacc);

    Tacc *l2Output = (Tacc *)((uint32_t)gPtrL2RAM + l2Offset);
    memset(l2Output, 0, outTotalBytes);
    l2Offset += (uint32_t)(outTotalBytes);

    /* Only the kernel call sits between the two timestamp reads. */
    ullCyclesStart = get_tsc();
    TIDL_refConv2dKernel_i8u_c8s_o32s_3x3s1d1(
        l2Input, l2Weights, l2Bias, l2Output,
        &min, &max,
        numTotRoi, numGroups,
        numInChannels, numOutChannels,
        inChPitch, outChPitch,
        width, height,
        inImPitch, outImPitch,
        coeffsWidth, coeffsHeight,
        dilationWidth, dilationHeight,
        strideWidth, strideHeight,
        params->enableBias);
    ullCyclesEnd = get_tsc();
    printf("Sy-Conv cost cycles %llu\n", (ullCyclesEnd - ullCyclesStart));

    /* ---- Copy the accumulators back to accPtr ---- */
    appMemCacheWb(l2Output, outTotalBytes);
    appMemCacheWb(accPtr, outTotalBytes);

    app_udma_copy_2d_prms_t outCopy;
    appUdmaCopy2DPrms_Init(&outCopy);
    outCopy.width      = outRowBytes;
    outCopy.height     = numOutChannels;
    outCopy.dest_pitch = outRowBytes;
    outCopy.src_pitch  = outRowBytes;
    outCopy.dest_addr  = Udma_appVirtToPhyFxn((void *)accPtr, 1, NULL);
    outCopy.src_addr   = Udma_appVirtToPhyFxn((void *)l2Output, 1, NULL);
    appUdmaCopy2D(NULL, &outCopy, 1);

    appMemCacheInv(accPtr, outTotalBytes);
}
This is the code the customer uses on the C66 to move the data with UDMA and then call the function. gPtrL2RAM is the base address of L2 (0x00800000).
The data transfer results are now correct, but there is no performance improvement. Are they using UDMA correctly?
Could you help check this case? Thanks.
Best Regards,
Cherry