This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

TDA4VM: C66

Part Number: TDA4VM

Hi team,

Here's an issue from a customer that may need your help:

// Special-case path: taken when the kernel has 3 rows, vertical dilation is 1,
// the input element type is uint8_t and height/strideHeight is at least 8.
// The block copies input, weights and bias into consecutive regions starting at
// gPtrL2RAM using appUdmaCopy2D/appUdmaCopy1D, zeroes an output region there,
// runs TIDL_refConv2dKernel_i8u_c8s_o32s_3x3s1d1 on those copies, and finally
// copies the output region to accPtr.
// NOTE(review): every transfer is issued before the kernel call (or, for the
// output, after it); nothing in this block overlaps a transfer with compute.
// Presumably the appUdmaCopy* calls block until the copy completes — confirm.
// NOTE(review): gPtrL2RAM is printed as 0x00800000; whether
// Udma_appVirtToPhyFxn converts that to the address the UDMA engine must use
// is not visible here — confirm.
if ((coeffsHeight == 3) && (dilationHeight== 1) && (typeid(Tin) == typeid(uint8_t)) && height/strideHeight >= 8)
    {
      printf("Sy-gPtrL2RAM:%p--------------------------------20220303_1\n", gPtrL2RAM); //gPtrL2RAM:0x00800000
      // Running byte offset into the gPtrL2RAM region. It only ever grows and is
      // not checked against any capacity within this block.
      uint32_t uiMemUsedSize = 0;

      //Input
      // Staging buffer for the input: numInChannels rows of inChPitch elements.
      // NOTE(review): the pointer is cast through uint32_t, which assumes
      // pointers fit in 32 bits on this target — confirm.
      Tin  *pInData = (Tin *)(uiMemUsedSize + (uint32_t)gPtrL2RAM);   
      uiMemUsedSize += (uint32_t)(numInChannels * inChPitch * sizeof(Tin));

      // Cache write-back on both source and destination before the transfer.
      appMemCacheWb(pInChannel, numInChannels * inChPitch * sizeof(Tin));
      appMemCacheWb(pInData, numInChannels * inChPitch * sizeof(Tin));
      
      // 2D copy pInChannel -> pInData: one row per input channel, with the
      // row width equal to both pitches (rows are contiguous on both sides).
      app_udma_copy_2d_prms_t prms_2d_pInBuf;
      appUdmaCopy2DPrms_Init(&prms_2d_pInBuf);
      prms_2d_pInBuf.width        = inChPitch * sizeof(Tin);
      prms_2d_pInBuf.height       = numInChannels;
      prms_2d_pInBuf.dest_pitch   = inChPitch * sizeof(Tin);
      prms_2d_pInBuf.src_pitch    = inChPitch * sizeof(Tin);
      prms_2d_pInBuf.dest_addr    = Udma_appVirtToPhyFxn((void *)pInData, 1, NULL);  //Udma_appVirtToPhyFxn
      prms_2d_pInBuf.src_addr     = Udma_appVirtToPhyFxn((void *)pInChannel, 1, NULL);  //appMemGetVirt2PhyBufPtr 
      //prms_2d_pInBuf.dest_addr    = (uint64_t)pInData;
      //prms_2d_pInBuf.src_addr     = (uint64_t)pInChannel;  
      appUdmaCopy2D(NULL, &prms_2d_pInBuf, 1);

      // Invalidate the destination range after the copy, before it is read.
      appMemCacheInv(pInData, numInChannels * inChPitch * sizeof(Tin));

      //Weight
      // Staging buffer for the coefficients, placed directly after the input.
      Tw   *pWeightData = (Tw *)(uiMemUsedSize + (uint32_t)gPtrL2RAM);
      uiMemUsedSize += (uint32_t)(numOutChannels * numInChannels * coeffsHeight * coeffsWidth * sizeof(Tw));

      appMemCacheWb(pCoeffs, numOutChannels * numInChannels * coeffsHeight * coeffsWidth * sizeof(Tw));
      appMemCacheWb(pWeightData, numOutChannels * numInChannels * coeffsHeight * coeffsWidth * sizeof(Tw));

      // 2D copy pCoeffs -> pWeightData: one row per (output, input) channel
      // pair, each row holding coeffsHeight * coeffsWidth elements.
      app_udma_copy_2d_prms_t prms_2d_pWeightBuf;
      appUdmaCopy2DPrms_Init(&prms_2d_pWeightBuf);
      prms_2d_pWeightBuf.width        = coeffsHeight * coeffsWidth * sizeof(Tw);
      prms_2d_pWeightBuf.height       = numOutChannels * numInChannels;
      prms_2d_pWeightBuf.dest_pitch   = coeffsHeight * coeffsWidth * sizeof(Tw);
      prms_2d_pWeightBuf.src_pitch    = coeffsHeight * coeffsWidth * sizeof(Tw);  
      prms_2d_pWeightBuf.dest_addr    = Udma_appVirtToPhyFxn((void *)pWeightData, 1, NULL); 
      prms_2d_pWeightBuf.src_addr     = Udma_appVirtToPhyFxn((void *)pCoeffs, 1, NULL);  
      //prms_2d_pWeightBuf.dest_addr    = (uint64_t)pWeightData; 
      //prms_2d_pWeightBuf.src_addr     = (uint64_t)pCoeffs;
      appUdmaCopy2D(NULL, &prms_2d_pWeightBuf, 1);   

      appMemCacheInv(pWeightData, numOutChannels * numInChannels * coeffsHeight * coeffsWidth * sizeof(Tw));

      //Bias
      // Staging buffer for the bias, one Tb element per output channel.
      Tb   *pBiasData = (Tb *)(uiMemUsedSize + (uint32_t)gPtrL2RAM);
      uiMemUsedSize += (uint32_t)(numOutChannels * sizeof(Tb));

      appMemCacheWb(pBias, numOutChannels * sizeof(Tb));
      appMemCacheWb(pBiasData, numOutChannels * sizeof(Tb));

      // 1D copy pBias -> pBiasData of numOutChannels * sizeof(Tb) bytes.
      app_udma_copy_1d_prms_t prms_1d_pBaisBuf;
      appUdmaCopy1DPrms_Init(&prms_1d_pBaisBuf);
      prms_1d_pBaisBuf.dest_addr    = Udma_appVirtToPhyFxn((void *)pBiasData, 1, NULL); 
      prms_1d_pBaisBuf.src_addr     = Udma_appVirtToPhyFxn((void *)pBias, 1, NULL); 
      //prms_1d_pBaisBuf.dest_addr    = (uint64_t)pBiasData; 
      //prms_1d_pBaisBuf.src_addr     = (uint64_t)pBias;
      prms_1d_pBaisBuf.length       = numOutChannels * sizeof(Tb); 
      appUdmaCopy1D(NULL, &prms_1d_pBaisBuf); 

      appMemCacheInv(pBiasData, numOutChannels * sizeof(Tb));       

      // Output accumulator region, placed after the bias and zeroed by the CPU
      // before the kernel writes into it.
      Tacc  *pOutData = (Tacc *)(uiMemUsedSize + (uint32_t)gPtrL2RAM);
      memset(pOutData, 0, numOutChannels * outChPitch * sizeof(Tacc));
      uiMemUsedSize += (uint32_t)(numOutChannels * outChPitch * sizeof(Tacc));
    
      // Only the kernel call is timed; the transfers above and below are
      // outside the measured interval.
      ullCyclesStart = get_tsc();

      TIDL_refConv2dKernel_i8u_c8s_o32s_3x3s1d1(pInData, pWeightData, pBiasData, pOutData , &min, &max, numTotRoi, numGroups, numInChannels,
        numOutChannels, inChPitch, outChPitch, width, height, inImPitch, outImPitch,
        coeffsWidth, coeffsHeight, dilationWidth, dilationHeight, strideWidth, strideHeight, params->enableBias);
      
      ullCyclesEnd = get_tsc();   
      
      // NOTE(review): %llu assumes the cycle counters are unsigned long long;
      // their declarations are not visible here — confirm.
      printf("Sy-Conv cost cycles %llu\n", (ullCyclesEnd - ullCyclesStart));

      // Write back the kernel's output (and the destination range) before the
      // result is copied out.
      appMemCacheWb(pOutData, numOutChannels * outChPitch * sizeof(Tacc));
      appMemCacheWb(accPtr, numOutChannels * outChPitch * sizeof(Tacc));

      // 2D copy pOutData -> accPtr: one row per output channel, with the row
      // width equal to both pitches.
      app_udma_copy_2d_prms_t prms_2d_pOutBuf;
      appUdmaCopy2DPrms_Init(&prms_2d_pOutBuf);
      prms_2d_pOutBuf.width        = outChPitch * sizeof(Tacc);
      prms_2d_pOutBuf.height       = numOutChannels;
      prms_2d_pOutBuf.dest_pitch   = outChPitch * sizeof(Tacc);
      prms_2d_pOutBuf.src_pitch    = outChPitch * sizeof(Tacc);
      prms_2d_pOutBuf.dest_addr    = Udma_appVirtToPhyFxn((void *)accPtr, 1, NULL);  //Udma_appVirtToPhyFxn
      prms_2d_pOutBuf.src_addr     = Udma_appVirtToPhyFxn((void *)pOutData, 1, NULL);  //Udma_appPhyToVirtFxn
      //prms_2d_pOutBuf.dest_addr    = (uint64_t)accPtr;
      //prms_2d_pOutBuf.src_addr     = (uint64_t)pOutData;  
      appUdmaCopy2D(NULL, &prms_2d_pOutBuf, 1);

      // Invalidate accPtr so later CPU reads see the copied data.
      appMemCacheInv(accPtr, numOutChannels * outChPitch * sizeof(Tacc));  
    } 

This is the code the customer uses on the C66 to call the function, with UDMA carrying the data. gPtrL2RAM is the base address of L2 (0x00800000).

Now the data transport results are correct, but there is no performance improvement. Are they using UDMA correctly?

Could you help check this case? Thanks.

Best Regards,

Cherry

  • Hi,

    May I know if there is any update?

    Thanks and regards,

    Cherry

  • Cherry, 

    Sorry for the delayed response. Can you confirm:

    1. what is the address of pCoeffs? Is it also in C66 L2?

    2. Can you confirm how you verified that the data was right? You used 0x00800000 for gPtrL2RAM, which is the C66 view. UDMA needs the address as seen from the SoC side, which should be 0x4D80800000, so I am not sure how the data movement could be correct.

    If you are using UDMA to move data within the C66 L2, the UDMA may not get optimal bandwidth.

    regards

    Jian