Hi,
We are using BIOS PSP release version 1.10.03 to implement some video processing algorithms on a DM648 board. We used the video sample in pspdrivers_1_10_03\packages\ti\sdo\pspdrivers\system\dm648\bios\evmDM648\video\sample\build\hd_loopback as a starting point and added a Laplace filtering function for the video data (see below). When loading this application onto the target, we observe a CPU load of about 76 % (in the CPU load graph), whereas we expected it to be around 55 % (calculated from the cycle count displayed in Compiler Consultant).
What is the reason for this high overhead? How can we prevent it?
To make our application clearer for you, I will explain it in depth.
The data is first DMA-copied from the capture buffer in DDR2 RAM to IRAM. Here are the declarations of the global row buffers:
/* Number of frame lines handed to the display per processing pass. */
#define LINES_PROCESSED_Y (12)

/*
 * Line-buffer geometry, derived from how the buffers are used in this file:
 *  - every capture DAT_copy2d transfers LINES_PROCESSED_Y+1+2 lines, and
 *    laPlaceFiltering() reads lines k..k+2 for k = 0..LINES_PROCESSED_Y,
 *    so a capture buffer must hold LINES_PROCESSED_Y+3 lines;
 *  - laPlaceFiltering() writes lines 0..LINES_PROCESSED_Y, so a display
 *    buffer must hold LINES_PROCESSED_Y+1 lines.
 * LINE_BUF_PAD absorbs the 8-byte _mem8 loads and the stores at column j+1,
 * both of which reach a few bytes past the end of the last line.
 * NOTE(review): this costs a little more IRAM per buffer than sizing by
 * LINES_PROCESSED_Y+2 / LINES_PROCESSED_Y lines; confirm the section has room.
 */
#define CAP_LINE_BUF_LINES (LINES_PROCESSED_Y + 1 + 2)
#define DIS_LINE_BUF_LINES (LINES_PROCESSED_Y + 1)
#define LINE_BUF_PAD       (8)

/* A #pragma directive must sit on one physical line together with its arguments. */
#pragma DATA_SECTION(chCapLine_y1, ".UPSCALING_BUFFER")
Uint8 chCapLine_y1[CAP_LINE_BUF_LINES * DIS_NUMPIXELS + LINE_BUF_PAD];

#pragma DATA_SECTION(chCapLine_y2, ".UPSCALING_BUFFER")
Uint8 chCapLine_y2[CAP_LINE_BUF_LINES * DIS_NUMPIXELS + LINE_BUF_PAD];

#pragma DATA_SECTION(chDisLine_y1, ".UPSCALING_BUFFER")
Uint8 chDisLine_y1[DIS_LINE_BUF_LINES * DIS_NUMPIXELS + LINE_BUF_PAD];

#pragma DATA_SECTION(chDisLine_y2, ".UPSCALING_BUFFER")
Uint8 chDisLine_y2[DIS_LINE_BUF_LINES * DIS_NUMPIXELS + LINE_BUF_PAD];
The Laplace filtering function is invoked in test_hd_loopback():
// snippet start
/*
 * Excerpt from test_hd_loopback(): exchanges the capture frame with the
 * driver, then walks the frame in chunks. In each loop pass, DAT_copy2d
 * transfers are issued for one pair of line buffers while
 * laPlaceFiltering() runs on the other pair; the roles are then swapped.
 * The enclosing function and the closing brace of the else branch are
 * outside this excerpt.
 */
/* Give the old capture frame buffer back to driver and get the
recently captured frame buffer */
status = FVID_exchange(capChInfo.chanHandle,
&(capChInfo.frame));
if (IOM_COMPLETED != status) {
//LOG_printf(&trace, "Failed to exchange buffers
capture\n");
break;
}
else
{
// NOTE(review): the bound is inclusive (<=), so the body runs
// 2*DIS_NUMLINES/(2*LINES_PROCESSED_Y) + 1 times.
for (i = 0; i <= 2*DIS_NUMLINES/(2*LINES_PROCESSED_Y); i++)
{
// double buffering:
// first capture buffer is fetched and first display buffer is
transmitted (async)
// second buffer is processed
by CPU
// NOTE(review): every DAT_copy2d result in this loop is stored in
// waitId[0], each overwriting the previous one; no wait on any of these
// transfer IDs is visible in this excerpt.
// Capture fetch into chCapLine_y1: the line-count argument is
// LINES_PROCESSED_Y+1+2.
waitId[0] =
DAT_copy2d(DAT_2D1D,
capChInfo.frame->frame.iFrm.y1
+
(2*LINES_PROCESSED_Y*i*DIS_NUMPIXELS),
chCapLine_y1,
DIS_NUMPIXELS,
LINES_PROCESSED_Y+1+2,
DIS_NUMPIXELS);
// From the second pass on, write chDisLine_y1 back to the display frame
// at the chunk position of the previous pass (i-1).
if (i >
0)
{
waitId[0] =
DAT_copy2d(DAT_2D1D,
chDisLine_y1,
disChInfo.frame->frame.iFrm.y1 +
(2*LINES_PROCESSED_Y*(i-1)*DIS_NUMPIXELS),
DIS_NUMPIXELS,
LINES_PROCESSED_Y,
DIS_NUMPIXELS);
}
// NOTE(review): on the first pass (i == 0) nothing in this excerpt has
// filled chCapLine_y2 yet, so this call filters whatever the buffer holds.
// The row argument 2*LINES_PROCESSED_Y*(i-1)+LINES_PROCESSED_Y is then
// negative (assuming i is signed) and is converted to the Uint16
// parameter uwRowCnt.
laPlaceFiltering(chCapLine_y2,
chDisLine_y2,
(2*LINES_PROCESSED_Y*(i-1)+LINES_PROCESSED_Y));
// second buffer is fetched and second display buffer is transmitted
(async)
// first buffer is processed by
CPU
// Capture fetch into chCapLine_y2, starting LINES_PROCESSED_Y lines after
// the chCapLine_y1 fetch of this pass; again LINES_PROCESSED_Y+1+2 lines.
waitId[0] =
DAT_copy2d(DAT_2D1D,
capChInfo.frame->frame.iFrm.y1
+
((2*LINES_PROCESSED_Y*i+LINES_PROCESSED_Y)*DIS_NUMPIXELS),
chCapLine_y2,
DIS_NUMPIXELS,
LINES_PROCESSED_Y+1+2,
DIS_NUMPIXELS);
// From the second pass on, write chDisLine_y2 back to the display frame
// (second half of the previous pass's chunk).
if (i >
0)
{
waitId[0] =
DAT_copy2d(DAT_2D1D,
chDisLine_y2,
disChInfo.frame->frame.iFrm.y1 +
(2*LINES_PROCESSED_Y*
(i-1)*DIS_NUMPIXELS+LINES_PROCESSED_Y*DIS_NUMPIXELS),
DIS_NUMPIXELS,
LINES_PROCESSED_Y,
DIS_NUMPIXELS);
}
// NOTE(review): no wait for the chCapLine_y1 transfer started at the top
// of this pass is visible before chCapLine_y1 is filtered here.
laPlaceFiltering(chCapLine_y1,
chDisLine_y1,
2*LINES_PROCESSED_Y*i);
}
// snippet end
Here is the Laplace filtering function:
/*
 * ======== laPlaceFiltering ========
 * Filters one chunk of lines from a capture line buffer into a display
 * line buffer.
 *
 * pchCapBuffer - Capture line buffer pointer (input). Lines k, k+1 and k+2
 *                are read for every k in 0..LINES_PROCESSED_Y.
 * pchDisBuffer - Display line buffer pointer (output). Lines
 *                0..LINES_PROCESSED_Y are written.
 * uwRowCnt     - Row position of this chunk; only used to detect the two
 *                rows that are copied through unchanged.
 *
 * For each output pixel, three 4-byte windows (previous, current and next
 * line) are combined with the file-level factors slLaplaceFactorBuf1..3
 * via _dotpsu4, the sum is scaled by
 * gCfgReg[I2C_CONTOUR_FACTOR_REGADDR] / 256 and clamped to 0..255.
 * Six pixels are produced per pass of the j loop.
 *
 * NOTE(review): part of the body is elided in this listing (the ".."
 * lines), so the handling of pixels 5 and 6 is not visible here.
 */
static void
laPlaceFiltering(Uint8 *restrict pchCapBuffer,
Uint8
*restrict pchDisBuffer,
Uint16 uwRowCnt)
{
Int j, k, l;
unsigned long long
ullCapPixelPrevLine;
unsigned long long
ullCapPixelCurrLine;
unsigned long long
ullCapPixelNextLine;
Uint32 ulPixelBufPrevLine;
Uint32
ulPixelBufCurrLine;
Uint32
ulPixelBufNextLine;
Int32 slLaplaceSum1,
slLaplaceSum2, slLaplaceSum3;
// NOTE(review): the sum of three Int32 dot products is narrowed to Int16.
Int16
swLaplaceResult;
// NOTE(review): none of the slLowPass* variables is referenced in the
// visible code; they may be used by the elided section.
Int32 slLowPassFilterBuf1
= 0, slLowPassFilterBuf2 = 0, slLowPassFilterBuf3 = 0;
Int32 slLowPassSum1, slLowPassSum2,slLowPassSum3;
Int32 slLowPassResult;
Uint8
chCurrPixel;
Int16 swResult;
// Staging area for the six result pixels of one j pass.
// NOTE(review): read below with the aligned intrinsics _amem4/_amem2; no
// alignment directive for this Uint8 array is visible.
Uint8 chDisPixel[6];
// NOTE(review): runs for k = 0..LINES_PROCESSED_Y, i.e. one row more than
// the LINES_PROCESSED_Y rows the caller copies out of the display buffer.
for (k = 0; k <
(LINES_PROCESSED_Y+1); k++)
{
//#######################################################################
// first and last row calculated different
//#######################################################################
// NOTE(review): the second condition tests k == LINES_PROCESSED_Y - 1,
// although the loop continues to k == LINES_PROCESSED_Y.
if (((uwRowCnt == 0) && (k == 0)) ||
((uwRowCnt
== DIS_NUMLINES-LINES_PROCESSED_Y) && (k == LINES_PROCESSED_Y
- 1)))
{
// data unchanged in first
row
for
(j = 0; j < DIS_NUMPIXELS; j++)
{
pchDisBuffer[k*DIS_NUMPIXELS+j]
= pchCapBuffer[k*DIS_NUMPIXELS+j];
}
}
else
{
//#######################################################################
// first column calculated
different ("Randproblem")
//#######################################################################
// The first pixel of the row is copied from capture line k.
// NOTE(review): the filtered pixels below are centred on capture line
// k+1, whereas this copy (and the last-column copy) use line k.
pchDisBuffer[k*DIS_NUMPIXELS] =
pchCapBuffer[k*DIS_NUMPIXELS];
for (j =
0; j < DIS_NUMPIXELS; j=j+6)
{
// 6 Pixel are calculated in a loop cycle
//########################################################
// get
captured data and save in temporary register
//########################################################
// Unaligned 8-byte loads at column j of lines k, k+1 and k+2.
// NOTE(review): on the last j pass of a row the 8-byte load extends
// beyond the end of that row.
ullCapPixelPrevLine = _mem8(&pchCapBuffer[k*DIS_NUMPIXELS+j]);
ullCapPixelCurrLine =
_mem8(&pchCapBuffer[k*DIS_NUMPIXELS+j+DIS_NUMPIXELS]);
ullCapPixelNextLine =
_mem8(&pchCapBuffer[k*DIS_NUMPIXELS+j+2*DIS_NUMPIXELS]);
//########################################################
// calculate
first four pixel
// (because shifting input buffer about more than 4
bytes is not possible)
//########################################################
for (l = 0;
l < 4; l++)
{
//########################################################
// pack captured data in temporary pixel buffer:
//########################################################
// Low 32 bits after shifting by l bytes: the 4-byte window starting
// at column j+l of each line.
ulPixelBufPrevLine = (Uint32)(ullCapPixelPrevLine >> 8*l);
ulPixelBufCurrLine = (Uint32)(ullCapPixelCurrLine >> 8*l);
ulPixelBufNextLine = (Uint32)(ullCapPixelNextLine >> 8*l);
//########################################################
// la place filter
//########################################################
slLaplaceSum1 = _dotpsu4(slLaplaceFactorBuf1,
ulPixelBufPrevLine);
slLaplaceSum2 =
_dotpsu4(slLaplaceFactorBuf2, ulPixelBufCurrLine);
slLaplaceSum3 = _dotpsu4(slLaplaceFactorBuf3,
ulPixelBufNextLine);
swLaplaceResult = slLaplaceSum1
+ slLaplaceSum2 + slLaplaceSum3;
//########################################################
// normalization
//########################################################
// NOTE(review): chCurrPixel is assigned here but not read in the
// visible code.
chCurrPixel = (ulPixelBufCurrLine >> 8);
// NOTE(review): gCfgReg[...] is read and a signed division by 256 is
// evaluated for every pixel; the result is narrowed to Int16.
swResult = (gCfgReg[I2C_CONTOUR_FACTOR_REGADDR]*swLaplaceResult /
256);
// Clamp to the 8-bit pixel range.
if (swResult < 0)
// < 0
chDisPixel[l] = 0;
else if
(swResult > 255) // > 255
chDisPixel[l] = 255;
else
chDisPixel[l] = swResult;
}
//########################################################
// calculate
next two pixel
// (because shifting input buffer about more than 4
bytes is not possible)
//########################################################
for (l = 0;
l < 2; l++)
{
// Same windows as above, taken from the upper 32 bits (_hill) of each
// 8-byte load, i.e. starting at column j+4+l.
ulPixelBufPrevLine = (Uint32)(_hill(ullCapPixelPrevLine) >>
8*l);
ulPixelBufCurrLine = (Uint32)(_hill(ullCapPixelCurrLine) >>
8*l);
ulPixelBufNextLine =
(Uint32)(_hill(ullCapPixelNextLine) >> 8*l);
..
// code removed (these pixels are calculated like the ones above)
..
}
//########################################################
// save
data
//########################################################
// Unaligned 4-byte + 2-byte stores of the six results at columns
// j+1..j+6 of output line k.
// NOTE(review): on the last j pass of a row, column j+6 lies at or
// beyond the end of that row, so the store spills into the next row.
_mem4(&pchDisBuffer[k*DIS_NUMPIXELS+j+1]) =
_amem4(&chDisPixel[0]);
_mem2(&pchDisBuffer[k*DIS_NUMPIXELS+j+1+4]) =
_amem2(&chDisPixel[4]);
}
//#######################################################################
// last column calculated
different
//#######################################################################
// The last pixel of the row is copied from capture line k, overwriting
// whatever the j loop stored there.
pchDisBuffer[k*DIS_NUMPIXELS+DIS_NUMPIXELS-1] =
pchCapBuffer[k*DIS_NUMPIXELS+DIS_NUMPIXELS-1];
}
}
}
The DAT_copy2d functions were optimized like described in this post: http://e2e.ti.com/support/dsp/tms320c6000_high_performance_dsps/f/112/t/75569.aspx
Compiler Consultant reports that the inner loop in the function laPlaceFiltering takes 24 cycles per iteration. Because 6 pixels are calculated per iteration (i.e. 4 cycles per pixel) and our input frame is 1920x1080x2 bytes at 30 frames per second, we expect the algorithm to take 1920x1080x2x30x4 ≈ 498 million cycles per second, which corresponds to a CPU load of about 55 % (our processor is running at 900 MHz).
So how can we reduce this overhead?
Thanks and best regards,
Lars