dct on vicp (dm6446)

liu Andy

Hi all

I have used vicplib_v300 and tested some examples such as sum and tableLookup; they worked correctly.

Now I want to use the VICP to compute a DCT, so I wrote my own test file, but its result is wrong.

In vicplib_v300\test\src, I wrote dct.c:

/* Standard header file */
#include <std.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <tistdtypes.h>

/* DSP-BIOS header files */
#include <hwi.h>
#include <mem.h>
#include <clk.h>
#include <log.h>
#include "platform_support.h"

/* VICP Signal Processing Library header files */
#include "vicplib.h"
#include "gpp_vicplib.h"
#include "testParams.h"

/* Local defines for the test file */
//#define BLOCK_WIDTH 136
//#define BLOCK_HEIGHT 30
#define BLOCK_WIDTH 8
#define BLOCK_HEIGHT 8
#define NUM_BLOCK_HORIZ 10
#define NUM_BLOCK_VERT 6

#define WIDTH (BLOCK_WIDTH*NUM_BLOCK_HORIZ)
#define HEIGHT (BLOCK_HEIGHT*NUM_BLOCK_VERT)

/* Global pointers used by the test application */
char *SRC;
char *DST;

/*
    Flag used to synchronize between the test application and the VICP.
    The VICP signals the completion of the requested processing by driving the
    CPU interrupt. In this example code, the ISR only sets the flag below.
    The flag is polled by the example code to check for completion.
    In an actual application, the user can choose to drive semaphores in the ISR.
*/

/* Set the flag to 0 to start with */
volatile Uint16 VICPIntFlag = 0;
/*
    The below fxn is used as VICP ISR.
    This is configured using the ECM module in the test code
*/
/* VICP interrupt handler: its only job is to raise VICPIntFlag. */
void CPIS_isr()
{
    VICPIntFlag = 1;
}

/*
    The VICP Lib API CPIS_wait allows the user to specify a wait function.
    In the wait function below, used by the example code, we only poll VICPIntFlag.
    In an actual application, this should be replaced with a semaphore.
*/
/*
    Wait callback handed to the library: blocks by busy-waiting.
    arg is not used.
*/
void CPIS_wait_Fxn(void *arg)
{
    /* Spin until the ISR has raised the flag, then clear it for the next run */
    while (VICPIntFlag == 0) {
        /* busy-wait */
    }
    VICPIntFlag = 0;

    /* Spin until the library itself no longer reports busy */
    while (CPIS_isBusy()) {
        /* busy-wait */
    }
}

/* Main Fxn */
/* Entry point of the test image: only prints a marker line. */
void main()
{
    fputs("main\n", stdout);
}

/* 8x8 block of signed 16-bit input samples for the DCT test, stored row by row. */
static short testdata[64] = {
    /* row 0 */ -59, -59, -59, -59, -59, -57, -57, -57,
    /* row 1 */ -61, -61, -59, -59, -59, -59, -59, -59,
    /* row 2 */ -61, -61, -61, -61, -61, -61, -59, -59,
    /* row 3 */ -63, -63, -63, -63, -63, -61, -61, -61,
    /* row 4 */ -64, -63, -63, -63, -63, -63, -63, -63,
    /* row 5 */ -64, -64, -63, -63, -63, -63, -63, -63,
    /* row 6 */ -63, -63, -63, -63, -63, -63, -60, -60,
    /* row 7 */ -63, -63, -63, -63, -63, -60, -60, -60
};

/*
    The main processing task that demonstrates the usage of the VICP LIB API
    with CPIS_dct.

    Steps: allocate one buffer holding the source and destination images,
    hook up the VICP interrupt, initialize the library, describe a
    WIDTH x HEIGHT region processed in BLOCK_WIDTH x BLOCK_HEIGHT blocks,
    load the 8x8 testdata block into the top-left corner of the source,
    run the DCT (setup, start/wait, reset, start/wait again), print the
    measured setup/reset/execution times and shut the library down.

    Calls exit(-1) when CPIS_init() or CPIS_dct() returns -1.
*/
void procFunction(){

    /* Parameter structures required to interface with the lib */
    CPIS_Init vicpInit;
    CPIS_BaseParms base;
    CPIS_dctParms params;
    CPIS_Handle handle;

    /* Local variables */
    Int32 i;
    unsigned long setupTimerDiff, resetTimerDiff, execTimerDiff;
    unsigned long timerStart, timerEnd;

    memInit();
    timerInit();

    /*
        Allocate the memory needed for the buffers.
        A single allocation of 4*WIDTH*HEIGHT bytes is split in two halves of
        2*WIDTH*HEIGHT bytes each: SRC is the first half, DST the second.
    */
    SRC = (char*)memAlloc(4*WIDTH*HEIGHT);
    DST = SRC + 2*WIDTH*HEIGHT;

    /* Set up the ISR for the VICP, function defined in platform_supportXXXX.c */
    intSetup(CPIS_isr);
    /* Enable interrupt for the VICP, function defined in platform_supportXXXX.c */
    intEnable();

    /* Initialize the library */
    vicpInit.cacheWbInv = (Cache_wbInv) cacheWb;
    vicpInit.staticDmaAlloc = 1;
    vicpInit.maxNumProcFunc = 1;
    vicpInit.memSize = CPIS_getMemSize(vicpInit.maxNumProcFunc);
    vicpInit.mem = memAlloc(vicpInit.memSize);

    if (CPIS_init(&vicpInit) == -1) {
        printf("\nCPIS_init error\n");
        exit(-1);
    }

    /* Initialize the call back function that the library will use to synchronize */
    CPIS_setWaitCB(&CPIS_wait_Fxn);

    /* Source, destination and coefficient (scalar) data are all 16-bit signed */
    base.srcFormat[0] = CPIS_16BIT;
    base.dstFormat[0] = CPIS_16BIT;
    params.scalarFormat = CPIS_16BIT;
    params.qShift = 0;
    params.sat_sign = 0;
    params.sat_high = 32767;
    params.sat_high_set = 32767;
    params.sat_low = -32768;
    params.sat_low_set = -32767;

    printf("Dct Testing starts ...\n");

    /*
        Initialize the remaining fields of the base params and the
        module parameter structure
    */
    base.srcBuf[0].ptr = (Uint8*)SRC;
    base.srcBuf[0].stride = WIDTH;

    base.dstBuf[0].ptr = (Uint8*)DST;
    base.dstBuf[0].stride = WIDTH;

    base.roiSize.width = WIDTH;
    base.roiSize.height = HEIGHT;
    base.procBlockSize.width = BLOCK_WIDTH;
    base.procBlockSize.height = BLOCK_HEIGHT;

    /*
        Start from an all-zero source so that the blocks outside the test
        pattern hold defined values instead of whatever memAlloc returned.
    */
    memset(SRC, 0, 2*WIDTH*HEIGHT);

    /*
        Place the 8x8 test block in the top-left corner of the source image:
        one testdata row (8 samples) per image row, image rows being
        2*WIDTH bytes apart.
    */
    for (i = 0; i < 8; i++) {
        memcpy(SRC + i*(2*WIDTH), testdata + i*8, 8*sizeof(testdata[0]));
    }

    /* Zero out dest buffer. */
    memset(base.dstBuf[0].ptr, 0, 2*WIDTH*HEIGHT);

    /* Clear the cache */
    cacheWbInvAll();

    /* Benchmark the setup time */
    timerStart = timerReadStart();

    /*
        Call the module in asynchronous mode. This means only setup of
        hardware is done. Execution will have to be triggered by CPIS_start
    */
    if (CPIS_dct(&handle, &base, &params, CPIS_ASYNC) == -1) {
        printf("\nCPIS_dct() error %d\n", CPIS_errno);
        exit(-1);
    }
    timerEnd = timerReadEnd();
    setupTimerDiff = timerEnd - timerStart;

    /* Benchmark the actual processing (first start/wait pair only) */
    timerStart = timerReadStart();

    /* Trigger the start of processing */
    CPIS_start(handle);
    CPIS_wait(handle);

    timerEnd = timerReadEnd();
    execTimerDiff = timerEnd - timerStart;

    /*
        To test multiple start/wait sequence, we rerun the processing
        we need to reset the function otherwise second run won't be correct
    */
    timerStart = timerReadStart();
    CPIS_reset(handle);
    timerEnd = timerReadEnd();
    resetTimerDiff = timerEnd - timerStart;

    CPIS_start(handle);
    CPIS_wait(handle);

    /* Delete the function and its associated set of parameters once done */
    CPIS_delete(handle);
    /* Clear the cache */
    cacheWbInvAll();

    /* The timer differences are unsigned long, hence %lu */
    printf("Setup Time = %lu, Reset Time = %lu, Execution Time = %lu !\n", setupTimerDiff, resetTimerDiff, execTimerDiff);

    /* Close the module and free the resources */
    CPIS_deInit();
    intDisable();

    printf("Testing complete !\n");

    return;
}

In imgproclib.c I added

/*
 * Public entry point of the DCT module.
 *
 * Declares one input and one output on base, registers the DCT-specific
 * check and processing callbacks and hands everything to _CPIS_genericCall(),
 * whose return value is passed straight back to the caller.
 */
Int32 CPIS_dct(
    CPIS_Handle *handle,
    CPIS_BaseParms *base,
    CPIS_dctParms *params,
    CPIS_ExecType execType
)
{
    CPIS_FuncStruct func;

    /* Single source, single destination */
    base->numOutput = 1;
    base->numInput = 1;

    /* DCT-specific callbacks */
    func.procFunc = &_CPIS_setDctProcessing;
    func.checkFunc = &_CPIS_checkDctParams;
    func.resetFunc = NULL;

    /*
     * NULL selects the default functions _CPIS_setDmaInTransfers() and
     * _CPIS_setDmaOutTransfers() in _imgproclib.c
     */
    func.setDmaOutFunc = NULL;
    func.setDmaInFunc = NULL;

    return _CPIS_genericCall(handle, base, (void *)params, execType, &func);
}

In vicplib_v300\src\src_hw, I added file _dct.c

/* Include the lib interface header files */
#include "vicp_support.h"
#include "vicplib.h"
#include "_vicplib.h"
#include "vicp_sch.h"
#include "vicp_comp.h"

/*
 * Validate the data formats requested for a DCT call.
 *
 * base : base parameters; srcFormat[0] and dstFormat[0] are inspected.
 * p    : pointer to a CPIS_dctParms; its scalarFormat is inspected.
 *
 * Accepted source and scalar formats: CPIS_16BIT, CPIS_8BIT, CPIS_U16BIT,
 * CPIS_U8BIT. Accepted destination formats: CPIS_16BIT, CPIS_8BIT.
 *
 * Returns 0 when every format is accepted. Otherwise sets CPIS_errno to
 * CPIS_NOSUPPORTFORMAT_ERROR and returns -1.
 */
Int32 _CPIS_checkDctParams(
    CPIS_BaseParms *base,
    void *p)
{
    CPIS_dctParms *dctParms = (CPIS_dctParms *)p;
    Uint16 inFmt = base->srcFormat[0];
    Uint16 outFmt = base->dstFormat[0];
    Uint16 coefFmt = dctParms->scalarFormat;

    int inOk = (inFmt == CPIS_16BIT) || (inFmt == CPIS_8BIT) ||
               (inFmt == CPIS_U16BIT) || (inFmt == CPIS_U8BIT);
    int coefOk = (coefFmt == CPIS_16BIT) || (coefFmt == CPIS_8BIT) ||
                 (coefFmt == CPIS_U16BIT) || (coefFmt == CPIS_U8BIT);
    int outOk = (outFmt == CPIS_16BIT) || (outFmt == CPIS_8BIT);

    /* Every rejection reports the same error code */
    if (!inOk || !coefOk || !outOk) {
        CPIS_errno = CPIS_NOSUPPORTFORMAT_ERROR;
        return -1;
    }

    return 0;
}

/*
 * Processing-setup callback of the DCT module.
 *
 * Writes the command sequence (saturation parameters, 8x8 row DCT, sleep)
 * into the command buffer at CMDBUF_BASE + ipRun->cmdOfst, fills 64
 * coefficients at COEFFBUF_BASE + ipRun->coefOfst, and stores the resulting
 * lengths and the output offset back into ipRun.
 *
 * ipRun : run descriptor; offsets are read from it, lengths written to it.
 * base  : base parameters (formats and processing block size).
 * p     : pointer to the CPIS_dctParms of the call.
 *
 * Always returns 0.
 *
 * NOTE(review): typeSrc, typeScalar and typeDst are left unset when a format
 * matches none of the branches below; this presumably relies on
 * _CPIS_checkDctParams() having rejected such formats first - confirm the
 * call order in _CPIS_genericCall().
 */
Int32 _CPIS_setDctProcessing(
    CPIS_IpRun *ipRun,
    CPIS_BaseParms *base,
    void *p)
{
    CPIS_dctParms *params = (CPIS_dctParms *)p;
    CPIS_Info info;
    Uint16 typeSrc, typeDst, typeScalar;
    int k;

    /* Describe the image, command and coefficient areas used by this run */
    info.procBlockSize = base->procBlockSize;
    info.imgbufptr = IMGBUF_A_BASE + ipRun->imgbufInOfst;
    info.imgbuflen = ipRun->imgbufLen;
    info.cmdptr = (Int16 *)(CMDBUF_BASE + ipRun->cmdOfst);
    info.cmdlen = 0;
    info.coefptr = (Int16 *)(COEFFBUF_BASE + ipRun->coefOfst);
    info.coeflen = 0;

    IMGBUF_switch(SELALLBUF, ALLBUFDSP);

    /* Destination element type */
    if (base->dstFormat[0] == CPIS_8BIT) {
        typeDst = IMXOTYPE_BYTE;
    } else if (base->dstFormat[0] == CPIS_16BIT) {
        typeDst = IMXOTYPE_SHORT;
    }

    /* Coefficient (scalar) element type */
    if (params->scalarFormat == CPIS_8BIT) {
        typeScalar = IMXTYPE_BYTE;
    } else if (params->scalarFormat == CPIS_U8BIT) {
        typeScalar = IMXTYPE_UBYTE;
    } else if (params->scalarFormat == CPIS_16BIT) {
        typeScalar = IMXTYPE_SHORT;
    } else if (params->scalarFormat == CPIS_U16BIT) {
        typeScalar = IMXTYPE_USHORT;
    }

    /* Source element type */
    if (base->srcFormat[0] == CPIS_8BIT) {
        typeSrc = IMXTYPE_BYTE;
    } else if (base->srcFormat[0] == CPIS_U8BIT) {
        typeSrc = IMXTYPE_UBYTE;
    } else if (base->srcFormat[0] == CPIS_16BIT) {
        typeSrc = IMXTYPE_SHORT;
    } else if (base->srcFormat[0] == CPIS_U16BIT) {
        typeSrc = IMXTYPE_USHORT;
    }

    /* First command: saturation parameters taken from the call parameters */
    info.cmdlen += imxenc_set_parameters_dav(
        params->sat_sign,
        params->sat_high, params->sat_high_set,
        params->sat_low, params->sat_low_set,
        info.cmdptr + info.cmdlen);

    /*
     * Fill the coefficient matrix with 1.
     * NOTE(review): these are placeholder values, not DCT basis coefficients;
     * with all ones each output value is the sum of the eight inputs of its
     * row.
     */
    k = 0;
    while (k < 64) {
        info.coefptr[k] = 1;
        k++;
    }
    info.coeflen += 64;

    /* Second command: the DCT itself; the output goes right after the input */
    info.cmdlen += imxenc_dct8x8row(
        (Int16 *)info.imgbufptr,                      /* 0x11100000 */
        info.coefptr,                                 /* 0x11108000 */
        (Int16*)(info.imgbufptr + ipRun->imgbufLen),  /* 0x11100000 + 64*2 */
        base->procBlockSize.width,                    /* 8 */
        base->procBlockSize.height,                   /* 8 */
        base->procBlockSize.width,                    /* 8 */
        base->procBlockSize.height,                   /* 8 */
        1,
        1,
        typeSrc,                                      /* IMXTYPE_SHORT */
        typeScalar,                                   /* IMXTYPE_SHORT */
        typeDst,                                      /* IMXOTYPE_SHORT */
        params->qShift,                               /* 0 */
        info.cmdptr + info.cmdlen);

    /* Last command: sleep */
    info.cmdlen += imxenc_sleep(info.cmdptr + info.cmdlen);

    /* Report the sizes back to the caller */
    ipRun->imgbufLen = info.imgbuflen;   /* info.imgbuflen is in number of bytes */
    ipRun->cmdLen = info.cmdlen << 1;    /* info.cmdlen is in number of words */
    ipRun->coefLen = info.coeflen << 1;  /* info.coeflen is in number of words */

    /* The output starts where the input ends */
    ipRun->imgbufOutOfst[0] = ipRun->imgbufInOfst + ipRun->imgbufLen; /* 0 + 128 */

    return 0;
}

so my input is

static short testdata[64] =
{
    -59, -59, -59, -59, -59, -57, -57, -57,
    -61, -61, -59, -59, -59, -59, -59, -59,
    -61, -61, -61, -61, -61, -61, -59, -59,
    -63, -63, -63, -63, -63, -61, -61, -61,
    -64, -63, -63, -63, -63, -63, -63, -63,
    -64, -64, -63, -63, -63, -63, -63, -63,
    -63, -63, -63, -63, -63, -63, -60, -60,
    -63, -63, -63, -63, -63, -60, -60, -60
};

After a 1-D DCT, the output is supposed to be
{
    -165, -2, 1, 0, -1, 0, 0, 0,
    -168, -2, -1, -1, 0, 0, 1, 0,
    -171, -2, 1, -1, 0, 0, -1, 0,
    -176, -2, 1, 0,-1, 0, 0, 0,
    -179, 0, 0, 0, 0, 0, 0, 0,
    -179, -1, -1, 0, 0, 0, 0, 0,
    -176, -3, 2, -1, 0, 1, -1, 0,
    -175, -4, 1, 1, -1, 0, 1, -1
}
but the result is
{
    -466, -466, -466, -466, -466, -466, -466, -466,
    -476, -476, -476, -476, -476, -476, -476, -476,
    -484, -484, -484, -484, -484, -484, -484, -484,
    -498, -498, -498, -498, -498, -498, -498, -498,
    -505, -505, -505, -505, -505, -505, -505, -505,
    -506, -506, -506, -506, -506, -506, -506, -506,
    -498, -498, -498, -498, -498, -498, -498, -498,
    -495, -495, -495, -495, -495, -495, -495, -495,
}.

I am sure the input affects the output, because if I change the second input row from
{    -61, -61, -59, -59, -59, -59, -59, -59, }
to
{    -59, -59, -59, -59, -59, -57, -57, -57, }
just like the first row, the output change from
{-476, -476, -476, -476, -476, -476, -476, -476, }
to
{-466, -466, -466, -466, -466, -466, -466, -466, }
just like the first row.

This really puzzles me.

by the way, I used EDMA3 01.06.

I would really appreciate your response. Thank you!

over 16 years ago

0 Victor Cheng over 16 years ago

TI__Expert 6335 points

Hello Andy,

I noticed that you are filling the coef matrix with 1 so you are not passing real DCT coefficients. Hence the function is merely doing the summation of each row.

I am suggesting this set of coefficients, which are basically DCT coefficients but multiplied by 65536, I believe.

/* 8x8 coefficient table, 8 values per line; in this layout each line holds
   one 8-point DCT basis vector (the first line is the constant term). */
Int16 DCTCOEF88 [64] = {
    23170, 23170, 23170, 23170, 23170, 23170, 23170, 23170,
    32138, 27246, 18205, 6393, -6393,-18205,-27246,-32138,
    30274, 12540,-12540,-30274,-30274,-12540, 12540, 30274,
    27246, -6393,-32138,-18205, 18205, 32138, 6393,-27246,
    23170,-23170,-23170, 23170, 23170,-23170,-23170, 23170,
    18205,-32138, 6393, 27246,-27246, -6393, 32138,-18205,
    12540,-30274, 30274,-12540,-12540, 30274,-30274, 12540,
    6393,-18205, 27246,-32138, 32138,-27246, 18205, -6393 };

You will need to set params->qShift to 16 .

Let me know if this works. I will update the documentation to make it clearer.

0 liu Andy over 16 years ago

Prodigy 70 points

hi Victor Cheng.

Thanks for your suggestion. I used the DCTCOEF88 table above as the coefficient matrix. The output is

output[64] =
{
    -155, 40, -32, 11, -18, 2, -10, -5,
    -158, 41, -35, 11, -19, 2, -10, -4,
    -161, 42, -33, 11, -18, 2, -11, -5,
    -165, 43, -34, 12, -20, 2, -11, -5,
    -167, 45, -36, 12, -20, 2, -12, -5,
    -167, 45, -37, 12, -20, 2, -11, -5,
    -166, 43, -33, 11, -19, 2, -12, -5,
    -165, 42, -33, 12, -20, 2, -10, -6
};

This is a little different from my DCT output:

{
    -165, -2, 1, 0, -1, 0, 0, 0,
    -168, -2, -1, -1, 0, 0, 1, 0,
    -171, -2, 1, -1, 0, 0, -1, 0,
    -176, -2, 1, 0,-1, 0, 0, 0,
    -179, 0, 0, 0, 0, 0, 0, 0,
    -179, -1, -1, 0, 0, 0, 0, 0,
    -176, -3, 2, -1, 0, 1, -1, 0,
    -175, -4, 1, 1, -1, 0, 1, -1
}.

Looking at my DCT C code, I see some additional calculation that produces more zeros in the output matrix.

Maybe it is there to make the subsequent Huffman encoder run faster.

0 Victor Cheng over 16 years ago in reply to liu Andy

TI__Expert 6335 points

Hello Andy,

It may be that your code has an extra quantization step. You can also perform this extra step with the VICP using imxenc_array_op() chained after the DCT. You will probably have to use Q-format numbers to perform the division as a multiplication.

I am providing here the matlab equivalent for imxenc_dct8x8row() function in case you want to have visibility in what it is doing:

function outputm=imx_dct8x8row(input, coeff, input_width, input_height, output_width, output_height, calc_Hblks, calc_Vblks, input_type, coeff_type, output_type, rnd_shift)
% IMX_DCT8X8ROW  MATLAB equivalent of imxenc_dct8x8row().
%   The result has calc_Vblks*8 rows and calc_Hblks*8 columns. Within every
%   8x8 block, output element k of a row is the dot product of that row's
%   8 input samples with column k of COEFF as passed in.
%   input_width, input_height, output_width, output_height, input_type,
%   coeff_type, output_type and rnd_shift are accepted but not used here,
%   so no shift or type conversion is applied in this model.

% Transpose once so that coeff(k,1:8) below addresses column k of the
% matrix that was passed in.
coeff=coeff';

outputm=zeros(calc_Vblks*8,calc_Hblks*8);

% i, j: block row / block column; row: line inside the block; k: output index
for i=1:calc_Vblks,
for j=1:calc_Hblks,
    for row=1:8,
    for k=1:8,
     outputm((i-1)*8+row,(j-1)*8+k)=sum(input((i-1)*8+row,(j-1)*8+1:j*8).*coeff(k,1:8));
    end;
    end;
end;
end;

Lastly, I recommend that you set BLOCK_WIDTH and BLOCK_HEIGHT as large as possible to minimize overhead. Maybe try BLOCK_WIDTH=40 and BLOCK_HEIGHT=40. You will have to pass calc_Hblks = calc_Vblks = 5 to imxenc_dct8x8row().

Victor

0 liu Andy over 16 years ago

Prodigy 70 points

Hi Victor Cheng

I will use imxenc_array_op() in my code. Thank you for your help

0 liu Andy over 16 years ago

Prodigy 70 points

The DCTCOEF88[64] table seems to need to change from
/* Original layout: each line holds one 8-point DCT basis vector
   (the first line is the constant term). */
Int16 DCTCOEF88 [64] = {
    23170, 23170, 23170, 23170, 23170, 23170, 23170, 23170,
    32138, 27246, 18205, 6393, -6393,-18205,-27246,-32138,
    30274, 12540,-12540,-30274,-30274,-12540, 12540, 30274,
    27246, -6393,-32138,-18205, 18205, 32138, 6393,-27246,
    23170,-23170,-23170, 23170, 23170,-23170,-23170, 23170,
    18205,-32138, 6393, 27246,-27246, -6393, 32138,-18205,
    12540,-30274, 30274,-12540,-12540, 30274,-30274, 12540,
     6393,-18205, 27246,-32138, 32138,-27246, 18205, -6393 };

to

/* Transposed layout: each column (not each line) holds one 8-point DCT
   basis vector; the first column is the constant term. */
Int16 DCTCOEF88 [64] = {
    23170, 32138, 30274, 27246, 23170, 18205, 12540, 6393,
    23170, 27246, 12540, -6393,-23170,-32138,-30274,-18205,
    23170, 18205,-12540,-32138,-23170, 6393, 30274, 27246,
    23170, 6393,-30274,-18205, 23170, 27246,-12540,-32138,
    23170, -6393,-30274, 18205, 23170,-27246,-12540, 32138,
    23170,-18205,-12540, 32138,-23170, -6393, 30274,-27246,
    23170,-27246, 12540, 6393,-23170, 32138,-30274, 18205,
    23170,-32138, 30274,-27246, 23170,-18205, 12540, -6393
};

The result then changes from
output[64] =
{
    -155, 40, -32, 11, -18, 2, -10, -5,
    -158, 41, -35, 11, -19, 2, -10, -4,
    -161, 42, -33, 11, -18, 2, -11, -5,
    -165, 43, -34, 12, -20, 2, -11, -5,
    -167, 45, -36, 12, -20, 2, -12, -5,
    -167, 45, -37, 12, -20, 2, -11, -5,
    -166, 43, -33, 11, -19, 2, -12, -5,
    -165, 42, -33, 12, -20, 2, -10, -6
};
to
output[64]
{
    -165, -2, 1, 0, -1, 0, 0, 0,
    -168, -2,-1,-1, 0, 0, 1, 0,
    -171, -2, 1,-1, 0, 0,-1, 0,
    -176, -2, 1, 0, -1, 0, 0, 0,
    -179, 0, 0, 0, 0, 0, 0, 0,
    -179, -1,-1, 0, 0, 0, 0, 0,
    -176, -3, 2,-1, 0, 1,-1, 1,
    -175, -4, 1, 1, -1, 0, 1, -1
};

just as the C code dct output
{
    -165, -2, 1, 0, -1, 0, 0, 0,
    -168, -2, -1, -1, 0, 0, 1, 0,
    -171, -2, 1, -1, 0, 0, -1, 0,
    -176, -2, 1, 0,-1, 0, 0, 0,
    -179, 0, 0, 0, 0, 0, 0, 0,
    -179, -1, -1, 0, 0, 0, 0, 0,
    -176, -3, 2, -1, 0, 1, -1, 0,
    -175, -4, 1, 1, -1, 0, 1, -1
}

0 Victor Cheng over 16 years ago in reply to liu Andy

TI__Expert 6335 points

Andy,

Thank you for the correction.

By the way there is a new release 3.1 of the VICP library on TI website. It has some new functions and a simplified way to specify the saturation parameters. You can have a look at it.

Processors

Processors forum

dct on vicp (dm6446)