This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

Optimization of simple function using linear assembly

Hello

I have tried to write a simple function that adds two vectors with saturation and saves the result in a third vector (because the current version in C wasn't fast enough). So far the attached code performs at ~0.5 cycles/element. I have a feeling it could be faster. Could you suggest something? I have compiled it with -mh24 and -O3 and have no further ideas. The DSP is c6400.


    .sect ".text:_func"
    .global    _func
        
   
_func:    .cproc    A4, B4, A6, B6
        .reg    vec_a, vec_b, vec_out, count, ah:al, bh:bl, ch:cl
        .no_mdep
       
            mv    A4, vec_a
            mv    B4, vec_b
            mv    A6, count
            mv    B6, vec_out
       
       
            shr    count, 3, count
           
loop:        .trip    2, 0x20000000, 2
            lddw    *vec_a++, ah:al
            lddw    *vec_b++, bh:bl
            saddu4    ah, bh, ch
            saddu4    al, bl, cl
            stdw    ch:cl, *vec_out++
    [count]    sub        count, 1, count
    [count]    b        loop
   
            .return
            .endpro
c

 

Here is the C++ equivalent. But it runs at only ~4 cycles/element, which is really slow. Something's definitely wrong here. Could you suggest any modifications here as well?

    inline void IntrinsicSADD
    (
        const uint8    * restrict inVector1,
        const uint8    * restrict inVector2,
        const int      inLength,
        uint8        * restrict outVector
    )
    {
        for(int i = 0; i < inLength; ++i)
        {
            uint32 out_hi = _saddu4( _hi( _amemd8_const( &inVector1[i] ) ), _hi( _amemd8_const( &inVector2[i] ) ) );
            uint32 out_lo = _saddu4( _lo( _amemd8_const( &inVector1[i] ) ), _lo( _amemd8_const( &inVector2[i] ) ) );
            _amemd8( &outVector[i] ) = _itod( out_hi, out_lo ); \
        }
    }

 

  • What is the target processor?
    What version of Code Generation tools are you using?
    Have you enabled assembly listing files and optimization comments?
    What is the full command line that you are using for the compiler and for the assembler?
    What are your constraints on inLength/count?

  • The target processor is c6414 (so I use -mv6400)

    The CGT version is 5.1.12 (I cannot use a newer one, because the DSP is part of another device which does not work well with programs compiled by a newer CGT)

    inLength will be a multiple of 16.

    The command line is (prog.cpp is just a simple program that calls the function to test its speed; changing the entry point is required by the manufacturer of the device with this DSP)

    cl6x -dCHIP_6414 -O3 -mh24  -s -mv6400 prog.cpp func.sa -z -u _c_int01 Sample.cmd -o prog.out

     

    The Sample.cmd looks as follows:

    /* Sample.cmd: linker options (library search path, libraries, entry symbol,
       stack and heap sizes), followed by the memory map and section placement. */
    -i C:\TI\C6000\CGTOOLS\LIB
    -ar
    --disable_auto_rts
    -l C:\C6xCSL\lib_3x\csl6414.lib
    -l vcrt4.lib
    -l vclib.lib
    -l flib.lib
    -l C:\CGT\lib\rts6400.lib
    /* _c_int01 is referenced with -u and selected as the entry point with -e;
       the same -u _c_int01 is also passed on the cl6x command line. */
    -u _c_int01
    -e _c_int01
    -stack 0x400000 /* adjust appropriate - stack size: minimum = 0x4000 maximum = depends on camera max mem */
    -heap  0x400000  /* adjust appropriate - heap size : minimum = 0x400  maximum = depends on camera max mem */

    /* A single memory region; every output section below is allocated into it. */
    MEMORY
    {
        PMEM:   o = 0h       l = 0ffffffffh
    }

    /* Section placement: .text is aligned to 32, .pinit and .switch to 4,
       all remaining sections to 8. */
    SECTIONS
    {
    .text  : ALIGN(32) { *(.text) } > PMEM
    .const : ALIGN(8) {} > PMEM
    .data  : ALIGN(8) {} > PMEM
    .bss   : ALIGN(8) { *(.bss)   } > PMEM
    .cinit : ALIGN(8) { *(.cinit) } > PMEM /* cflag option only */
    .pinit : ALIGN(4) {} > PMEM /* cflag option only */
    .stack : ALIGN(8) {} > PMEM /* cflag option only */
    .far   : ALIGN(8) {} > PMEM /* cflag option only */
    .sysmem: ALIGN(8) {} > PMEM /* cflag option only */
    .switch: ALIGN(4) {} > PMEM /* cflag option only */
    .cio   : ALIGN(8) {} > PMEM /* cflag option only */
    }