This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

Compiler/TMS320F28377D: Compiler generates non-optimal code for vector float -> int conversion

Part Number: TMS320F28377D

Tool/software: TI C/C++ Compiler

Compiling the code snippet:

/*
 * Convert a vector of floats to 16-bit integers.
 *
 * Reads num consecutive float32_t values starting at a, converts each one
 * with the __f32toi16r intrinsic, and writes the results to consecutive
 * int16_t slots starting at b.
 *
 *   a   - source array (read only)
 *   b   - destination array
 *   num - number of elements; NASSERT states the precondition num > 0
 */
void vec_f32toi16r( const float32_t * a, int16_t * b, int num )
{
    int i = num;

    NASSERT( num > 0 );
    while ( i > 0 )
    {
        /* one element per iteration: load, convert, store, advance both pointers */
        *b++ = ( int16_t )__f32toi16r( *a++ );
        i--;
    }
}

using the command line:

"C:/ti/ccsv6/tools/compiler/ti-cgt-c2000_16.9.3.LTS/bin/cl2000" -v28 -ml -mt --cla_support=cla1 --float_support=fpu32 --tmu_support=tmu0 --vcu_support=vcu2 -O4 --opt_for_speed=5 --fp_mode=relaxed --include_path="C:/ti/ccsv6/tools/compiler/ti-cgt-c2000_16.9.3.LTS/include" --advice:performance=all --define=_INLINE --symdebug:none --c99 --diag_warning=225 --diag_wrap=off --display_error_number --gen_func_subsections=on --sat_reassoc=on --asm_listing --src_interlist --gen_func_info_listing --gen_opt_info=1 --gen_preprocessor_listing --section_sizes=on --preproc_with_compile --preproc_dependency="cast.d"  "cast.c"

produces the assembly:

;***************************************************************
;* TMS320C2000 C/C++ Codegen                    PC v16.9.3.LTS *
;* Date/Time created: Mon Jul 31 13:30:47 2017                 *
;***************************************************************
 .compiler_opts --abi=coffabi --cla_support=cla1 --diag_wrap=off --float_support=fpu32 --hll_source=on --mem_model:code=flat --mem_model:data=large --object_format=coff --section_sizes=on --silicon_version=28 --symdebug:none --tmu_support=tmu0
 .asg XAR2, FP
; C:\ti\ccsv6\tools\compiler\ti-cgt-c2000_16.9.3.LTS\bin\opt2000.exe --gen_opt_info=1 C:\\Users\\rist-1\\AppData\\Local\\Temp\\{5E007277-930E-4FFF-89A3-E5D2264630F8} C:\\Users\\rist-1\\AppData\\Local\\Temp\\{E99D5E60-315B-4450-BAEF-20CF0C532130} --opt_info_filename=cast.nfo
; C:\ti\ccsv6\tools\compiler\ti-cgt-c2000_16.9.3.LTS\bin\ac2000.exe -@C:\\Users\\rist-1\\AppData\\Local\\Temp\\{71B923CF-4803-4321-8D4C-D9A0DD543BA3}
 .sect ".text:_vec_f32toi16r"
 .clink
 .global _vec_f32toi16r

;***************************************************************
;* FNAME: _vec_f32toi16r                FR SIZE:   0           *
;*                                                             *
;* FUNCTION ENVIRONMENT                                        *
;*                                                             *
;* FUNCTION PROPERTIES                                         *
;*                            0 Parameter,  0 Auto,  0 SOE     *
;***************************************************************

; _vec_f32toi16r -- compiler output for vec_f32toi16r( a, b, num ).
; Register roles as used below (matched against the interlisted C lines):
;   AL   = num  (compared with 2, then bit 0 tested for the odd tail)
;   XAR4 = a    (post-incremented 32-bit loads)
;   XAR5 = b    (post-incremented 16-bit stores)
; Structure: the loop is unrolled by 2 inside an RPTB repeat block, followed
; by one peeled iteration that runs only when bit 0 of num is set.
_vec_f32toi16r:
;*** 11 -----------------------    if ( num < 2 ) goto g4;
        CMPB      AL,#2                 ; [CPU_] |11|
        B         $C$L2,LT              ; [CPU_] |11|
        ; branchcc occurs ; [] |11|
;***   -----------------------    L$1 = (num>>1)-1;
;***   -----------------------    #pragma MUST_ITERATE(1, 16383, 1)
;***   -----------------------    // LOOP BELOW UNROLLED BY FACTOR(2)
;***   -----------------------    #pragma LOOP_FLAGS(4102u)
;*** -----------------------g3:
;*** 13 -----------------------    *b++ = __f32toi16r(*a++);
;*** 13 -----------------------    *b++ = __f32toi16r(*a++);
;*** 11 -----------------------    if ( (--L$1) != (-1) ) goto g3;
; AR6 = (num >> 1) - 1, computed in AH and passed to RPTB as its count operand.
        MOV       AH,AL                 ; [CPU_]
        ASR       AH,1                  ; [CPU_]
        ADDB      AH,#-1                ; [CPU_]
        MOVZ      AR6,AH                ; [CPU_]
        RPTB      $C$L2,AR6             ; [CPU_] |11|
        ; repeat block starts ; []
$C$L1:   
; Two elements per pass: R0H = first float, R1H = second float.
        MOV32     R0H,*XAR4++           ; [CPU_] |13|
        MOV32     R1H,*XAR4++           ; [CPU_] |13|
        F32TOI16R R3H,R1H               ; [CPU_] |13|
        F32TOI16R R0H,R0H               ; [CPU_] |13|
; NOTE(review): this NOP presumably covers FPU pipeline latency before the
; converted values are read -- confirm against the FPU instruction manual.
        NOP       ; [CPU_]
; Both results are copied from FPU registers into CPU registers (XAR7, P)
; and stored from there; the R0H result (first element) is stored first.
        MOV32     XAR7,R3H              ; [CPU_] |13|
        MOV32     P,R0H                 ; [CPU_] |13|
        MOV       *XAR5++,P             ; [CPU_] |13|
        MOV       *XAR5++,AR7           ; [CPU_] |13|
        ; repeat block ends ; []
$C$L2:   
;*** -----------------------g4:
;***   -----------------------    if ( !(num&1) ) goto g6;
; Also reached directly when num < 2; bit 0 of num decides whether one
; remaining element is converted.
        TBIT      AL,#0                 ; [CPU_]
        B         $C$L3,NTC             ; [CPU_]
        ; branchcc occurs ; []
; Peeled loop iterations for unrolled loop:
;*** 13 -----------------------    *b = __f32toi16r(*a);
;*** -----------------------g6:
;***   -----------------------    return;
        MOV32     R0H,*+XAR4[0]         ; [CPU_] |13|
        F32TOI16R R0H,R0H               ; [CPU_] |13|
; NOTE(review): two NOPs here versus one in the loop -- presumably latency
; required before MOV32 ACC,R0H can read the result; confirm in the manual.
        NOP       ; [CPU_]
        NOP       ; [CPU_]
; The result goes through ACC and its low half AL is stored.
        MOV32     ACC,R0H               ; [CPU_] |13|
        MOV       *+XAR5[0],AL          ; [CPU_] |13|
$C$L3:   
        LRETR     ; [CPU_]
        ; return occurs ; []

My issue is that the result of the F32TOI16R instruction is placed into a floating-point register, then moved to an integer register before being saved to memory. Why is a MOV16 instruction not emitted instead, producing an inner loop of:

        MOV32     R0H,*XAR4++           ; [CPU_] |13|
        MOV32     R1H,*XAR4++           ; [CPU_] |13|
        F32TOI16R R3H,R1H               ; [CPU_] |13|
        F32TOI16R R0H,R0H               ; [CPU_] |13|
        MOV16     *XAR5++,R3H           ; [CPU_] |13|
        MOV16     *XAR5++,R0H           ; [CPU_] |13|

Thanks