This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

Compiler/TMS320F28377D: Accessing the I16TOF32 instruction in C

Part Number: TMS320F28377D


Tool/software: TI C/C++ Compiler

I want to access the I16TOF32 instruction in C. An int-to-float cast results in a load to ACC to perform a sign extension from 16 to 32 bits, a move to a floating-point register, then a conversion to float using I32TOF32. Below is a snippet of assembly output by the C2000 16.9.2.LTS compiler at full optimisation level (it has unrolled the loop by a factor of 2).

MOVB XAR6,#24

MOVL XAR5,#_b
MOVL XAR4,#_a
SETC SXM
RPTB $C$L2,AR6
; repeat block starts
MOV ACC,*XAR4++
MOV32 R0H,ACC 
MOV ACC,*XAR4++
MOV32 R3H,ACC 
NOP
NOP
NOP
I32TOF32 R1H,R0H
I32TOF32 R0H,R3H
MOV32 *XAR5++,R1H
MOV32 *XAR5++,R0H
; repeat block ends
$C$L2:

This compiler-generated code is too slow for my use case (11 cycles per 2 outputs, vs 2 cycles per 2 outputs for hand-coded pipelined assembly). Is there a way to force the compiler to emit an I16TOF32 without using inline assembly (as it turns off some/most optimisations) and without resorting to re-writing my algorithms in assembly?

  • The compiler is capable of generating I16TOF32, as shown by the snippet below. I don't see why it isn't being generated for your loop, as it is clearly doing 16-bit loads. Could you please show me a compilable C test case that demonstrates the problem? Please include the complete command-line options. Also, what version of the compiler are you using? (This is not the same as the CCS version.)

    % cat foo.c
    float stof(short *x) { return *x; }
    % cl2000 --float_support=fpu32 -O0 -k foo.c
    % grep F32 foo.asm
    I16TOF32 R0H,*+XAR4[0]
  • I am using C2000 compiler v16.9.2.LTS. I am primarily interested in vector operations, hence the tight loops.

     

    % cat i16tof32_v_attempt1.c

    void i16tof32(float *y, const short *x, short num) { while (num--) *y++ = (float)*x++; }

    % "C:/ti/ccsv6/tools/compiler/ti-cgt-c2000_16.9.2.LTS/bin/cl2000" -v28 -ml -mt --cla_support=cla1 --float_support=fpu32 --tmu_support=tmu0 --vcu_support=vcu2 -O3 --opt_for_speed=5 --fp_mode=relaxed --include_path="C:/ti/ccsv6/tools/compiler/ti-cgt-c2000_16.9.2.LTS/include" --symdebug:none --diag_warning=225 --diag_wrap=off --display_error_number -k --src_interlist "../i16tof32_v_attempt1.c"

    % cat i16tof32_v_attempt1.asm

    ;***************************************************************

    ;* TMS320C2000 C/C++ Codegen                    PC v16.9.2.LTS *

    ;* Date/Time created: Tue Apr 04 11:34:07 2017                 *

    ;***************************************************************

    .compiler_opts --abi=coffabi --cla_support=cla1 --diag_wrap=off --float_support=fpu32 --hll_source=on --mem_model:code=flat --mem_model:data=large --object_format=coff --silicon_version=28 --symdebug:none --tmu_support=tmu0

    .asg XAR2, FP

    ; C:\ti\ccsv6\tools\compiler\ti-cgt-c2000_16.9.2.LTS\bin\opt2000.exe C:\\Users\\rist-1\\AppData\\Local\\Temp\\040042 C:\\Users\\rist-1\\AppData\\Local\\Temp\\040044

    ; C:\ti\ccsv6\tools\compiler\ti-cgt-c2000_16.9.2.LTS\bin\ac2000.exe -@C:\\Users\\rist-1\\AppData\\Local\\Temp\\0400412

    .sect ".text"

    .clink

    .global _i16tof32_v_attempt1

    ;***************************************************************

    ;* FNAME: _i16tof32_v_attempt1          FR SIZE:   0           *

    ;*                                                             *

    ;* FUNCTION ENVIRONMENT                                        *

    ;*                                                             *

    ;* FUNCTION PROPERTIES                                         *

    ;*                            0 Parameter,  0 Auto,  0 SOE     *

    ;***************************************************************

    _i16tof32_v_attempt1:

    ;*** 3 -----------------------    if ( !num ) goto g7;

           CMPB      AL,#0                 ; [CPU_] |3|

           B         $C$L3,EQ              ; [CPU_] |3|

           ; branchcc occurs ; [] |3|

    ;*** 3 -----------------------    d$1 = num&1;

    ;*** 3 -----------------------    if ( num < 2 ) goto g5;

           AND       AH,AL,#0x0001         ; [CPU_] |3|

           CMPB      AL,#2                 ; [CPU_] |3|

           B         $C$L2,LT              ; [CPU_] |3|

           ; branchcc occurs ; [] |3|

    ;***   -----------------------    L$1 = (num>>1)-1;

    ;***   -----------------------    #pragma MUST_ITERATE(1, 16383, 1)

    ;***   -----------------------    // LOOP BELOW UNROLLED BY FACTOR(2)

    ;***   -----------------------    #pragma LOOP_FLAGS(4102u)

    ;*** -----------------------g4:

    ;*** 3 -----------------------    *y++ = (float)*x++;

    ;*** 3 -----------------------    *y++ = (float)*x++;

    ;*** 3 -----------------------    if ( (--L$1) != (-1) ) goto g4;

           ASR       AL,1                  ; [CPU_]

           ADDB      AL,#-1                ; [CPU_]

           MOVZ      AR6,AL                ; [CPU_]

           RPTB      $C$L2,AR6             ; [CPU_] |3|

           ; repeat block starts ; []

    $C$L1:    

           MOVX      TL,*XAR5++            ; [CPU_] |3|

           MOV32     R0H,XT                ; [CPU_] |3|

           MOVX      TL,*XAR5++            ; [CPU_] |3|

           MOV32     R3H,XT                ; [CPU_] |3|

           NOP       ; [CPU_]

           NOP       ; [CPU_]

           NOP       ; [CPU_]

           I32TOF32  R1H,R0H               ; [CPU_] |3|

           I32TOF32  R0H,R3H               ; [CPU_] |3|

           MOV32     *XAR4++,R1H           ; [CPU_] |3|

           MOV32     *XAR4++,R0H           ; [CPU_] |3|

           ; repeat block ends ; []

    $C$L2:    

    ;*** -----------------------g5:

    ;***   -----------------------    if ( d$1 <= 0 ) goto g7;

           CMPB      AH,#0                 ; [CPU_]

           B         $C$L3,LEQ             ; [CPU_]

           ; branchcc occurs ; []

    ; Peeled loop iterations for unrolled loop:

    ;*** 3 -----------------------    *y = (float)*x;

    ;*** -----------------------g7:

    ;***   -----------------------    return;

           I16TOF32  R0H,*+XAR5[0]         ; [CPU_] |3|

           NOP       ; [CPU_]

           MOV32     *+XAR4[0],R0H         ; [CPU_] |3|

    $C$L3:    

           LRETR     ; [CPU_]

           ; return occurs ; []

    Notice how the compiler emits the correct instruction for the possible trailing input.

    I also tested your suggestion of dereferencing the pointer within a function, with startling results:

    % cat i16tof32_v_attempt2.c

    /* Convert the 16-bit signed integer that x points to into a float. */
    static inline float stof1(const short *x) { return (float)*x; }

    /* Convert num 16-bit signed integers starting at x into floats stored starting at y. */
    void i16tof32_v_attempt2(float *y, const short *x, short num) { while (num--) *y++ = stof1(x++); }

    % "C:/ti/ccsv6/tools/compiler/ti-cgt-c2000_16.9.2.LTS/bin/cl2000" -v28 -ml -mt --cla_support=cla1 --float_support=fpu32 --tmu_support=tmu0 --vcu_support=vcu2 -O3 --opt_for_speed=5 --fp_mode=relaxed --include_path="C:/ti/ccsv6/tools/compiler/ti-cgt-c2000_16.9.2.LTS/include" --symdebug:none --diag_warning=225 --diag_wrap=off --display_error_number -k --src_interlist "../i16tof32_v_attempt2.c"

    % cat i16tof32_v_attempt2.asm

    ;***************************************************************

    ;* TMS320C2000 C/C++ Codegen                    PC v16.9.2.LTS *

    ;* Date/Time created: Tue Apr 04 11:34:17 2017                 *

    ;***************************************************************

    .compiler_opts --abi=coffabi --cla_support=cla1 --diag_wrap=off --float_support=fpu32 --hll_source=on --mem_model:code=flat --mem_model:data=large --object_format=coff --silicon_version=28 --symdebug:none --tmu_support=tmu0

    .asg XAR2, FP

    ; C:\ti\ccsv6\tools\compiler\ti-cgt-c2000_16.9.2.LTS\bin\opt2000.exe C:\\Users\\rist-1\\AppData\\Local\\Temp\\093762 C:\\Users\\rist-1\\AppData\\Local\\Temp\\093764

    ; C:\ti\ccsv6\tools\compiler\ti-cgt-c2000_16.9.2.LTS\bin\ac2000.exe -@C:\\Users\\rist-1\\AppData\\Local\\Temp\\0937612

    .sect ".text"

    .clink

    .global _i16tof32_v_attempt2

    ;***************************************************************

    ;* FNAME: _i16tof32_v_attempt2          FR SIZE:   0           *

    ;*                                                             *

    ;* FUNCTION ENVIRONMENT                                        *

    ;*                                                             *

    ;* FUNCTION PROPERTIES                                         *

    ;*                            0 Parameter,  0 Auto,  0 SOE     *

    ;***************************************************************

    _i16tof32_v_attempt2:

    ;*** 5 -----------------------    if ( !num ) goto g7;

           CMPB      AL,#0                 ; [CPU_] |5|

           B         $C$L3,EQ              ; [CPU_] |5|

           ; branchcc occurs ; [] |5|

    ;*** 5 -----------------------    d$1 = num&1;

    ;*** 5 -----------------------    if ( num < 2 ) goto g5;

           AND       AH,AL,#0x0001         ; [CPU_] |5|

           CMPB      AL,#2                 ; [CPU_] |5|

           B         $C$L2,LT              ; [CPU_] |5|

           ; branchcc occurs ; [] |5|

    ;***   -----------------------    L$1 = (num>>1)-1;

    ;***   -----------------------    #pragma MUST_ITERATE(1, 16383, 1)

    ;***   -----------------------    // LOOP BELOW UNROLLED BY FACTOR(2)

    ;***   -----------------------    #pragma LOOP_FLAGS(4102u)

    ;*** -----------------------g4:

    ;*** 1 -----------------------    *y++ = (float)*x++;  // [0]

    ;*** 1 -----------------------    x = x;  // [0]

    ;*** 1 -----------------------    ++x;  // [0]

    ;*** 1 -----------------------    *y++ = (float)*x;  // [0]

    ;*** 5 -----------------------    if ( (--L$1) != (-1) ) goto g4;

           ASR       AL,1                  ; [CPU_]

           ADDB      AL,#-1                ; [CPU_]

           MOVZ      AR6,AL                ; [CPU_]

           RPTB      $C$L2,AR6             ; [CPU_] |5|

           ; repeat block starts ; []

    $C$L1:    

           MOVX      TL,*XAR5++            ; [CPU_] |1|

           MOV32     R0H,XT                ; [CPU_] |1|

           NOP       ; [CPU_]

           NOP       ; [CPU_]

           NOP       ; [CPU_]

           NOP       ; [CPU_]

           I32TOF32  R1H,R0H               ; [CPU_] |1|

           NOP       ; [CPU_]

           MOV32     *XAR4++,R1H           ; [CPU_] |1|

           MOV32     *XAR4++,R0H           ; [CPU_] |1|

           ; repeat block ends ; []

    $C$L2:    

    ;*** -----------------------g5:

    ;***   -----------------------    if ( d$1 <= 0 ) goto g7;

           CMPB      AH,#0                 ; [CPU_]

           B         $C$L3,LEQ             ; [CPU_]

           ; branchcc occurs ; []

    ; Peeled loop iterations for unrolled loop:

    ;*** 1 -----------------------    *y = (float)*x;  // [0]

    ;*** -----------------------g7:

    ;***   -----------------------    return;

           I16TOF32  R0H,*+XAR5[0]         ; [CPU_] |1|

           NOP       ; [CPU_]

           MOV32     *+XAR4[0],R0H         ; [CPU_] |1|

    $C$L3:    

           LRETR     ; [CPU_]

           ; return occurs ; []

    ;* Inlined function references:

    ;* [0] stof1

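    Notice how the assembly output of the second attempt is incorrect: each pass of the repeat block loads and converts only one input yet stores two outputs, and the second store (MOV32 *XAR4++,R0H) pushes the sign-extended integer still held in R0H to the output instead of a converted float. This is a bug that I will raise independently.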

    I was expecting the compiler to generate code more akin to the following (not tested but most of the way there):

    ;---------------------------------------------------------------
    ; void i16tof32_v_hand_optimised(float *y, const short *x, short num)
    ;
    ; Hand-pipelined int16 -> float32 vector conversion (still untested).
    ; In:    XAR4 = y (destination), XAR5 = x (source), AL = num
    ;        (same register roles as the compiler-generated listings)
    ; Uses:  AL, R0H, R1H, XAR4, XAR5, status flags
    ; Plan:  convert 1 element if bit 0 of num is set, 2 elements if
    ;        bit 1 is set, then 4 elements per repeat-block pass for
    ;        the remaining num >> 2 groups.
    ;---------------------------------------------------------------
            .global _i16tof32_v_hand_optimised
            .sect   ".text"
            .clink
    _i16tof32_v_hand_optimised:

            TBIT     @AL, #0                ; bit 0 set -> one odd element
            SB       skip_1, NTC
            I16TOF32 R0H, *XAR5++
            NOP                             ; one instruction between the convert and
                                            ; the store, as in the compiler's peeled
                                            ; iteration; AL must stay intact here so
                                            ; both paths reach skip_1 with the same count
            MOV32    *XAR4++, R0H

    skip_1:
            TBIT     @AL, #1                ; bit 1 set -> one pair of elements
            SB       init_loop, NTC
            I16TOF32 R0H, *XAR5++
            I16TOF32 R1H, *XAR5++           ; second convert covers the first one's latency
            MOV32    *XAR4++, R0H
            MOV32    *XAR4++, R1H

    init_loop:
            LSR      AL, #2                 ; AL = number of 4-element groups
            SB       end_loop, EQ           ; none left -> return

            ADDB     AL, #-1                ; RPTB runs the block (count + 1) times
            RPTB     end_loop, @AL

            I16TOF32 R0H, *XAR5++
            I16TOF32 R1H, *XAR5++
            MOV32    *XAR4++, R0H
            MOV32    *XAR4++, R1H
            I16TOF32 R0H, *XAR5++
            I16TOF32 R1H, *XAR5++
            MOV32    *XAR4++, R0H
            MOV32    *XAR4++, R1H
    end_loop:

            LRETR

  • Thank you for the test case.  This is a performance bug.  I've submitted CODEGEN-2179 to track this issue.

    With the problem fixed, the compiler will generate:

    _i16tof32:
            CMPB      AL,#0                
            B         $C$L3,EQ             
            ; branchcc occurs
            AND       AH,AL,#0x0001        
            CMPB      AL,#2                
            B         $C$L2,LT             
            ; branchcc occurs
            ASR       AL,1                 
            ADDB      AL,#-1               
            MOVZ      AR6,AL               
            RPTB      $C$L2,AR6            
            ; repeat block starts
    $C$L1:    
            I16TOF32  R1H,*XAR5++          
            I16TOF32  R0H,*XAR5++          
            MOV32     *XAR4++,R1H          
            MOV32     *XAR4++,R0H          
            NOP      
            ; repeat block ends
    $C$L2:    
            CMPB      AH,#0                
            B         $C$L3,LEQ            
            ; branchcc occurs
    ; Peeled loop iterations for unrolled loop:
            I16TOF32  R0H,*+XAR5[0]        
            NOP      
            MOV32     *+XAR4[0],R0H        
    $C$L3:    
            LRETR    
            ; return occurs