Tool/software: TI C/C++ Compiler
I am attempting to write fast code in C rather than assembler and have stumbled across the compiler not emitting MOV32 RaH, RbH {, CNDF} or MOV32 RaH, mem32 {, CNDF} instructions when I expect it to.
Given the file test.c:
#include <math.h>
/*
 * Select the argument with the strictly larger magnitude.
 * Returns a when fabs(a) > fabs(b); otherwise (including ties and
 * unordered comparisons) returns b.
 */
double test_double(double a, double b) {
    if (fabs(a) > fabs(b)) {
        return a;
    }
    return b;
}
/*
 * Add a and b, returning the rounded sum.
 * Also stores b - (sum - a) through err, i.e. the part of b that did not
 * make it into the returned sum.
 * NOTE(review): presumably callers pass |a| >= |b| so that err is the exact
 * rounding error - confirm against callers.
 */
static inline double quick_two_sum(double a, double b, double *err) {
    const double sum = a + b;
    const double b_absorbed = sum - a; /* portion of the sum contributed by b */

    *err = b - b_absorbed;
    return sum;
}
/*
 * Add the scalar b to the two-element value a[0], a[1] and store the
 * two-element result in c[0], c[1].
 * NOTE(review): the name suggests a and c are double-double (hi, lo)
 * pairs - confirm against callers.
 *
 * a: input pair, read only (a[0] and a[1]).
 * b: scalar addend.
 * c: output pair; c[0] receives the final sum, c[1] the final error term.
 */
void c_dd_add_dd_d(const double *a, double b, double * restrict c) {
    const double head = a[0];
    /* Order the operands so the one with the strictly larger magnitude
     * is passed first to quick_two_sum; on ties b goes first. */
    const int head_is_larger = fabs(head) > fabs(b);
    const double larger = head_is_larger ? head : b;
    const double lesser = head_is_larger ? b : head;
    double err;
    double sum;

    sum = quick_two_sum(larger, lesser, &err);
    err += a[1]; /* fold the second input element into the error term */
    sum = quick_two_sum(sum, err, &err);

    c[0] = sum;
    c[1] = err;
}
and the command:
"C:/ti/ccsv6/tools/compiler/ti-cgt-c2000_17.3.0.STS/bin/cl2000" -v28 -ml -mt --cla_support=cla1 --float_support=fpu32 --tmu_support=tmu0 --vcu_support=vcu2 -O4 --opt_for_speed=5 --fp_mode=strict --fp_reassoc=off --include_path="C:/ti/ccsv6/tools/compiler/ti-cgt-c2000_17.3.0.STS/include" --symdebug:none --diag_warning=225 --diag_wrap=off --display_error_number --src_interlist --preproc_with_compile --preproc_dependency="test.d" "../test.c"
I end up with the following assembly (trimmed for brevity):
_test_double:
;*** 4 ----------------------- return (ABS(a) > ABS(b)) ? a : b;
ADDB SP,#2 ; [CPU_U]
MOV32 R2H,R1H ; [CPU_] |4|
MOV32 R3H,R0H ; [CPU_] |4|
SUBB SP,#2 ; [CPU_U] |4|
ABSF32 R2H,R2H ; [CPU_] |4|
ABSF32 R3H,R3H ; [CPU_] |4|
CMPF32 R3H,R2H ; [CPU_] |4|
MOVST0 ZF, NF ; [CPU_] |4|
MOV32 R1H,R0H,GT ; [CPU_] |4|
MOV32 R0H,R1H ; [CPU_] |4|
LRETR ; [CPU_]
_c_dd_add_dd_d:
;*** 16 ----------------------- U$3 = *a;
;*** 16 ----------------------- if ( ABS(U$3) > ABS(b) ) goto g3;
ADDB SP,#2 ; [CPU_U]
MOVL ACC,*+XAR4[0] ; [CPU_] |16|
MOV32 R2H,ACC ; [CPU_] |16|
NOP ; [CPU_]
NOP ; [CPU_]
MOV32 R1H,R0H ; [CPU_] |16|
ABSF32 R1H,R1H ; [CPU_] |16|
ABSF32 R2H,R2H ; [CPU_] |16|
CMPF32 R2H,R1H ; [CPU_] |16|
MOVST0 ZF, NF ; [CPU_] |16|
B $C$L1,GT ; [CPU_] |16|
; branchcc occurs ; [] |16|
;*** 20 ----------------------- s1 = b;
;*** 21 ----------------------- s2 = U$3;
;*** 21 ----------------------- goto g4;
MOV32 R1H,R0H ; [CPU_] |20|
MOV32 R0H,ACC ; [CPU_] |21|
B $C$L2,UNC ; [CPU_] |21|
; branch occurs ; [] |21|
$C$L1:
;*** -----------------------g3:
;*** 17 ----------------------- s1 = U$3;
;*** 18 ----------------------- s2 = b;
MOV32 R1H,ACC ; [CPU_] |17|
NOP ; [CPU_]
NOP ; [CPU_]
NOP ; [CPU_]
NOP ; [CPU_]
$C$L2:
;*** -----------------------g4:
;*** 8 ----------------------- s = s1+s2; // [2]
;*** 9 ----------------------- s2 = s2-(s-s1)+a[1]; // [2]
;*** 8 ----------------------- s = s+s2; // [2]
;*** 9 ----------------------- s2 -= s-s; // [2]
;*** 28 ----------------------- *c = s;
;*** 29 ----------------------- c[1] = s2;
;*** ----------------------- return;
ADDF32 R2H,R0H,R1H ; [CPU_] |8|
NOP ; [CPU_]
SUBF32 R1H,R2H,R1H ; [CPU_] |9|
NOP ; [CPU_]
SUBF32 R0H,R0H,R1H ; [CPU_] |9|
MOV32 R3H,*+XAR4[2] ; [CPU_] |9|
ADDF32 R0H,R0H,R3H ; [CPU_] |9|
NOP ; [CPU_]
ADDF32 R1H,R0H,R2H ; [CPU_] |8|
NOP ; [CPU_]
SUBF32 R2H,R1H,R2H ; [CPU_] |9|
SUBB SP,#2 ; [CPU_U]
SUBF32 R0H,R0H,R2H ; [CPU_] |9|
MOV32 *+XAR5[0],R1H ; [CPU_] |28|
MOV32 *+XAR5[2],R0H ; [CPU_] |29|
LRETR ; [CPU_]
In _test_double, the MOVST0 ZF, NF instruction that copies the floating-point status flags to the integer status flags is redundant, since the subsequent conditional MOV32 tests the floating-point ZF and NF flags directly.
In _c_dd_add_dd_d a costly branch is used rather than conditional moves.
In _c_dd_add_dd_d there is a costly use of ACC to avoid pushing and popping a floating-point register.
Am I missing something here? Is there a reason conditional moves are not being used in this situation?
Why is ACC being used as a temporary register when it results in a cycle penalty? (I am compiling at optimisation level 4 with --opt_for_speed=5.)