Part Number: AM263P4-Q1
Other Parts Discussed in Thread: AM263P4
Tool/software:
Given the LLVM-based 4.0.4 LTS compiler for the Cortex-R5F in the AM263P4 (ti/ccs2031/ccs/tools/compiler/ti-cgt-armllvm_4.0.4.LTS/bin)
And given this simple code:
/*
 * Coefficient set used by biquad_calc().
 *   k[0..2] scale the current input and the two stored inputs.
 *   j[1..2] scale the two stored outputs and are subtracted.
 *   j[0] is not read by biquad_calc().
 */
typedef struct zDomainCoefficients {
    float k[3];
    float j[3];
} zDomainCoefficients;

/*
 * Per-filter memory: the two most recent inputs (x_n1 newest, x_n2 oldest),
 * the two most recent outputs (y_n1 newest, y_n2 oldest), and the
 * coefficients applied to them.
 */
typedef struct biQuadStates {
    float x_n1;
    float x_n2;
    float y_n1;
    float y_n2;
    zDomainCoefficients coeffs;
} biQuadStates;

/*
 * One filter instance: the caller writes `input`, calls biquad_calc(),
 * and reads the result from `output`.
 */
typedef struct biQuadDef {
    float input;
    float output;
    biQuadStates state;
} biQuadDef;

/*
 * Advance the filter by one sample.
 *
 * Computes
 *   output = k[0]*input + k[1]*x_n1 + k[2]*x_n2 - j[1]*y_n1 - j[2]*y_n2
 * and then shifts the delay line so that the current input/output become
 * x_n1/y_n1 and the former x_n1/y_n1 become x_n2/y_n2.
 *
 * biquad: filter instance to update in place; must point to a valid object.
 */
void biquad_calc(biQuadDef *const biquad)
{
    biQuadStates *const st = &biquad->state;
    const float *const k = st->coeffs.k;
    const float *const j = st->coeffs.j;

    /* Snapshot every operand before any state is overwritten. */
    const float x_now = biquad->input;
    const float x_prev1 = st->x_n1;
    const float x_prev2 = st->x_n2;
    const float y_prev1 = st->y_n1;
    const float y_prev2 = st->y_n2;

    /*
     * Deliberately a single expression with left-to-right association:
     * splitting it into separate statements could change how the compiler
     * rounds or contracts the multiply/add pairs.
     */
    const float y_now = k[0] * x_now +
                        k[1] * x_prev1 +
                        k[2] * x_prev2 -
                        j[1] * y_prev1 -
                        j[2] * y_prev2;

    biquad->output = y_now;

    /* Age the delay line by one sample. */
    st->x_n2 = x_prev1;
    st->x_n1 = x_now;
    st->y_n2 = y_prev1;
    st->y_n1 = y_now;
}
Compiled with this command:
tiarmclang -c -g -mcpu=cortex-r5 -mfloat-abi=hard -mfpu=vfpv3-d16 -mlittle-endian -mthumb -Ofast -ffast-math biquad_test.c -o biquad_test.o
I would expect to see VMLA.F32 instructions, but instead I see this:
$ tiarmobjdump -S biquad_test.o
biquad_test.o: file format elf32-littlearm
Disassembly of section .text.biquad_calc:
00000000 <biquad_calc>:
; biquad->state.coeffs.k[1] * biquad->state.x_n1 +
0: ed90 0a02 vldr s0, [r0, #8]
; biquad->output = biquad->state.coeffs.k[0] * biquad->input +
4: ed90 4a06 vldr s8, [r0, #24]
; biquad->state.coeffs.k[1] * biquad->state.x_n1 +
8: ed90 5a07 vldr s10, [r0, #28]
; biquad->output = biquad->state.coeffs.k[0] * biquad->input +
c: ed90 6a00 vldr s12, [r0]
; biquad->state.coeffs.k[2] * biquad->state.x_n2 -
10: ed90 1a03 vldr s2, [r0, #12]
; biquad->state.coeffs.j[1] * biquad->state.y_n1 -
14: ed90 2a04 vldr s4, [r0, #16]
; biquad->state.coeffs.j[2] * biquad->state.y_n2;
18: ed90 3a05 vldr s6, [r0, #20]
; biquad->state.coeffs.k[1] * biquad->state.x_n1 +
1c: ee20 5a05 vmul.f32 s10, s0, s10
; biquad->state.coeffs.k[2] * biquad->state.x_n2 -
20: ed90 7a08 vldr s14, [r0, #32]
; biquad->output = biquad->state.coeffs.k[0] * biquad->input +
24: ee26 4a04 vmul.f32 s8, s12, s8
; biquad->state.coeffs.j[1] * biquad->state.y_n1 -
28: edd0 0a0a vldr s1, [r0, #40]
; biquad->state.coeffs.j[2] * biquad->state.y_n2;
2c: edd0 1a0b vldr s3, [r0, #44]
; biquad->output = biquad->state.coeffs.k[0] * biquad->input +
30: ee35 4a04 vadd.f32 s8, s10, s8
; biquad->state.x_n1 = biquad->input;
34: ed80 6a02 vstr s12, [r0, #8]
; biquad->state.coeffs.k[2] * biquad->state.x_n2 -
38: ee21 1a07 vmul.f32 s2, s2, s14
; biquad->state.x_n2 = biquad->state.x_n1;
3c: ed80 0a03 vstr s0, [r0, #12]
40: ee22 7a20 vmul.f32 s14, s4, s1
; biquad->state.y_n2 = biquad->state.y_n1;
44: ed80 2a05 vstr s4, [r0, #20]
48: ee23 3a21 vmul.f32 s6, s6, s3
; biquad->state.coeffs.k[1] * biquad->state.x_n1 +
4c: ee34 1a01 vadd.f32 s2, s8, s2
50: ee33 3a07 vadd.f32 s6, s6, s14
; biquad->state.coeffs.j[1] * biquad->state.y_n1 -
54: ee31 1a43 vsub.f32 s2, s2, s6
; biquad->output = biquad->state.coeffs.k[0] * biquad->input +
58: ed80 1a01 vstr s2, [r0, #4]
; biquad->state.y_n1 = biquad->output;
5c: ed80 1a04 vstr s2, [r0, #16]
; }
60: 4770 bx lr
Any thoughts on why VMLA.F32 instructions aren't used? This is 24 instructions for a 2nd-order filter; is this the generally accepted performance?
Thanks,
Paul