AM263P4-Q1: Cortex-R5F code not compiling using VMLA.F32

Paul Thomas

Part Number: AM263P4-Q1
Other Parts Discussed in Thread: AM263P4

Tool/software:

Given the LLVM-based TI Arm compiler, version 4.0.4.LTS, for the Cortex-R5F in the AM263P4 (ti/ccs2031/ccs/tools/compiler/ti-cgt-armllvm_4.0.4.LTS/bin)

And given this simple code:

/* Coefficient set used by biquad_calc() for one filter section. */
typedef struct zDomainCoefficients {
    /* k[0], k[1], k[2] multiply input, x_n1 and x_n2; the products are added. */
    float k[3];
    /* j[1], j[2] multiply y_n1 and y_n2; the products are subtracted.
     * j[0] is not read by biquad_calc(). */
    float j[3];
} zDomainCoefficients;

/* Delay-line state and coefficients carried between calls to biquad_calc(). */
typedef struct biQuadStates {
    float x_n1; /* input from the previous call (set from biQuadDef.input) */
    float x_n2; /* input from two calls ago (set from x_n1) */
    float y_n1; /* output from the previous call (set from biQuadDef.output) */
    float y_n2; /* output from two calls ago (set from y_n1) */
    zDomainCoefficients coeffs; /* coefficients applied to the values above */
} biQuadStates;


/* One filter instance: the current sample in, the result out, and its state. */
typedef struct biQuadDef {
    float input;  /* read by biquad_calc(); the caller supplies the new sample */
    float output; /* written by biquad_calc() on every call */
    biQuadStates state; /* delay line and coefficients, updated by biquad_calc() */
} biQuadDef;

/*
 * Computes one output sample of a second-order filter section and advances
 * its delay state by one step.
 *
 * biquad: filter instance, read and modified in place. It is dereferenced
 *         without a NULL check, so it must point to a valid biQuadDef.
 *
 * Reads biquad->input, the four delay values and the coefficients; writes
 * biquad->output and the four delay values. Returns nothing.
 */
void biquad_calc(biQuadDef *const biquad)
{
    /*
     * output = k[0]*input + k[1]*x_n1 + k[2]*x_n2 - j[1]*y_n1 - j[2]*y_n2
     * j[0] does not appear in the expression.
     */
    biquad->output = biquad->state.coeffs.k[0] * biquad->input +
                     biquad->state.coeffs.k[1] * biquad->state.x_n1 +
                     biquad->state.coeffs.k[2] * biquad->state.x_n2 -
                     biquad->state.coeffs.j[1] * biquad->state.y_n1 -
                     biquad->state.coeffs.j[2] * biquad->state.y_n2;

    /*
     * Shift the delay lines. The older slot is written first so that x_n1 and
     * y_n1 are copied into x_n2 and y_n2 before they are overwritten.
     */
    biquad->state.x_n2 = biquad->state.x_n1;
    biquad->state.x_n1 = biquad->input;
    biquad->state.y_n2 = biquad->state.y_n1;
    biquad->state.y_n1 = biquad->output;
}

Compiled with this command:

tiarmclang -c -g -mcpu=cortex-r5 -mfloat-abi=hard -mfpu=vfpv3-d16 -mlittle-endian -mthumb -Ofast -ffast-math biquad_test.c -o biquad_test.o

I would expect to see VMLA.F32 instructions, but instead see this:

$ tiarmobjdump -S biquad_test.o

biquad_test.o:  file format elf32-littlearm

Disassembly of section .text.biquad_calc:

00000000 <biquad_calc>:
;                      biquad->state.coeffs.k[1] * biquad->state.x_n1 +
       0: ed90 0a02     vldr    s0, [r0, #8]
;     biquad->output = biquad->state.coeffs.k[0] * biquad->input +
       4: ed90 4a06     vldr    s8, [r0, #24]
;                      biquad->state.coeffs.k[1] * biquad->state.x_n1 +
       8: ed90 5a07     vldr    s10, [r0, #28]
;     biquad->output = biquad->state.coeffs.k[0] * biquad->input +
       c: ed90 6a00     vldr    s12, [r0]
;                      biquad->state.coeffs.k[2] * biquad->state.x_n2 -
      10: ed90 1a03     vldr    s2, [r0, #12]
;                      biquad->state.coeffs.j[1] * biquad->state.y_n1 -
      14: ed90 2a04     vldr    s4, [r0, #16]
;                      biquad->state.coeffs.j[2] * biquad->state.y_n2;
      18: ed90 3a05     vldr    s6, [r0, #20]
;                      biquad->state.coeffs.k[1] * biquad->state.x_n1 +
      1c: ee20 5a05     vmul.f32        s10, s0, s10
;                      biquad->state.coeffs.k[2] * biquad->state.x_n2 -
      20: ed90 7a08     vldr    s14, [r0, #32]
;     biquad->output = biquad->state.coeffs.k[0] * biquad->input +
      24: ee26 4a04     vmul.f32        s8, s12, s8
;                      biquad->state.coeffs.j[1] * biquad->state.y_n1 -
      28: edd0 0a0a     vldr    s1, [r0, #40]
;                      biquad->state.coeffs.j[2] * biquad->state.y_n2;
      2c: edd0 1a0b     vldr    s3, [r0, #44]
;     biquad->output = biquad->state.coeffs.k[0] * biquad->input +
      30: ee35 4a04     vadd.f32        s8, s10, s8
;     biquad->state.x_n1 = biquad->input;
      34: ed80 6a02     vstr    s12, [r0, #8]
;                      biquad->state.coeffs.k[2] * biquad->state.x_n2 -
      38: ee21 1a07     vmul.f32        s2, s2, s14
;     biquad->state.x_n2 = biquad->state.x_n1;
      3c: ed80 0a03     vstr    s0, [r0, #12]
      40: ee22 7a20     vmul.f32        s14, s4, s1
;     biquad->state.y_n2 = biquad->state.y_n1;
      44: ed80 2a05     vstr    s4, [r0, #20]
      48: ee23 3a21     vmul.f32        s6, s6, s3
;                      biquad->state.coeffs.k[1] * biquad->state.x_n1 +
      4c: ee34 1a01     vadd.f32        s2, s8, s2
      50: ee33 3a07     vadd.f32        s6, s6, s14
;                      biquad->state.coeffs.j[1] * biquad->state.y_n1 -
      54: ee31 1a43     vsub.f32        s2, s2, s6
;     biquad->output = biquad->state.coeffs.k[0] * biquad->input +
      58: ed80 1a01     vstr    s2, [r0, #4]
;     biquad->state.y_n1 = biquad->output;
      5c: ed80 1a04     vstr    s2, [r0, #16]
; }
      60: 4770          bx      lr

Any thoughts on why VMLA.F32 instructions aren't used? This is 24 instructions (not counting the final bx lr) for a 2nd-order filter; is this the generally accepted performance?

thanks,

Paul

4 months ago

0 Ki 4 months ago

TI__Guru**** 476051 points

Hello,

I have brought this thread to the attention of the compiler experts. They should be responding shortly.

Thanks

0 George Mock 4 months ago in reply to Ki

TI__Guru**** 252520 points

Thank you for bringing this to our attention, and for supplying a concise test case. I am able to reproduce the same behavior. I filed the issue EXT_EP-12949 to have this investigated. You are welcome to follow it with that link.

Thanks and regards,

-George

Arm-based microcontrollers

Arm-based microcontrollers forum

AM263P4-Q1: Cortex-R5F code not compiling using VMLA.F32