AM62A7: How can we verify that the Advanced SIMD instructions are actually used / disassembly doesn't seem to show relevant opcodes

Stefan Birkholz

Part Number: AM62A7

Tool/software:

Hello,

we are trying to ascertain that our Tensorflow-Delegates (using Arago-Linux with EdgeAI Toolkits; default build options for OpenEmbedded as provided by TI) actually use the "Advanced SIMD" instructions by compiling the delegates (e.g. "libtidl_tfl_delegate.so"), verifying its general usability and subsequent disassembly using

objdump --disassemble-all libtidl_tfl_delegate.so

We were expecting to see opcodes similar to those described here: ARM Cortex-A (ARMv7-A) Series Programmer's Guide
However, we didn't find any in the disassembly!

We then tried another program similar to:

#include "stdio.h"
#include "arm_neon.h"

/*
 * Minimal Advanced SIMD check: adds two 4-element float arrays lane-wise
 * using the NEON intrinsics from arm_neon.h and stores the sum to memory.
 * Returns 0 unconditionally.
 */
int main(void)
{
    float a[4] = {1.0, 2.0, 3.0, 4.0};
    float b[4] = {5.0, 6.0, 7.0, 8.0};
    float result[4];

    /* Load each array into a 4 x float32 vector. */
    float32x4_t a_vec = vld1q_f32(a);
    float32x4_t b_vec = vld1q_f32(b);

    /*
     * Lane-wise addition. In the attached AArch64 disassembly this appears
     * as "fadd v0.4s, v1.4s, v0.4s" -- the vector register operands are
     * what mark it as an Advanced SIMD instruction.
     */
    float32x4_t res_vec = vaddq_f32(a_vec, b_vec);

    /* Store the vector sum back into the output array. */
    vst1q_f32(result, res_vec);

    return 0;
}

We compiled this on the target (with added "-march=armv8-a+simd" and without), ascertained that the program works correctly; however, in the disassemblies no "v.add" opcode seems to have been used (see attached file for disassembly of relevant function).

How can we verify that the vector instructions are actually in use?

Kind regards,

Stefan Birkholz

Fullscreen arm-neon-disassembly.txt Download

0000000000400698 <main>:
#include <stdio.h>
#include <arm_neon.h>
int main() {
  400698:       a9b37bfd        stp     x29, x30, [sp, #-208]!
  40069c:       910003fd        mov     x29, sp
    // Initialize two float arrays
    float a[4] = {1.0, 2.0, 3.0, 4.0};
  4006a0:       1e2e1000        fmov    s0, #1.000000000000000000e+00
  4006a4:       bd0033e0        str     s0, [sp, #48]
  4006a8:       1e201000        fmov    s0, #2.000000000000000000e+00
  4006ac:       bd0037e0        str     s0, [sp, #52]
  4006b0:       1e211000        fmov    s0, #3.000000000000000000e+00
  4006b4:       bd003be0        str     s0, [sp, #56]
  4006b8:       1e221000        fmov    s0, #4.000000000000000000e+00
  4006bc:       bd003fe0        str     s0, [sp, #60]
    float b[4] = {5.0, 6.0, 7.0, 8.0};
  4006c0:       1e229000        fmov    s0, #5.000000000000000000e+00
  4006c4:       bd0023e0        str     s0, [sp, #32]
  4006c8:       1e231000        fmov    s0, #6.000000000000000000e+00
  4006cc:       bd0027e0        str     s0, [sp, #36]
  4006d0:       1e239000        fmov    s0, #7.000000000000000000e+00
  4006d4:       bd002be0        str     s0, [sp, #40]
  4006d8:       1e241000        fmov    s0, #8.000000000000000000e+00
  4006dc:       bd002fe0        str     s0, [sp, #44]
  4006e0:       9100c3e0        add     x0, sp, #0x30
  4006e4:       f90023e0        str     x0, [sp, #64]
objdump: Warning: source file /usr/lib/gcc/aarch64-oe-linux/13.3.0/include/arm_neon.h is more recent than object file

__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f32 (const float32_t *__a)
{
  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) __a);
  4006e8:       f94023e0        ldr     x0, [sp, #64]
  4006ec:       3dc00000        ldr     q0, [x0]
    float result[4];

    // Load arrays into NEON registers
    float32x4_t a_vec = vld1q_f32(a);
  4006f0:       3d802fe0        str     q0, [sp, #176]
  4006f4:       910083e0        add     x0, sp, #0x20
  4006f8:       f90027e0        str     x0, [sp, #72]
  4006fc:       f94027e0        ldr     x0, [sp, #72]
  400700:       3dc00000        ldr     q0, [x0]
    float32x4_t b_vec = vld1q_f32(b);
  400704:       3d802be0        str     q0, [sp, #160]
  400708:       3dc02fe0        ldr     q0, [sp, #176]
  40070c:       3d801be0        str     q0, [sp, #96]
  400710:       3dc02be0        ldr     q0, [sp, #160]
  400714:       3d8017e0        str     q0, [sp, #80]
  return __a + __b;
  400718:       3dc01be1        ldr     q1, [sp, #96]
  40071c:       3dc017e0        ldr     q0, [sp, #80]
  400720:       4e20d420        fadd    v0.4s, v1.4s, v0.4s

    // Perform vector addition
    float32x4_t res_vec = vaddq_f32(a_vec, b_vec);
  400724:       3d8027e0        str     q0, [sp, #144]
  400728:       910043e0        add     x0, sp, #0x10
  40072c:       f90047e0        str     x0, [sp, #136]
  400730:       3dc027e0        ldr     q0, [sp, #144]
  400734:       3d801fe0        str     q0, [sp, #112]

__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32 (float32_t *__a, float32x4_t __b)
{
  __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) __a, __b);
  400738:       3dc01fe0        ldr     q0, [sp, #112]
  40073c:       f94047e0        ldr     x0, [sp, #136]
  400740:       3d800000        str     q0, [x0]
}
  400744:       d503201f        nop

    // Store the result back to memory
    vst1q_f32(result, res_vec);

    return 0;
  40078c:       52800000        mov     w0, #0x0                        // #0
}
  400790:       a8cd7bfd        ldp     x29, x30, [sp], #208
  400794:       d65f03c0        ret

6 months ago

+1 Pekka Varis 6 months ago

TI__Mastermind 27050 points

Looks like you are trying out using Arm intrinsics, and not getting them used. I'd suggest to look at https://arm-software.github.io/acle/neon_intrinsics/advsimd.html. On quick glance at that

vld1q_f32 should map to LD1 {Vt.4S},[Xn]

And I agree your disassembly is not mapping to this, but falling back to portable code. Could be some other compiler flag or setting involved. There is nothing TI specific, we use a standard A53 core that is Armv8.0 compliant with all options turned on. I'm guessing you use gcc compiler? What intrinsics are supported is in https://gcc.gnu.org/onlinedocs/gcc/ARM-C-Language-Extensions-_0028ACLE_0029.html .

Probably what is going on is the arrays (a, b, res) are not forced to be aligned to 128-bit vectors. So the compiler decides to use normal instructions just because of the alignment. I think you have to use vector data types as well, float32x4_t instead of just float or the intrinsic won't map to the Advanced SIMD instruction. This is clear in the ACLE documentation, but for logical correctness it will not fail to compile, just falls back to non Advanced SIMD. I have not tried out your example modified this way, but this looks highly likely to be the reason.

Generally I would suggest to look at GCC, Arm (arm.com and github), and then perhaps stackoverflow for guidance on standard Arm and open source tools.

Specifically on TFLite delegates, we already include XNNPACK (Google Arm optimized neural network library, https://github.com/google/XNNPACK ) in /usr/lib/tflite_2.12/xnnpack-build . That or armnn (https://github.com/ARM-software/armnn ) probably already includes extremely well optimized version of the delegate you are thinking of.

Stefan Birkholz said:
We were expecting to see opcodes similar to those described here: ARM Cortex-A (ARMv7-A) Series Programmer's Guide

Note the Cortex-A53 core is an Armv8.0 architecture so the vector extensions are https://developer.arm.com/documentation/dui0801/l/Overview-of-the-Armv8-Architecture/Advanced-SIMD . Armv7-A NEON refers to the generation of vector extensions in Cortex-A15 and other 32-bit cores from a decade or so ago. In the SW tooling like gcc the term NEON is still used, but more accurate is Advanced SIMD for A53 generation.

Pekka

0 Stefan Birkholz 5 months ago in reply to Pekka Varis

Prodigy 30 points

Thank you for your input - we could resolve the issue when we found this (seemingly "official") document:

ARM Instruction Set Overview

according to which on ARMv8 architectures the operations (like "add") are overloaded for scalar and vector operands, hence the usage of the Advanced SIMD facilities is implied by the vector register (see e.g. line 55 in the disassembly above: "fadd v0.4s, v1.4s, v0.4s"). We found similar instructions in our Tensorflow setup (using both the tidl-delegate and XNNPACK) and this resolves our issue.

For completeness' sake: We compiled the binaries with the "-march=armv8-a+simd" switch, but this seems superfluous.

Stefan

Processors

Processors forum

AM62A7: How can we verify that the Advanced SIMD instructions are actually used / disassembly doesn't seem to show relevant opcodes