Tool/software:
Hello,
we are trying to verify that our TensorFlow delegates (built on Arago Linux with the EdgeAI toolkits, using the default OpenEmbedded build options provided by TI) actually use the "Advanced SIMD" instructions. To check this, we compiled the delegates (e.g. "libtidl_tfl_delegate.so"), verified that they work in general, and then disassembled them using
objdump --disassemble-all libtidl_tfl_delegate.so
We were expecting to see opcodes similar to those described here: ARM Cortex-A (ARMv7-A) Series Programmer's Guide
However, we didn't find any in the disassembly!
We then tried another program similar to:
#include "stdio.h" #include "arm_neon.h" int main { float a[4] = {1.0, 2.0, 3.0, 4.0}; float b[4] = {5.0, 6.0, 7.0, 8.0}; float res[4]; float32x4_t a_vec = vld1q_f32(a); float32x4_t b_vec = vld1q_f32(b); float32x4_t res_vec = vaddq_f32(a_vec, b_vec); vst1q_f32(result, res_vec); return 0; }
We compiled this on the target (both with and without the additional option "-march=armv8-a+simd") and verified that the program works correctly; however, in the disassemblies no "v.add" opcode seems to have been used (see the attached file for the disassembly of the relevant function).
How can we verify that the vector instructions are actually in use?
Kind regards,
Stefan Birkholz
0000000000400698 <main>: #include <stdio.h> #include <arm_neon.h> int main() { 400698: a9b37bfd stp x29, x30, [sp, #-208]! 40069c: 910003fd mov x29, sp // Initialize two float arrays float a[4] = {1.0, 2.0, 3.0, 4.0}; 4006a0: 1e2e1000 fmov s0, #1.000000000000000000e+00 4006a4: bd0033e0 str s0, [sp, #48] 4006a8: 1e201000 fmov s0, #2.000000000000000000e+00 4006ac: bd0037e0 str s0, [sp, #52] 4006b0: 1e211000 fmov s0, #3.000000000000000000e+00 4006b4: bd003be0 str s0, [sp, #56] 4006b8: 1e221000 fmov s0, #4.000000000000000000e+00 4006bc: bd003fe0 str s0, [sp, #60] float b[4] = {5.0, 6.0, 7.0, 8.0}; 4006c0: 1e229000 fmov s0, #5.000000000000000000e+00 4006c4: bd0023e0 str s0, [sp, #32] 4006c8: 1e231000 fmov s0, #6.000000000000000000e+00 4006cc: bd0027e0 str s0, [sp, #36] 4006d0: 1e239000 fmov s0, #7.000000000000000000e+00 4006d4: bd002be0 str s0, [sp, #40] 4006d8: 1e241000 fmov s0, #8.000000000000000000e+00 4006dc: bd002fe0 str s0, [sp, #44] 4006e0: 9100c3e0 add x0, sp, #0x30 4006e4: f90023e0 str x0, [sp, #64] objdump: Warning: source file /usr/lib/gcc/aarch64-oe-linux/13.3.0/include/arm_neon.h is more recent than object file __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_f32 (const float32_t *__a) { return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) __a); 4006e8: f94023e0 ldr x0, [sp, #64] 4006ec: 3dc00000 ldr q0, [x0] float result[4]; // Load arrays into NEON registers float32x4_t a_vec = vld1q_f32(a); 4006f0: 3d802fe0 str q0, [sp, #176] 4006f4: 910083e0 add x0, sp, #0x20 4006f8: f90027e0 str x0, [sp, #72] 4006fc: f94027e0 ldr x0, [sp, #72] 400700: 3dc00000 ldr q0, [x0] float32x4_t b_vec = vld1q_f32(b); 400704: 3d802be0 str q0, [sp, #160] 400708: 3dc02fe0 ldr q0, [sp, #176] 40070c: 3d801be0 str q0, [sp, #96] 400710: 3dc02be0 ldr q0, [sp, #160] 400714: 3d8017e0 str q0, [sp, #80] return __a + __b; 400718: 3dc01be1 ldr q1, [sp, #96] 40071c: 3dc017e0 ldr q0, [sp, #80] 400720: 4e20d420 
fadd v0.4s, v1.4s, v0.4s // Perform vector addition float32x4_t res_vec = vaddq_f32(a_vec, b_vec); 400724: 3d8027e0 str q0, [sp, #144] 400728: 910043e0 add x0, sp, #0x10 40072c: f90047e0 str x0, [sp, #136] 400730: 3dc027e0 ldr q0, [sp, #144] 400734: 3d801fe0 str q0, [sp, #112] __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_f32 (float32_t *__a, float32x4_t __b) { __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) __a, __b); 400738: 3dc01fe0 ldr q0, [sp, #112] 40073c: f94047e0 ldr x0, [sp, #136] 400740: 3d800000 str q0, [x0] } 400744: d503201f nop // Store the result back to memory vst1q_f32(result, res_vec); return 0; 40078c: 52800000 mov w0, #0x0 // #0 } 400790: a8cd7bfd ldp x29, x30, [sp], #208 400794: d65f03c0 ret