Tool/software: TI C/C++ Compiler
Hi All,
I'd like to improve some vector-matrix operations on a TMDSEVM6678LE via intrinsic instructions (SSE). I have already done it with Intel-based instructions for GCC, as you can see below; however, our main purpose is to apply SSE on a TMDSEVM6678LE.
Please let me know if you are aware of a document/tutorial.
I wasn't able to find any; I could only find the two below:
http://www.ti.com/lit/an/spraa14/spraa14.pdf
http://www.ti.com/lit/an/spra666/spra666.pdf?ts=1591768687912
Regards
#include <stdlib.h> #include <stdio.h> #include <string.h> #include <assert.h> #include <pmmintrin.h> #ifndef __SSE3__ #error This example requires SSE3 #endif #include "util.h" /* Size of the matrices to multiply */ #define SIZE 256 #define ITERS 100 #define XMM_ALIGNMENT_BYTES 16 static float mat_a[SIZE][SIZE] __attribute__((aligned (XMM_ALIGNMENT_BYTES))); static float mat_b[SIZE][SIZE] __attribute__((aligned (XMM_ALIGNMENT_BYTES))); static float mat_c[SIZE][SIZE] __attribute__((aligned (XMM_ALIGNMENT_BYTES))); static float mat_ref[SIZE][SIZE] __attribute__((aligned (XMM_ALIGNMENT_BYTES))); /** * Matrix multiplication. This is the procedure you should try to * optimize. */ static void matmul_sse() { for (i = 0; i < SIZE; i++) { for (j = 0; j < SIZE; j++) { float c_sum_float[4] = {0, 0, 0, 0}; __m128 c_sum = _mm_set_ps1(0.f); for (k = 0; k < SIZE; k+=4) { // load const __m128 a = _mm_load_ps(&(mat_a[i*SIZE+k])); const __m128 b = _mm_load_ps(&(mat_b[k*SIZE+j])); // mul const __m128 c = _mm_mul_ps(a, b); // sum & add c_sum = a + b; } _mm_storeu_ps(&c_sum_float[0], c_sum); mat_c[i][j] = c_sum_float[0] + c_sum_float[1] + c_sum_float[2] + c_sum_float[3]; } } }
Ple