Compiler/TMS320F28069: Slow matrix calculations

Simon Lund29

Part Number: TMS320F28069
Other Parts Discussed in Thread: CONTROLSUITE

Tool/software: TI C/C++ Compiler

Hi,

I have programmed a Kalman Filter using floating points. The filter is used in a controller context and implemented with a 3. order system. The problem is that the filter use around 27000 instructions pr. run, meaning my control loop is significantly decreased to around 2kHz (would prefer around 20kHz).

This is my Kalman Filter script:

/*
 * Kalman.c
 *
 *  Created on: Sep 10, 2018
 *      Author: simon
 */

#include "Kalman.h"
#include "matrix_calc.h"
#include "../IO/SCI.h"
#include <stdio.h>

void predictStep(kalman_t *kalman, input_vector in_old);
states updateStep(kalman_t *kalman, measurement_vector z_measure);

states state = {0,0,0};
float x[N][M] = {{10,3,4}, {3,1,2}, {4,2,6}};
float y[N][M] = {{1,2,1}, {2,1,2}, {1,2,6}};
float z[N][M];
char str2[30];
size_t n = N;
size_t m = M;
size_t p = P;

void kalman_init(kalman_t *kalman, input_vector init_uk, states init_x, float (*init_p)[N])
{
    kalman->xk_old[0][0]= init_x.Torque;
    kalman->xk_old[1][0]= init_x.Field;
    kalman->xk_old[2][0]= init_x.Speed;

    int i,j;

    for(i=0;i<M;i++) {
        for(j=0;j<N;j++) {
            kalman->Pk_old[i][j] = init_p[i][j];
       }
    }

    kalman->A[0][0]  = 1;
    kalman->A[0][1]  = 0.1;
    kalman->A[0][2]  = 0.1;
    kalman->A[1][0]  = 0;
    kalman->A[1][1]  = 1;
    kalman->A[1][2]  = 1;
    kalman->A[2][0]  = 0;
    kalman->A[2][1]  = 1;
    kalman->A[2][2]  = 1;

    kalman->B[0][0]  = 1;
    kalman->B[0][1]  = 0.1;
    kalman->B[1][0]  = 0;
    kalman->B[1][1]  = 1;
    kalman->B[2][0]  = 0;
    kalman->B[2][1]  = 1;

    kalman->Rk[0][0]  = 1;
    kalman->Rk[0][1]  = 0;
    kalman->Rk[0][2]  = 0;
    kalman->Rk[1][0]  = 0;
    kalman->Rk[1][1]  = 1;
    kalman->Rk[1][2]  = 0;
    kalman->Rk[2][0]  = 0;
    kalman->Rk[2][1]  = 0;
    kalman->Rk[2][2]  = 1;

    kalman->Qk[0][0]  = 1;
    kalman->Qk[0][1]  = 0;
    kalman->Qk[0][2]  = 0;
    kalman->Qk[1][0]  = 0;
    kalman->Qk[1][1]  = 1;
    kalman->Qk[1][2]  = 0;
    kalman->Qk[2][0]  = 0;
    kalman->Qk[2][1]  = 0;
    kalman->Qk[2][2]  = 1;

    kalman->H[0][0]  = 1;
    kalman->H[0][1]  = 0.1;
    kalman->H[0][2]  = 0.1;
    kalman->H[1][0]  = 0;
    kalman->H[1][1]  = 1;
    kalman->H[1][2]  = 1;
    kalman->H[2][0]  = 0;
    kalman->H[2][1]  = 1;
    kalman->H[2][2]  = 1;

    matrix_transpose(n, n, kalman->A,kalman->AT);
    matrix_transpose(n, n, kalman->H,kalman->HT);
    matrix_identity(n, kalman->I);

    predictStep(kalman, init_uk);
}

states KalmanFilter(kalman_t *kalman, input_vector uk_old, measurement_vector zk, Uint8 event) {

    switch(event) {

        case NEW_OBSERVATION:
        {
            state = updateStep(kalman, zk);
            break;
        }

        case NEW_SAMPLE:
        {
            predictStep(kalman, uk_old);
            break;
        }

    }
    return state;
}

void predictStep(kalman_t *kalman, input_vector in_old) {
    kalman->uk_old[0][0] = in_old.vq;
    kalman->uk_old[1][0] = in_old.vd;

    //xk = A*xk_old + B*uk_old
    matrix_multiply(n, n, 1, kalman->A, kalman->xk_old, kalman->Axk_old);
    matrix_multiply(n, p, 1, kalman->B, kalman->uk_old, kalman->Buk_old);
    matrix_addition(n, 1, kalman->Axk_old,kalman->Buk_old,kalman->xk);

    //zk = H*x
    matrix_multiply(n, n, 1, kalman->H, kalman->xk, kalman->zk);

    //Pk = A*Pk_old*A^T + Qk
    matrix_multiply(n, n, n, kalman->A, kalman->Pk_old, kalman->APk_old);
    matrix_multiply(n, n, n, kalman->APk_old, kalman->AT, kalman->APk_oldAT);
    matrix_addition(n, n, kalman->APk_oldAT, kalman->Qk, kalman->Pk);
}

states updateStep(kalman_t *kalman, measurement_vector zk) {
    states state;

    //Load input struct into Kalman parameter
    kalman->zk_meas[0][0] = zk.id;
    kalman->zk_meas[1][0] = zk.iq;
    kalman->zk_meas[2][0] = zk.wt;

    //Kk = Pk*H^T*[H*Pk*H^T+Rk]^-1
    matrix_multiply(n, n, n, kalman->Pk, kalman->HT, kalman->PkHT);
    matrix_multiply(n, n, n, kalman->H, kalman->PkHT, kalman->HPkHT);
    matrix_addition(n, n, kalman->HPkHT, kalman->Qk, kalman->HPkHT_Rk);
    matrix_inverse(n, kalman->HPkHT_Rk, kalman->I_HPkHT_Rk);
    matrix_multiply(n, n, n, kalman->PkHT, kalman->I_HPkHT_Rk, kalman->Kk);

    //xk = xk + Kk(zk_meas-zk)
    matrix_subtraction(m, 1, kalman->zk_meas, kalman->zk, kalman->meas_error);
    matrix_multiply(n, m, 1, kalman->Kk, kalman->meas_error, kalman->Kkmeas_error);
    matrix_addition(n, 1, kalman->xk, kalman->Kkmeas_error, kalman->xk_old);

    //Pk = (I-Kk*H)*Pk
    matrix_multiply(n, n, n, kalman->Kk, kalman->H, kalman->KkH);
    matrix_subtraction(n, n, kalman->I, kalman->KkH, kalman->I_nKkH);
    matrix_multiply(n, n, n, kalman->I_nKkH, kalman->Pk, kalman->Pk_old);

    //Load states into output struct
    state.Torque= kalman->xk_old[0][0];
    state.Field = kalman->xk_old[1][0];
    state.Speed = kalman->xk_old[2][0];

    return state;
}

and this is my matrix calculation file:

/*
 * matrix_calc.c
 *
 *  Created on: Sep 10, 2018
 *      Author: simon
 */

#include "matrix_calc.h"


void matrix_multiply(size_t n, size_t m, size_t p, float mat1[n][m], float mat2[m][p], float res[n][p])
{
    int i, j, k;
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < p; j++)
        {
            res[i][j] = 0;
            for (k = 0; k < m; k++)
                res[i][j] += mat1[i][k]*mat2[k][j];
        }
    }
}

void matrix_transpose (size_t n, size_t m, float mat1[n][m], float res[m][n])
{
    int i,j;

    for(i=0;i<n;i++){
        for(j=0;j<m;j++){
            res[j][i] = mat1[i][j];
        }
     }
}

void matrix_addition (size_t n, size_t m, float mat1[n][m], float mat2[n][m], float res[n][m])
{
    int i,j;

    for(i=0;i<m;i++) {
        for(j=0;j<n;j++) {
            res[i][j] = mat1[i][j]+mat2[i][j];
       }
    }
}

void matrix_subtraction (size_t n, size_t m, float mat1[n][m], float mat2[n][m], float res[n][m])
{
    int i,j;

    for(i=0;i<m;i++) {
        for(j=0;j<n;j++) {
            res[i][j] = mat1[i][j]-mat2[i][j];
       }
    }
}

float matrix_determinant(size_t k, float mat[k][k])
{
    float determinant = 0;
    int i = 0;

    for(i = 0; i < k; i++) {
        determinant = determinant + (mat[0][i] * (mat[1][(i+1)%3] * mat[2][(i+2)%3] - mat[1][(i+2)%3] * mat[2][(i+1)%3]));
    }
    return determinant;
}

void matrix_inverse(size_t k, float mat[k][k], float res[k][k]) {

    int i,j;
    float determinant;
    determinant = matrix_determinant(k,mat);

    for(i = 0; i < k; i++){
        for(j = 0; j < k; j++) {
            res[i][j]=((mat[(j+1)%3][(i+1)%3] * mat[(j+2)%3][(i+2)%3]) - (mat[(j+1)%3][(i+2)%3] * mat[(j+2)%3][(i+1)%3]))/ determinant;
        }
    }
}

void matrix_identity(size_t k, float res[k][k]) {

    int i,j;

    for(i = 0; i < 3; i++){
        for(j = 0; j < 3; j++) {
            if (j == i) {
                res[i][j] = 1;
            }
            else if (j != i) {
                res[i][j] = 0;
            }
        }
    }
}

My Compiler flags:

-v28 -ml -mt --float_support=fpu32 --vcu_support=vcu0 --opt_for_speed=5 --fp_reassoc=off --fp_mode=strict --include_path="/home/simon/workspace_v8/Scarab_PCB_test" --include_path="/home/simon/controlSUITE/libs/dsp/FPU/v131/include" --include_path="/home/simon/controlSUITE/libs/app_libs/motor_control/math_blocks/v4.3" --include_path="/home/simon/controlSUITE/device_support/f2806x/v151/F2806x_common/include" --include_path="/home/simon/controlSUITE/device_support/f2806x/v151/F2806x_headers/include" --include_path="/home/simon/controlSUITE/libs/math/IQmath/v160/include" --include_path="/home/simon/controlSUITE/libs/math/FPUfastRTS/V100/include" --include_path="/home/simon/controlSUITE/device_support/f2806x/v151/MWare" --include_path="/opt/TI/ti-cgt-c2000_6.4.2/include" --advice:performance=all -g --diag_wrap=off --diag_warning=225 --display_error_number

And my Linker flags:

-v28 -ml -mt --float_support=fpu32 --vcu_support=vcu0 --opt_for_speed=5 --fp_reassoc=off --fp_mode=strict --advice:performance=all -g --diag_wrap=off --diag_warning=225 --display_error_number -z -m"Scarab_PCB_test.map" --stack_size=0x200 --warn_sections -i"/opt/TI/ti-cgt-c2000_6.4.2/lib" -i"/home/simon/controlSUITE/device_support/f2806x/v151/F2806x_headers/cmd" -i"/home/simon/controlSUITE/device_support/f2806x/v151/MWare/lib" -i"/home/simon/controlSUITE/device_support/f2806x/v151/MWare/driverlib/Debug" -i"/home/simon/controlSUITE/device_support/f2806x/v151/F2806x_common/lib" -i"/home/simon/controlSUITE/libs/math/IQmath/v160/lib" -i"/home/simon/controlSUITE/libs/math/FPUfastRTS/V100/lib" -i"/opt/TI/ti-cgt-c2000_6.4.2/include" --priority --reread_libs --diag_wrap=off --display_error_number --xml_link_info="Scarab_PCB_test_linkInfo.xml" --rom_model

Of course, the inverse matrix calculation is the worst one, but all the matrix functions basically use too many instrunctions.

Do anybody have suggestions for optimizing the code or the project configuration that could lead to a significant improve of the overall matrix calculation speed?

Br,

Simon

over 7 years ago

0 Todd Anderson78572 over 7 years ago

Guru 14640 points

Here is a reference that may help:

e2e.ti.com/.../519151

0 Simon Lund29 over 7 years ago in reply to Todd Anderson78572

Prodigy 95 points

Thank you for your reply.

I have looked through the FPU library in the "C28x Floating Point Unis DSP Library User's guide" which the answer in that question suggests.

However, it seems to me, that only the mpy_SP_RVxRV_2() function is applicable in my case and it must be vectors with length being both an even number and at least 4 which in my situation both are not the case.

All other mathematical matrix/vector operations that i require, like subtraction, addition, inversion (in that library) are all made for complex vectors which i also cannot use.

Am i wrong, or could you suggest another solution?

0 Richard Poley over 7 years ago in reply to Simon Lund29

TI__Mastermind 27200 points

Hi Simon,

The code may be functionally correct but there are a few reasons why it won't execute efficiently. One is that the matrix functions don't know the matrix dimensions before they are called, so the compiler allocates space on the stack when the call is made and all indexing is done using stack relative addressing mode. Also, indexing the elements using [][] means the compiler has to calculate the indices each time. It is much more efficient if you can index in using indirect addressing, especially for those matrix operations like addition subtraction, transpose,...etc where you are accessing elements in a sequential manner.

To do this, you could define a matrix as an array and use a structure to hold pointers to the matrix start and end addresses, together with the matrix dimensions, and an indexing pointer. That way you are only passing a structure into your function and none of that stack relative addressing needs to happen.

Another (less flexible) way is to operate with fixed matrix sizes only. In a Kalman filter you typically know your system dimensions in advance and they don't change, so you could create fixed size arrays and just pass the base addresses to the function.

Both the above will entail some work, but something you could try quickly: I don't know if you have the optimizer turned on (I only see opt_for_speed = 5), but if you crank it up to 2 or higher (-o compiler switch)it should make quite a big difference. The above concerns are still valid though.

Version 4.0 of the Digital Control Library will include matrix support and contains all these functions and many more. It won't release until next year but I have a prototype version running. Let me know if you are interested in trying it.

Regards,

Richard

0 Simon Lund29 over 7 years ago in reply to Richard Poley

Prodigy 95 points

Thank you for your reply.

It makes a lot of sense what you have written. I have tried multiplying one 2-by-2 with one 2-by-1 matrix using fixed dimensions in the function declaration and that already improved the calculation speed with around 20-25 percent, though i still use [][] for indexing.

I'll probably rewrite my code using either the fixed sizes or the indexing pointer as you mention. Only problem about fixed dimension is, that i guess i then need many more matrix functions - one for each matrix size combination. And in addition, i loss the flexibility as you wrote.

Regarding the opt_level: it is currently off. I have tried setting it to each level (from 0 to 4) and higher level than 2 really improved computation time. However, for some reason changing the opt_level from off to 3 make me draw much more current from the power supply to my motor controller PCB (0.6A instead of 0.13A) and my motor, that is using the ePWM (all phases are set to 50% duty cycle), started making high-frequency noise around 15kHz. Do you know this problem and how to fix it?

I'm certainly interested in your prototype of version 4.0. Can you please send it to me?

0 Richard Poley over 7 years ago in reply to Simon Lund29

TI__Mastermind 27200 points

Simon,
I sent you a connection request over e2e. If you accept that we can take this off-line.
Regards,
Richard

0 Simon Lund29 over 7 years ago in reply to Richard Poley

Prodigy 95 points

Sorry, i'm quite new here at e2e.
I cannot find any connection request.
Could you sent one again or maybe guide me in how to accept the request?

0 Richard Poley over 7 years ago in reply to Simon Lund29

TI__Mastermind 27200 points

Simon,

Don't worry - I've been using e2e for years and I can't find it either. I sent you a test message. In the very top right corner of your e2e web page you should see an icon for 'messages'. It should be there.

Regards,

Richard

0 Simon Lund29 over 7 years ago in reply to Richard Poley

Prodigy 95 points

I received the message - but received it five minutes ago.

Br,
Simon

C2000™︎ microcontrollers

C2000 microcontrollers forum

Compiler/TMS320F28069: Slow matrix calculations