Tool/software:
Hi TI,
My goal is to check whether we can reach the maximum MAC performance of 80 GFLOPS mentioned in the datasheet (TDA4VM processors). To do so, I wrote a program using DSPLIB functions to run a FIR algorithm on the C7x. I mainly based my code on DSPLIB_fir_example.cpp:
/**
* main.cpp
*/
#include "dsplib.h"
#include <stdint.h>
#include <stdlib.h>
#include <iostream>
#include "signalFiltreInfloat.h"
#include "IndicesFiltreNyquistV2.h" //contient les coefficients du filtre de nyquist
#include <c7x.h>
#define CLOCK_PER_SEC 1000000000 //1GHz
using namespace std;
int main(void){
/* --- Vecteurs d entree - sortie du filtre FIR --- */
long nbSymboles = 131704;
/* Utilisation petits vecteurs */
/* x(t) : vecteur de complexe en entree, appele in(t) ici */
float* x = signalFiltreInfloat;
/* h(t) : vecteur coefficients filtre FIR */
float* h = IndicesFiltreNyquist;
/* y(t)*/
/* Conformement au benchmark de TI, on met y(t) de meme taille que x(t) */
// La fonction fir de DSPLIB impose d'initialise le tableau stockant les echantillons de sortie
float* y = (float*)malloc(nbSymboles * sizeof(float));
/* Tailles des @ du filtre FIR */
uint32_t dataSize = nbSymboles; //x(t) contient 8 elements
// in bytes
uint32_t dataPitchInSize = nbSymboles * 8; //taille x(t) en bytes, pour rappel pour notre architecture, 1 float = 4 octet et 1 Complexe = 8 octets
// in bytes
uint32_t dataPitchOutSize = nbSymboles * 8; //taille y(t) en bytes : 131704 * 8
uint32_t batchSize = 1;
uint32_t filterSize = 128; //correspond à l'ordre du filtre
// float data type
uint32_t shift = 1;
/* --- Donnees sur le filtre h(t) --- */
//ajout pour compenser erreur DSPLIB_bufParams1D_t
uint32_t dataPitchFilterSize = 129 * 8; //129 coefficients, de type Complexe, 1 Complexe = 8 floats
//ajout pour compenser erreur DSPLIB_bufParams1D_t. Initialisation d'un @ prop par TI
uint32_t filterPitch = 0; //cf fichier DSPLIB_fir_idat.c
/* --- Caracteristiques du kernel --- */
DSPLIB_STATUS status;
DSPLIB_fir_InitArgs kerInitArgs;
int32_t handleSize = DSPLIB_fir_getHandleSize(&kerInitArgs);
DSPLIB_kernelHandle handle = malloc(handleSize);
/* Types des buffers d entree et sortie */
DSPLIB_bufParams2D_t bufParamsIn, bufParamsOut;
// DSPLIB_bufParams1D_t bufParamsFilter; //modification car cree erreur
DSPLIB_bufParams2D_t bufParamsFilter;
/* Remplissages des buffers avec les valeurs des vecteurs */
/* Buffer contenant x(t) */
bufParamsIn.data_type = DSPLIB_FLOAT32;
bufParamsIn.dim_x = dataSize;
bufParamsIn.stride_y = dataPitchInSize;
bufParamsIn.dim_y = batchSize;
/* Buffer contenant y(t) */
bufParamsOut.data_type = DSPLIB_FLOAT32;
bufParamsOut.dim_x = dataSize;
bufParamsOut.stride_y = dataPitchOutSize;
bufParamsOut.dim_y = batchSize;
/* Buffer contenant h(t) */
bufParamsFilter.data_type = DSPLIB_FLOAT32;
bufParamsFilter.dim_x = filterSize;
bufParamsFilter.stride_y = dataPitchFilterSize;
bufParamsFilter.dim_y = batchSize;
/* Remplissage des champs de la structure des @ du kernel */
kerInitArgs.dataSize = dataSize;
kerInitArgs.batchSize = batchSize;
kerInitArgs.filterSize = filterSize;
kerInitArgs.shift = shift;
/*
* Optimized C implementation of the function for the MMA + C7x architecture
*/
kerInitArgs.funcStyle = DSPLIB_FUNCTION_OPTIMIZED; //indicateur d'optimisation
/* Statut initial de l algo */
status = DSPLIB_SUCCESS;
/* --- Initialisation de l'algo FIR */
if (status == DSPLIB_SUCCESS){
status = DSPLIB_fir_init_checkParams(handle, &bufParamsIn, &bufParamsOut, &bufParamsFilter, &kerInitArgs);
}
if (status == DSPLIB_SUCCESS){
status = DSPLIB_fir_init(handle, &bufParamsIn, &bufParamsFilter, &bufParamsOut, &kerInitArgs);
}
/* --- Check avant Execution de l'algo FIR --- */
if (status == DSPLIB_SUCCESS){
status = DSPLIB_fir_exec_checkParams(handle, x, h, y);
}
/* --- Execution de l'algo FIR --- */
int nb_ap_fir = 1;
/* Timer de start */
unsigned long start_timer = __TSC;
for(int s=0;s<nb_ap_fir;s++){
if (status == DSPLIB_SUCCESS){
status = DSPLIB_fir_exec(handle, x, h, y);
}
else{
cout << "Impossible d'appliquer l'algo FIR " << endl;
}
}
/* Timer de stop */
unsigned long stop_timer = __TSC;
/* Nombre de cycles CPU ecoules */
unsigned long nb_cycles = stop_timer - start_timer;
/* Temps ecoule en µs */
double tec = (double)(nb_cycles) / CLOCK_PER_SEC ;
cout << "Le temps écoulé lors de l'exécution algo FIR vaut : " << tec << " s" << endl;
long MAC = nb_ap_fir * (33979632/2); //hypothese faite
/* Recuperation temps d execution en MAC\µs */
double MAC_par_sec = (double) MAC / tec;
/* Affichage des perfomances */
cout << "Le temps d'exécution est de " << MAC_par_sec << " MAC/s " << endl;
return 0;
}
Then, I used the linker script lnk.cmd, provided by TI, located in the C7100 folder, at the path: ti-processor-sdk-rtos-j721e-evm-10_01_00_04\dsplib\cmake\linkers. However, whenever I try to run my project (in Release mode), I always get a status different from DSPLIB_SUCCESS, and therefore I never enter the DSPLIB_fir_exec() function...
Could someone help me solve this issue?
Regards,
Mélanie