Tool/software:
To test the runtime of the modified iqmath function, I executed it in both TCMA and OCRAM.
My environment is CCS12.8,SDK 10.0.0.37 code optimization O-fast
To prevent caching of identical computation results, to prevent caching of identical computation results, I generated 100 random numbers for calculation. Modify USE_TCM to 0 or 1 to choose whether to allocate memory to TCM. Here is my test code:
#include <stdlib.h>
#include <math.h>
#include <kernel/dpl/DebugP.h>
#include "ti_drivers_config.h"
#include "ti_board_config.h"
#include "IQmathLib.h"
#include "mathlib/trig/ti_arm_trig.h"
uint32_t u32TimerStart,u32TimerLoad = 0;
float f32DataOut;
_iq iqDataOut;
float f32SinOut_assembly[100] = {0.0f};
_iq iqSinOut_assembly[100] = {0};
_iq iqSinOut[100] = {0};
float f32SqrtOut_assembly[100] = {0.0f};
_iq iqSqrtOut[100] = {0};
_iq iqSqrtOut_assembly[100] = {0};
_iq iqMulOut[100] = {0};
_iq iqMulOut_assembly[100] = {0};
_iq iqDivOut[100] = {0};
_iq iqDivOut_assebmly[100] = {0};
#define USE_TCM (0)
const float iq2fScale = 0.000000059604645f;
const float f2iqScale = 16777216.000000f;
const float pi2 = 6.28318530717959f;
float f32SinIn[100] = {3.712407,2.129162,5.611364,2.936816,4.674741,2.058908,4.290381,3.632543,3.796699,3.349105,
5.616087,5.240428,4.939541,3.788255,4.763273,2.698073,4.023251,1.531706,1.085435,0.777734,
3.323697,3.257371,3.88798,0.794623,5.438464,4.107997,2.541916,0.399682,5.709745,2.603319,
1.950818,2.565326,4.598627,5.130248,4.920356,4.647836,4.717129,2.899751,5.836289,2.917093,
4.305292,0.280102,4.348784,1.12106,5.377209,4.169042,4.815597,4.29549,1.702546,2.472285,
6.189092,4.855415,0.300805,1.388049,1.028099,1.881404,2.062716,5.260398,3.941642,4.493471,
6.282049,6.160821,5.930238,1.333015,1.343667,6.184921,4.436584,5.315681,5.676171,5.921862,
3.521913,3.320549,1.552201,2.712443,0.376286,3.635715,1.884349,0.299,2.111523,3.769371,
5.138324,0.649593,0.209134,5.092558,4.008929,4.508341,2.642054,3.749132,5.33098,1.087774,
0.691124,6.164273,2.357287,5.472303,2.197861,5.667922,3.463029,2.620582,2.690765,4.312372};
float f32SqrtIn[100] = {0.822207,0.966454,0.090616,0.788111,0.758306, 0.738356,0.535171,0.015758,0.042506,0.825861,
0.971483,0.975205,0.510375,0.36407,0.889855,0.646937,0.759267,0.143709,0.237777,0.557448,
0.593504,0.926576,0.204938,0.673273,0.98163,0.171916,0.745332,0.17287,0.131861,0.928158,
0.968507,0.998606,0.004652,0.270313,0.162947,0.251017,0.281967,0.416146,0.870192,0.406895,
0.554871,0.084685,0.169093,0.274182,0.897739,0.353732,0.468215,0.730876,0.421698,0.258417,
0.742467,0.138787,0.878365,0.837744,0.045111,0.926592,0.147535,0.312763,0.556087,0.552088,
0.698499,0.479554,0.66014,0.352169,0.720769,0.136011,0.925598,0.243663,0.08588,0.900161,
0.820419,0.191058,0.715925,0.431502,0.662533,0.294721,0.263202,0.331849,0.017807,0.782637,
0.449799,0.44025,0.351247,0.129914,0.490479,0.057155,0.673106,0.735046,0.91323,0.811223,
0.51295,0.657512,0.328113,0.386043,0.23684,0.636593,0.336395,0.463664,0.865202,0.683065};
_iq iqSinIn[100] = {7334100,14626481,5558979,16234385,8725661,14385339,766440,12523141,7860266,15542651,
3058682,3530925,835795,5314808,14040525,7786403,10551614,15150083,10332709,14607518,
5518319,3604083,6894421,13978931,15174124,10682735,1219195,10832689,5532140,14037653,
4911997,3658046,868993,14653156,10694315,12878234,13577655,15698950,2119409,2332973,
7977791,14565738,14066455,14586520,6719596,14170681,15345733,7350,7163648,16170356,
7602036,12090923,58194,15940078,16617123,14004457,9826022,5269094,13714629,11069671,
15910489,9096013,4339855,3857399,12516740,16637451,15557704,7368087,11601717,6268786,
2355206,7485576,10352497,8882846,1878697,10673338,11435463,15731345,10656716,1784939,
10150870,12403372,14052832,8038175,14558343,7754627,6368348,8329271,5593392,37961,
643991,3092709,9125990,8477608,4168482,1193493,849605,10194797,10043572,807661};
_iq iqSqrtIn[100] = {7334100,14626481,3558979,16234385,8725661,14385339,766440,12523141,7860266,15542651,
3058682,3530925,835795,5314808,14040525,7786403,10551614,15150083,10332709,14607518,
5518319,3604083,6894421,13978931,15174124,10682735,1219195,10832689,5532140,14037653,
4911997,3658046,868993,14653156,10694315,12878234,13577655,15698950,2119409,2332973,
7977791,14565738,14066455,14586520,6719596,14170681,15345733,7350,7163648,16170356,
7602036,12090923,58194,15940078,16617123,14004457,9826022,5269094,13714629,11069671,
15910489,9096013,4339855,3857399,12516740,16637451,15557704,7368087,11601717,6268786,
2355206,7485576,10352497,8882846,1878697,10673338,11435463,15731345,10656716,1784939,
10150870,12403372,14052832,8038175,14558343,7754627,6368348,8329271,5593392,37961,
643991,3092709,9125990,8477608,4168482,1193493,849605,10194797,10043572,807661};
_iq iqDivIn[100] = {16316244,15787933,15403241,11660262,4292734,9032192,10156213,7448221,10866686,14014676,
2452939,14082717,12075177,9806672,7991054,15986104,13088964,8940713,10375408,1346096,
8174168,8122380,974020,3198849,4218291,13775288,11396425,1263821,1258931,7047291,
13876626,5835055,6033813,9030130,4699562,11870098,14387270,8957787,4657818,1399007,
357767,438226,6696376,3528846,11882848,6023232,5521753,16317692,7982752,8484968,
665438,2786828,8893826,409544,110629,11880945,16759713,8739897,16023258,5975345,
5728985,10647452,11059773,10276916,15088830,10948835,4058958,3323378,6170826,16629831,
10360173,10079387,1982317,7093657,7846383,2783815,2542736,12446418,14328632,12115202,
1384528,11408496,907709,2625221,640234,1981499,15247266,13513537,1234216,13163128,
3786184,4005491,8754898,10964149,1142647,8962164,4370648,10053798,15926282,10122239
};
#if USE_TCM
__attribute__((__section__(".armiqmath"), noinline))_iq24 _IQ24mpy_test(_iq A, _iq B)
#else
static __attribute__((__section__(".armiqmath")))_iq24 _IQ24mpy_test(_iq A, _iq B)
#endif
{
_iq24 result;
__asm__ volatile(
"smull r2, r3, %1, %2 \n" // r2 = lower 32bit r3 = higher 32bit
"lsr r2, r2, #24 \n" // r2 >> 24
"orr r2, r2, r3, lsl #8 \n" // combine r3's 8bit high with r2's 8bit low
"mov %0, r2 \n" // save result
: "=r" (result)
: "r" (A), "r" (B)
: "r2", "r3", "cc"
);
return result;
}
#if USE_TCM
__attribute__((__section__(".armiqmath"), noinline))_iq24 _IQdiv_test(_iq iqDividend, _iq iqDivisor)
#else
static __attribute__((__section__(".armiqmath")))_iq24 _IQdiv_test(_iq iqDividend, _iq iqDivisor)
#endif
{
_iq24 result;
__asm__ volatile (
"cmp %2, #0 \n" // if iqDivisor == 0
"beq 1f \n" // jump to 1
"vmov s0, %1 \n" // s0 = iqDividend
"vmov s1, %2 \n" // s1 = iqDivisor
"vcvt.f32.s32 s0, s0 \n" // iqDividend to float-point
"vcvt.f32.s32 s1, s1 \n" // iqDivisor to float-point
"vdiv.f32 s2, s0, s1 \n" // s2 = s0/s1
"vmov.f32 s3, %3 \n" // s3 = f2iqScale
"vmul.f32 s2, s2, s3 \n" // f2iqScale fix-point
"vcvt.s32.f32 s2, s2 \n" // result to fix-point
"vmov %0, s2 \n" // save result
"b 2f \n" // skip 1
"1: \n"
"mov %0, #0 \n" // if divisor = 0, return 0
"2: \n"
: "=r" (result) // output result
: "r" (iqDividend), "r" (iqDivisor), "r" (f2iqScale)
: "s0", "s1", "s2", "s3", "cc", "memory"
);
return result;
}
#if USE_TCM
__attribute__((__section__(".armiqmath"), noinline))_iq24 _IQ24sin_test1(const _iq inputIq24Num)
#else
static __attribute__((__section__(".armiqmath")))_iq24 _IQ24sin_test1(const _iq inputIq24Num)
#endif
{
_iq iqSinResult;
float floatSinResult;
float inputFloatNum;
__asm__ volatile (
"vmov s0, %1 \n" // s0 = inputIq24Num
"vcvt.f32.s32 s0, s0 \n" // fix-point to float-point
"vmov.f32 s1, %2 \n" // s1 = iq2fScale
"vmul.f32 s0, s0, s1 \n" // s0 = s0*s1
"vmov.f32 s2, %3 \n" // s2 = 2pi
"vmul.f32 s0, s0, s2 \n" // s0 = s0*2pi, so input is range in (0,2pi)
"vmov %0, s0 \n" // save s0 to inputFloatNum
: "=r" (inputFloatNum)
: "r" (inputIq24Num), "r" (iq2fScale), "r" (pi2)
: "s0", "s1", "s2", "cc", "memory"
);
floatSinResult = ti_arm_sin(inputFloatNum); // call ti_arm_sin
__asm__ volatile (
"vmov s0, %1 \n" // s0 = floatSinResult
"vldr s1, [%2] \n" // s1 = f2iqScale
"vmul.f32 s0, s0, s1 \n" // s0 = s0*s1
"vcvt.s32.f32 s0, s0 \n" // float-point to fix-point
"vmov %0, s0 \n" // save result
: "=r" (iqSinResult)
: "r" (floatSinResult), "r" (&f2iqScale)
: "s0", "s1", "cc", "memory"
);
return iqSinResult;
}
#if USE_TCM
__attribute__((__section__(".armiqmath"), noinline))_iq24 _IQ24sqrt_test1(const _iq inputIq24Num)
#else
static __attribute__((__section__(".armiqmath")))_iq24 _IQ24sqrt_test1(const _iq inputIq24Num)
#endif
{
_iq24 iqSinResult;
float floatSinResult;
float inputFloatNum;
__asm__ volatile (
"vmov s0, %1 \n"
"vcvt.f32.s32 s0, s0 \n"
"vmov.f32 s1, %2 \n"
"vmul.f32 s0, s0, s1 \n"
"vmov %0, s0 \n"
: "=r" (inputFloatNum)
: "r" (inputIq24Num), "r" (iq2fScale)
: "s0", "s1", "cc", "memory"
);
floatSinResult = ti_arm_sqrt(inputFloatNum);
__asm__ volatile (
"vmov s0, %1 \n"
"vldr s1, [%2] \n"
"vmul.f32 s0, s0, s1 \n"
"vcvt.s32.f32 s0, s0 \n"
"vmov %0, s0 \n"
: "=r" (iqSinResult)
: "r" (floatSinResult), "r" (&f2iqScale)
: "s0", "s1", "cc", "memory"
);
return iqSinResult;
}
int mathLib_test_main(void)
{
System_init();
Board_init();
#if USE_TCM
DebugP_log("USE TCM\r\n");
#else
DebugP_log("NOT USE TCM\r\n");
#endif
DebugP_log("sine test\r\n");
//sin
u32TimerStart = ReadCpuTimer0Counter();
for(int i = 0;i<100;i++)
{
iqSinOut_assembly[i] = _IQ24sin_test1(iqSinIn[i]);
}
u32TimerLoad = ReadCpuTimer0Counter() - u32TimerStart;
DebugP_log("assembly IQsin:%d\r\n",u32TimerLoad);
DebugP_log("sqrt test\r\n");
//sqrt
u32TimerStart = ReadCpuTimer0Counter();
for(int i = 0;i<100;i++)
{
iqSqrtOut_assembly[i] = _IQ24sqrt_test1(iqSqrtIn[i]);
}
u32TimerLoad = ReadCpuTimer0Counter() - u32TimerStart;
DebugP_log("assembly sqrt:%d\r\n",u32TimerLoad);
DebugP_log("mult test\r\n");
//mul
u32TimerStart = ReadCpuTimer0Counter();
for(int i = 0;i<100;i++)
{
iqMulOut_assembly[i] = _IQ24mpy_test(iqSinIn[i], iqSqrtIn[i]);
}
u32TimerLoad = ReadCpuTimer0Counter() - u32TimerStart;
DebugP_log("assembly mpy:%d\r\n",u32TimerLoad);
DebugP_log("div test\r\n");
//div
u32TimerStart = ReadCpuTimer0Counter();
for(int i = 0;i<100;i++)
{
iqDivOut_assebmly[i] = _IQdiv_test(iqDivIn[i], iqSqrtIn[i]);
}
u32TimerLoad = ReadCpuTimer0Counter() - u32TimerStart;
DebugP_log("assebmly iq div:%d\r\n",u32TimerLoad);
while(1)
{};
}
besides, need to configure RTI to represent time, I configured RTI to 125count = 1us
uint32_t ReadCpuTimer0Counter(void)
{
uint32_t u32Timer = 0xFFFFFFFF - HW_RD_REG32(RTI_CPU_BASE_ADDR + CSL_RTI_RTIFRC0);
return u32Timer;
}
my cmd file:
/* This is the stack that is used by code running within main()
* In case of NORTOS,
* - This means all the code outside of ISR uses this stack
* In case of FreeRTOS
* - This means all the code until vTaskStartScheduler() is called in main()
* uses this stack.
* - After vTaskStartScheduler() each task created in FreeRTOS has its own stack
*/
--stack_size=16384
/* This is the heap size for malloc() API in NORTOS and FreeRTOS
* This is also the heap used by pvPortMalloc in FreeRTOS
*/
--heap_size=32768
-e_vectors /* This is the entry of the application, _vector MUST be placed starting address 0x0 */
/* This is the size of stack when R5 is in IRQ mode
* In NORTOS,
* - Here interrupt nesting is enabled
* - This is the stack used by ISRs registered as type IRQ
* In FreeRTOS,
* - Here interrupt nesting is enabled
* - This is stack that is used initally when a IRQ is received
* - But then the mode is switched to SVC mode and SVC stack is used for all user ISR callbacks
* - Hence in FreeRTOS, IRQ stack size is less and SVC stack size is more
*/
__IRQ_STACK_SIZE = 256;
/* This is the size of stack when R5 is in IRQ mode
* - In both NORTOS and FreeRTOS nesting is disabled for FIQ
*/
__FIQ_STACK_SIZE = 256;
__SVC_STACK_SIZE = 4096; /* This is the size of stack when R5 is in SVC mode */
__ABORT_STACK_SIZE = 256; /* This is the size of stack when R5 is in ABORT mode */
__UNDEFINED_STACK_SIZE = 256; /* This is the size of stack when R5 is in UNDEF mode */
SECTIONS
{
.vectors : { } > R5F_VECS , palign(8)
GROUP : {
.text.hwi : { } palign(8)
.text.cache : { } palign(8)
.text.mpu : { } palign(8)
.text.boot : { } palign(8)
.text:abort : { } palign(8)
} > OCRAM
GROUP : {
.text : { } palign(8)
.rodata : { } palign(8)
} > OCRAM
GROUP : {
.data : { } palign(8)
} > OCRAM
GROUP : {
.bss : { } palign(8)
RUN_START(__BSS_START)
RUN_END(__BSS_END)
.sysmem : { } palign(8)
.stack : { } palign(8)
.usbCxtRam : { } align(8)
} > OCRAM
GROUP : {
.irqstack : { . = . + __IRQ_STACK_SIZE; } align(8)
RUN_START(__IRQ_STACK_START)
RUN_END(__IRQ_STACK_END)
.fiqstack : { . = . + __FIQ_STACK_SIZE; } align(8)
RUN_START(__FIQ_STACK_START)
RUN_END(__FIQ_STACK_END)
.svcstack : { . = . + __SVC_STACK_SIZE; } align(8)
RUN_START(__SVC_STACK_START)
RUN_END(__SVC_STACK_END)
.abortstack : { . = . + __ABORT_STACK_SIZE; } align(8)
RUN_START(__ABORT_STACK_START)
RUN_END(__ABORT_STACK_END)
.undefinedstack : { . = . + __UNDEFINED_STACK_SIZE; } align(8)
RUN_START(__UNDEFINED_STACK_START)
RUN_END(__UNDEFINED_STACK_END)
} > OCRAM
GROUP : {
.ARM.exidx : { } palign(8)
.init_array : { } palign(8)
.fini_array : { } palign(8)
} > OCRAM
.bss.user_shared_mem (NOLOAD) : { } > USER_SHM_MEM
.bss.log_shared_mem (NOLOAD) : { } > LOG_SHM_MEM
.bss.ipc_vring_mem (NOLOAD) : { } > RTOS_NORTOS_IPC_SHM_MEM
.bss.sipc_hsm_queue_mem (NOLOAD) : { } > MAILBOX_HSM
.bss.sipc_secure_host_queue_mem (NOLOAD) : { } > MAILBOX_R5F
GROUP : {
.bss.nocache : { } align(8)
} > NON_CACHE_MEM
armiqmath : { } > R5F_TCMA , palign(8)
}
MEMORY
{
R5F_VECS : ORIGIN = 0x0 , LENGTH = 0x40
R5F_TCMA : ORIGIN = 0x40 , LENGTH = 0x7FC0
R5F_TCMB : ORIGIN = 0x80000 , LENGTH = 0x8000
SBL : ORIGIN = 0x70000000 , LENGTH = 0x40000
OCRAM : ORIGIN = 0x70040000 , LENGTH = 0xC0000
NON_CACHE_MEM : ORIGIN = 0x70100000 , LENGTH = 0x8000
USER_SHM_MEM : ORIGIN = 0x70150000 , LENGTH = 0x4000
LOG_SHM_MEM : ORIGIN = 0x70154000 , LENGTH = 0x4000
FLASH : ORIGIN = 0x60100000 , LENGTH = 0x80000
RTOS_NORTOS_IPC_SHM_MEM : ORIGIN = 0x72000000 , LENGTH = 0x3E80
MAILBOX_HSM : ORIGIN = 0x44000000 , LENGTH = 0x3CE
MAILBOX_R5F : ORIGIN = 0x44000400 , LENGTH = 0x3CE
/* For memory Regions not defined in this core but shared by other cores with the current core */
}
The results of running the code with and without TCM are as follows:
[Cortex_R5_0] NOT USE TCM
sine test
assembly IQsin:-2026
sqrt test
assembly sqrt:-1075
mult test
assembly mpy:-331
div test
assebmly iq div:-995
[Cortex_R5_0] USE TCM
sine test
assembly IQsin:-2195
sqrt test
assembly sqrt:-1420
mult test
assembly mpy:-630
div test
assebmly iq div:-1270
The results showed that the function ran faster in OCRAM; why is this?
I have uploaded the iqmathLib. h file that needs to be included in the code in the attachment