Hello all,
Apologies for the length of this post, it's actually not very complicated, just trying to give a precise description of the issue.
I've been struggling to optimize a bit of code which I know in my gut can be faster, but I'm having a hard time getting the compiler to cooperate. Target is the F28377D, CCS version 10.4, compiler version 20.2.5.
Basically I have a high speed control loop which uses both CPU1 and CLA1. On each iteration of the control loop, I want to store several variables of interest in log arrays (length 2000) so I can plot and analyze them later.
The variables to be logged come from CLA1. They all fit within a single data page, which is in the ClaToCpu message RAM. The log arrays are in GSRAM, due to their size. On each iteration, CPU1 will take each of the variables and append them to the log arrays. My original implementation used a pointer for each log array whose address would increment each iteration. The pointers are also all in one data page, but in GSRAM.
Here's the basic declarations for the variables, the log arrays, and the pointers:
/*
 * Declarations used by the waveform logger: the signals to record, one log
 * array per signal, and one write pointer per log array.
 */

// the variables to be logged. Written by the CLA
extern volatile float IB_ClaToCpu;
extern volatile float VB_ClaToCpu;
extern volatile float IBUCK_ClaToCpu;
extern volatile float VBUCK_ClaToCpu;

// this one is actually sourced from CPU
float var_Ref_CpuToCla;

// my log arrays
#define log_longs 2000 //length of logs, assuming 32b data type
float VB_log[(log_longs)];
float IB_log[(log_longs)];
float IBUCK_log[(log_longs)];
float VBUCK_log[(log_longs)];
float Iref_Analog_log[(log_longs)];

// the pointers to the logs
// (each one is the write position inside the log array of the same name)
float * IBUCK_log_ptr;
float * VBUCK_log_ptr;
float * IB_log_ptr;
float * VB_log_ptr;
float * Iref_Analog_log_ptr;
Here's the snippets of the map file, just to prove the variables and pointers are packed into just two data pages (you can see there are even more variables and logs than the ones mentioned above):
GLOBAL DATA SYMBOLS: SORTED BY DATA PAGE

address   data page         name
--------  ----------------  ----
00001480   52 (00001480)    _GPT_ClaToCpu
0000148e   52 (00001480)    _WaveStats_ClaToCpu
0000149e   52 (00001480)    _IB_ClaToCpu
0000149f   52 (00001480)    _IBUCK_ClaToCpu
000014a0   52 (00001480)    _VB_ClaToCpu
000014a2   52 (00001480)    _VBUCK_ClaToCpu
000014a4   52 (00001480)    _var_Ref_ClaScratch
000014a6   52 (00001480)    _var_Errn_ClaScratch
000014a8   52 (00001480)    _Comp_Out_ClaToCpu
000014aa   52 (00001480)    _FF_Out_ClaToCpu
000014ac   52 (00001480)    _DutyFine_ClaToCpu
............
00008a06  228 (00008a00)    _active_pulse_idx
00008a28  228 (00008a00)    _IBUCK_log_ptr
00008a2a  228 (00008a00)    _VBUCK_log_ptr
00008a2c  228 (00008a00)    _IB_log_ptr
00008a2e  228 (00008a00)    _VB_log_ptr
00008a30  228 (00008a00)    _Iref_Analog_log_ptr
00008a32  228 (00008a00)    _PFWD_log_ptr
00008a34  228 (00008a00)    _PREV_log_ptr
00008a36  228 (00008a00)    _GAMMA_DEG_log_ptr
00008a38  228 (00008a00)    _Iref_log_ptr
My little function which actually does the logging looks like this:
/*
 * Record one sample of every logged signal.
 *
 * For each signal, the current value is stored at the location its log
 * pointer addresses, and that pointer is then advanced by one element so the
 * next call writes the following slot.
 *
 * No bounds check is made here.
 * NOTE(review): presumably the caller stops logging or rewinds the pointers
 * before they run past the end of the log arrays -- confirm.
 */
static inline void log_waves(void)
{
    *IB_log_ptr = IB_ClaToCpu;
    IB_log_ptr += 1;

    *VB_log_ptr = VB_ClaToCpu;
    VB_log_ptr += 1;

    *IBUCK_log_ptr = IBUCK_ClaToCpu;
    IBUCK_log_ptr += 1;

    *VBUCK_log_ptr = VBUCK_ClaToCpu;
    VBUCK_log_ptr += 1;

    *Iref_Analog_log_ptr = var_Ref_CpuToCla;
    Iref_Analog_log_ptr += 1;
}
And it produces this disassembly:
; ---------------------------------------------------------------------------
; Compiler-generated C28x code for the five log stores (C source lines 97-102
; according to the .dwpsn directives), interleaved with the load of
; _GPT_CpuToCla+6 that belongs to C source line 113.
;
; Register use visible in this listing:
;   XAR4 = _IB_log_ptr      value in R1H (UI16TOF32 of _IB_ClaToCpu)
;   XAR5 = _VB_log_ptr      value in XT
;   XAR6 = _IBUCK_log_ptr   value in R0H (UI16TOF32 of _IBUCK_ClaToCpu)
;   XAR7 = _VBUCK_log_ptr   value in ACC
;   XAR0 = _Iref_Analog_log_ptr   value in P
; Shape of the sequence: load values and pointers, store through *XARn++,
; then write each post-incremented pointer back to its global.
;
; NOTE(review): in the map excerpt quoted in this post the four *_ClaToCpu
; sources sit on data page 52 and all *_log_ptr globals on data page 228.
; DP reloads that stay on the same page as the previous DP load are marked
; "same page" below (7 of the 16). The data pages of _var_Ref_CpuToCla and
; _GPT_CpuToCla are not shown in that excerpt, so those loads are not judged.
; NOTE(review): presumably the compiler reloads DP because it cannot know at
; compile time that separately defined globals end up on one page -- confirm
; against the compiler guide.
; NOTE(review): UI16TOF32 converts an unsigned 16-bit value to float, and
; _IB_ClaToCpu/_IBUCK_ClaToCpu are one word apart in the map; this suggests
; they are 16-bit in the real project, unlike the posted float declarations
; -- confirm.
; ---------------------------------------------------------------------------
$C$L28:
MOVW DP,#_IB_ClaToCpu ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 97,column 13,is_stmt,isa 0
UI16TOF32 R1H,@_IB_ClaToCpu ; [CPU_FPU] |97|
; NOTE(review): same page (52) as the previous DP load per the map excerpt.
MOVW DP,#_VB_ClaToCpu ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 98,column 13,is_stmt,isa 0
MOVL XT,@_VB_ClaToCpu ; [CPU_ALU] |98|
; Page change 52 -> 228 (sources -> pointers).
MOVW DP,#_IB_log_ptr ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 97,column 13,is_stmt,isa 0
MOVL XAR4,@_IB_log_ptr ; [CPU_ALU] |97|
; Page change 228 -> 52; caused by interleaving source reads with pointer loads.
MOVW DP,#_IBUCK_ClaToCpu ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 99,column 13,is_stmt,isa 0
UI16TOF32 R0H,@_IBUCK_ClaToCpu ; [CPU_FPU] |99|
; Page change 52 -> 228.
MOVW DP,#_VB_log_ptr ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 98,column 13,is_stmt,isa 0
MOVL XAR5,@_VB_log_ptr ; [CPU_ALU] |98|
; NOTE(review): same page (228) as the previous DP load per the map excerpt.
MOVW DP,#_IBUCK_log_ptr ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 99,column 13,is_stmt,isa 0
MOVL XAR6,@_IBUCK_log_ptr ; [CPU_ALU] |99|
; NOTE(review): same page (228) as the previous DP load per the map excerpt.
MOVW DP,#_VBUCK_log_ptr ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 100,column 13,is_stmt,isa 0
MOVL XAR7,@_VBUCK_log_ptr ; [CPU_ALU] |100|
; Page change 228 -> 52.
MOVW DP,#_VBUCK_ClaToCpu ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 97,column 13,is_stmt,isa 0
MOV32 *XAR4++,R1H ; [CPU_FPU] |97|
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 100,column 13,is_stmt,isa 0
MOVL ACC,@_VBUCK_ClaToCpu ; [CPU_ALU] |100|
; Page change 52 -> 228.
MOVW DP,#_Iref_Analog_log_ptr ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 102,column 13,is_stmt,isa 0
MOVL XAR0,@_Iref_Analog_log_ptr ; [CPU_ALU] |102|
; Data page of _var_Ref_CpuToCla is not shown in the map excerpt.
MOVW DP,#_var_Ref_CpuToCla ; [CPU_ARAU]
MOVL P,@_var_Ref_CpuToCla ; [CPU_ALU] |102|
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 98,column 13,is_stmt,isa 0
MOVL *XAR5++,XT ; [CPU_ALU] |98|
; Data page of _GPT_CpuToCla is not shown in the map excerpt.
MOVW DP,#_GPT_CpuToCla+6 ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 99,column 13,is_stmt,isa 0
MOV32 *XAR6++,R0H ; [CPU_FPU] |99|
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 100,column 13,is_stmt,isa 0
MOVL *XAR7++,ACC ; [CPU_ALU] |100|
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 113,column 13,is_stmt,isa 0
; Load for C line 113; presumably the value tested by the BF ...,NEQ below,
; which carries the same |113| tag -- confirm.
MOV AL,@$BLOCKED(_GPT_CpuToCla)+6 ; [CPU_ALU] |113|
; Back to page 228 to write the post-incremented pointers to their globals.
MOVW DP,#_IB_log_ptr ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 97,column 13,is_stmt,isa 0
MOVL @_IB_log_ptr,XAR4 ; [CPU_ALU] |97|
; NOTE(review): same page (228) as the previous DP load per the map excerpt.
MOVW DP,#_VB_log_ptr ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 102,column 13,is_stmt,isa 0
MOVL *XAR0++,P ; [CPU_ALU] |102|
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 98,column 13,is_stmt,isa 0
MOVL @_VB_log_ptr,XAR5 ; [CPU_ALU] |98|
; NOTE(review): same page (228) as the previous DP load per the map excerpt.
MOVW DP,#_IBUCK_log_ptr ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 99,column 13,is_stmt,isa 0
MOVL @_IBUCK_log_ptr,XAR6 ; [CPU_ALU] |99|
; NOTE(review): same page (228) as the previous DP load per the map excerpt.
MOVW DP,#_VBUCK_log_ptr ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 100,column 13,is_stmt,isa 0
MOVL @_VBUCK_log_ptr,XAR7 ; [CPU_ALU] |100|
; NOTE(review): same page (228) as the previous DP load per the map excerpt.
MOVW DP,#_Iref_Analog_log_ptr ; [CPU_ARAU]
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 102,column 13,is_stmt,isa 0
MOVL @_Iref_Analog_log_ptr,XAR0 ; [CPU_ALU] |102|
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 113,column 13,is_stmt,isa 0
BF $C$L35,NEQ ; [CPU_ALU] |113|
; branchcc occurs ; [] |113|
.dwpsn file "../Radden_source/main_control_loop_V201.c",line 348,column 9,is_stmt,isa 0
BF $C$L34,UNC ; [CPU_ALU] |348|
; branch occurs ; [] |348|
I count 37 instructions (excluding the BF ones at the end), 16 of which are DP loads. I expect DP to need to load when changing between access to the variables and the pointers, but some of these are obviously unnecessary, right?
My optimization level is set to -O2 (tried -O3, doesn't seem to affect anything about this code). I haven't been able to get -O4 to build, getting errors regarding code which a colleague wrote and haven't been able to figure out...
I've seen post-link optimization mentioned in some other e2e posts and some TI literature. I'm unsure if I'm building with post-link optimization. In the project settings, I see the same window as vesgine reported in this thread (just the --plink_advice_only flag).
Here are my flags for the compiler and linker (when using O3):
-v28 -ml -mt --cla_support=cla1 --float_support=fpu32 --tmu_support=tmu0 --vcu_support=vcu2 -O3 --opt_for_speed=5 --fp_mode=relaxed --include_path="C:/Workspace/Git/RFPA_Radden/MCU/Radden" --include_path="C:/Workspace/Git/RFPA_Radden/MCU/Radden/IIR coefficient sets/" --include_path="C:/Workspace/Git/RFPA_Radden/MCU/Radden/Radden_source/" --include_path="C:/Workspace/Git/RFPA_Radden/MCU/Radden/trivial_chainloader_source/" --include_path="C:/Workspace/Git/RFPA_Radden/MCU/Radden/uart telemetry source/" --include_path="C:/ti/c2000/C2000Ware_1_00_05_00/device_support/f2837xd/common/source" --include_path="C:/ti/c2000/C2000Ware_1_00_05_00/driverlib/f2837xd/driverlib" --include_path="C:/ti/c2000/C2000Ware_1_00_05_00/device_support/f2837xd/headers/include" --include_path="C:/ti/c2000/C2000Ware_1_00_05_00/device_support/f2837xd/common/include" --include_path="C:/ti/c2000/C2000Ware_1_00_05_00/libraries/calibration/hrpwm/f2837xd/include" --include_path="C:/ti/c2000/C2000Ware_1_00_05_00/device_support/f2837xd/common/include" --include_path="C:/ti/ccs1040/ccs/tools/compiler/ti-cgt-c2000_20.2.5.LTS/include" --advice:performance=all --define=_FLASH --define=GOLDEN_IMAGE --define=_SHORT_LOGS --define=_DUAL_HEADERS --define=CPU1 -g --diag_warning=225 --diag_wrap=off --display_error_number --issue_remarks --gen_func_subsections=on --abi=coffabi -k --asm_listing --c_src_interlist
-v28 -ml -mt --cla_support=cla1 --float_support=fpu32 --tmu_support=tmu0 --vcu_support=vcu2 -O3 --opt_for_speed=5 --fp_mode=relaxed --advice:performance=all --define=_FLASH --define=GOLDEN_IMAGE --define=_SHORT_LOGS --define=_DUAL_HEADERS --define=CPU1 -g --diag_warning=225 --diag_wrap=off --display_error_number --issue_remarks --gen_func_subsections=on --abi=coffabi -k --asm_listing --c_src_interlist -z -m"Radden_golden.map" --stack_size=0x800 --warn_sections -i"C:/ti/ccs1040/ccs/tools/compiler/ti-cgt-c2000_20.2.5.LTS/lib" -i"C:/ti/c2000/C2000Ware_1_00_05_00/driverlib/f2837xd/driverlib/inc" -i"C:/ti/c2000/C2000Ware_1_00_05_00/driverlib/f2837xd/driverlib" -i"C:/ti/ccs1040/ccs/tools/compiler/ti-cgt-c2000_20.2.5.LTS/include" --reread_libs --define=CLA_C=1 --define=GOLDEN_IMAGE --diag_wrap=off --display_error_number --xml_link_info="Radden_RFPA_MCU_linkInfo.xml" --rom_model
Also, one thing I noticed is that when I try to build with optimization set to O4, the following pops up in the Advice window, for many files including the one with the relevant code:
Description Resource Path Location #1463-D Link-time optimization is disabled for this file due to the use of inline assembly main_control_loop_V201.c /Radden_RFPA_MCU/Radden_source line 185
Seems like an interesting clue. Does Link-time optimization only happen at O4? I think I recall that being mentioned somewhere, but can't find it now.
There are several other examples of seemingly-excessive DP loads, I'm betting there's some common solution to all of them. Any help would be appreciated.
Regards,
Mike