Tool/software:
I'm comparing a few different methods of passing data between the CLA co-processor and the C28x main processor. I'd like the data transfer to be extensible, and would therefore prefer to use a memcpy-like routine to perform the copy. However, I've run into the expected issues with the CLA and its lack of an RPTB instruction. The methods I'm looking at are:
/* Method 1: copy every field with its own explicit assignment. */
void element_by_element_copy(struct_t *in, struct_t *out)
{
    out->field_1 = in->field_1;
    /* ... one assignment per remaining field ... */
    out->field_n = in->field_n;
}

/*
 * Method 2: memcpy-like copy of the whole struct as 32-bit words using a
 * plain for-loop.
 * NOTE: the name is the one used in this sketch; it clashes with the
 * standard library memcpy, so real code would need a different name.
 */
void memcpy(struct_t *in, struct_t *out)
{
    uint32_t *src = (uint32_t *)in;
    uint32_t *dst = (uint32_t *)out;   /* destination must come from 'out' */

    /* Number of 32-bit words in the struct. */
    for (uint16_t i = 0; i < (sizeof(struct_t) / sizeof(uint32_t)); i++) {
        dst[i] = src[i];
    }
}

/*
 * Method 3: same loop as above, but asking the compiler to unroll it once
 * per 32-bit word via the UNROLL pragma.
 */
void unroll_memcpy(struct_t *in, struct_t *out)
{
    uint32_t *src = (uint32_t *)in;
    uint32_t *dst = (uint32_t *)out;   /* destination must come from 'out' */

    #pragma UNROLL(sizeof(struct_t) / sizeof(uint32_t))
    for (uint16_t i = 0; i < (sizeof(struct_t) / sizeof(uint32_t)); i++) {
        dst[i] = src[i];
    }
}
As expected, the element-by-element copy is the most performant, but I noticed that using the UNROLL pragma with a for-loop results in nearly the same code. The only difference seems to be that, when unrolling the loop, the CLA compiler duplicates the now-unnecessary MNOP instructions that were needed for the branch. See sample assembly output below:
/* Assembly generation - Element by Element Copy */ ; MAR0 assigned to dst_buff; ; Copy first element MMOV32 MR0,@src_buff ; [CPU_FPU] MMOV32 *MAR0,MR0 ; [CPU_FPU] ; Copy second element MMOV32 MR0,@src_buff+2 ; [CPU_FPU] MMOV32 *MAR0+[#2],MR0 ; [CPU_FPU] ... ... ; Copy nth element MMOV32 MR0,@src_buff+n ; [CPU_FPU] MMOV32 *MAR0+[#n],MR0 ; [CPU_FPU] /* Assembly generation - Unrolled For Loop */ ; MAR0 assigned to dst_buff; ; Copy 1st 32 bits MMOV32 MR0,@src_buff ; [CPU_FPU] MMOV32 *MAR0,MR0 ; [CPU_FPU] MNOP ; [CPU_FPU] MNOP ; [CPU_FPU] MNOP ; [CPU_FPU] ; Second 32 bits MMOV32 MR0,@src_buff+2 ; [CPU_FPU] MMOV32 *MAR0+[#2],MR0 ; [CPU_FPU] MNOP ; [CPU_FPU] MNOP ; [CPU_FPU] MNOP ; [CPU_FPU] ... ... ; Copy nth 32 bits MMOV32 MR0,@src_buff+n ; [CPU_FPU] MMOV32 *MAR0+[#n],MR0 ; [CPU_FPU]
So, my question is, is there any way to force the CLA compiler to remove these NOP instructions when unrolling the loop? I'm assuming the NOPs are injected in case the loop is only partially unrolled, but in our use case, it's preferable to fully unroll the loop as this needs to be optimized for speed rather than code size.
I tried to reproduce your results. But I don't see ...
the element-by-element copy is the most performant, but I noticed that using the UNROLL pragma with a for-loop results in nearly the same code.
Please attach one source file I can build down to assembly. Note it does not have to run. I only need to inspect the generated assembly. Copy and paste the text of the compiler options exactly as the compiler sees them. Do not use a screenshot. Also tell me the version of the compiler.
Thanks and regards,
-George
Hi George,
Compiler version: v22.6.0.LTS
Compiler invocation:
cl2000 -v28 -ml -mt --cla_support=cla1 --float_support=fpu32 \ --idiv_support=none --isr_save_vcu_regs=off --tmu_support=tmu0 \ --vcu_support=vcu2 -O2 --opt_for_speed=2 --fp_mode=relaxed --fp_reassoc=off \ --include_path="C:/ti/ccs1200/ccs/tools/compiler/ti-cgt-c2000_22.6.0.LTS/include" \ --advice:performance=none -g --symdebug:dwarf_version=4 --c99 --relaxed_ansi \ --float_operations_allowed=all --fp_single_precision_constant --diag_warning=225 \ --diag_wrap=off --display_error_number --issue_remarks --quiet --abi=eabi \ --cla_background_task=off --cla_signed_compare_workaround=off \ --silicon_errata_fpu1_workaround=on --disable_inlining -k --parallel=8 \ --obj_directory="source/manual" "copy_from_msgram.cla"
Not sure why, but it doesn't seem like I'm able to upload a file. I've placed the contents of my test file in the code block below instead (confirmed to compile and can verify the generated assembly results still produce the issue I'm discussing on my machine, hopefully on yours as well.).
Setting the COPY_METHOD macro to ELEMENT_BY_ELEMENT , FOR_LOOP, UNROLL, or MANUAL_UNROLL shows the behavior I'm observing.
File: copy_from_msgram.cla
// Sample file for reproducing issues with CLA compiler for-loop UNROLLs
#include <stdint.h>
#include <float.h>

typedef struct struct_group1 {
    float val_1;
    float val_2;
    float val_3;
    float val_4;
} group1_t;

typedef struct struct_group2 {
    float val_1;
    float val_2;
    float val_3;
} group2_t;

typedef struct struct_c28x_to_cla {
    uint16_t val_1;
    float    val_2;
    float    val_3;
    float    val_4;
    float    val_5;
    group1_t val_6;
    uint16_t val_7;
    float    val_8;
    uint16_t val_9;
    float    val_10;
    float    val_11;
    float    val_12;
    float    val_13;
    uint16_t val_14;
    float    val_15;
    float    val_16;
    float    val_17;
    uint16_t val_18;
    uint16_t val_19;
    group2_t val_20;
} c28x_to_cla_t;

// Source buffer that copy_from_msgram() reads from.
c28x_to_cla_t global_buff;

// Test Configurations
#define ELEMENT_BY_ELEMENT 1
#define UNROLL             2
#define FOR_LOOP           3
#define MANUAL_UNROLL      4

// Selects which copy implementation is compiled into copy_from_msgram().
#define COPY_METHOD MANUAL_UNROLL

/*
 * Copy the contents of global_buff into *data_out.
 *
 * The implementation is chosen at compile time through COPY_METHOD:
 *   FOR_LOOP            - 32-bit word copy in a for-loop
 *   UNROLL              - same loop, preceded by #pragma UNROLL
 *   MANUAL_UNROLL       - one hand-written 32-bit word assignment per word
 *   any other value     - one assignment per struct field
 *
 * data_out: destination struct; must not be NULL.
 */
void copy_from_msgram(c28x_to_cla_t *data_out)
{
#if (COPY_METHOD == UNROLL) || (COPY_METHOD == FOR_LOOP)
    // For Loop implementations
    uint32_t *dst = (uint32_t *)data_out;
    uint32_t *src = (uint32_t *)&global_buff;

    // The word count is derived from sizeof(uint32_t) rather than a literal 2,
    // so it does not depend on how many addressable units a 32-bit word spans.
    // The struct size is expected to always be a multiple of 32 bits.
#if (COPY_METHOD == UNROLL)
    // UNROLL
    #pragma UNROLL(sizeof(c28x_to_cla_t) / sizeof(uint32_t))
#endif
    for (uint16_t i = 0; i < (sizeof(c28x_to_cla_t) / sizeof(uint32_t)); i++) {
        dst[i] = src[i];
    }
#elif (COPY_METHOD == MANUAL_UNROLL)
    // Manual unrolling
    // The 24 assignments below are only correct while the struct is exactly
    // 24 32-bit words long; fail the build (negative array size) if the
    // struct layout ever changes.
    typedef char manual_unroll_size_check[
        (sizeof(c28x_to_cla_t) == 24 * sizeof(uint32_t)) ? 1 : -1];

    uint32_t *dst = (uint32_t *)data_out;
    uint32_t *src = (uint32_t *)&global_buff;

    dst[0]  = src[0];
    dst[1]  = src[1];
    dst[2]  = src[2];
    dst[3]  = src[3];
    dst[4]  = src[4];
    dst[5]  = src[5];
    dst[6]  = src[6];
    dst[7]  = src[7];
    dst[8]  = src[8];
    dst[9]  = src[9];
    dst[10] = src[10];
    dst[11] = src[11];
    dst[12] = src[12];
    dst[13] = src[13];
    dst[14] = src[14];
    dst[15] = src[15];
    dst[16] = src[16];
    dst[17] = src[17];
    dst[18] = src[18];
    dst[19] = src[19];
    dst[20] = src[20];
    dst[21] = src[21];
    dst[22] = src[22];
    dst[23] = src[23];
#else
    /* Element by Element Assignment */
    data_out->val_1 = global_buff.val_1;
    data_out->val_2 = global_buff.val_2;
    data_out->val_3 = global_buff.val_3;
    data_out->val_4 = global_buff.val_4;
    data_out->val_5 = global_buff.val_5;
    data_out->val_6.val_1 = global_buff.val_6.val_1;
    data_out->val_6.val_2 = global_buff.val_6.val_2;
    data_out->val_6.val_3 = global_buff.val_6.val_3;
    data_out->val_6.val_4 = global_buff.val_6.val_4;
    data_out->val_7 = global_buff.val_7;
    data_out->val_8 = global_buff.val_8;
    data_out->val_9 = global_buff.val_9;
    data_out->val_10 = global_buff.val_10;
    data_out->val_11 = global_buff.val_11;
    data_out->val_12 = global_buff.val_12;
    data_out->val_13 = global_buff.val_13;
    data_out->val_14 = global_buff.val_14;
    data_out->val_15 = global_buff.val_15;
    data_out->val_16 = global_buff.val_16;
    data_out->val_17 = global_buff.val_17;
    data_out->val_18 = global_buff.val_18;
    data_out->val_19 = global_buff.val_19;
    data_out->val_20.val_1 = global_buff.val_20.val_1;
    data_out->val_20.val_2 = global_buff.val_20.val_2;
    data_out->val_20.val_3 = global_buff.val_20.val_3;
#endif
} /* End of function copy_from_msgram */
Thanks for looking into this with me,
- Taylor
I found a workaround. But I can only give a partial explanation for it.
Add the restrict keyword to the definition of the pointers ...
uint32_t * restrict dst = (uint32_t *)data_out; uint32_t * restrict src = (uint32_t *)&global_buff;
Without that, the compiler must presume expressions like dst[0] and src[1] could reference the same address. If they refer to the same address, then the extra MNOP instructions are required to allow the write to dst[0] to complete before the read of src[1] can begin. I don't know the pipeline behavior of CLA well enough to give more specific details.
Thanks and regards,
-George