TMS320F28379D: CLA Loop unrolling injects unnecessary NOP instructions

Taylor Rogers

Part Number: TMS320F28379D

Tool/software:

I'm comparing a few different methods of passing data between the CLA co-processor and the C28x main processor. I'd like the data transfer to be extensible, so I want to use a memcpy-like routine to perform the copy. However, I've run into the expected issues with the CLA and its lack of an RPTB instruction. The methods I'm looking at are:

/*
 * Copy *in to *out one member at a time.
 *
 * Sketch only: the "..." lines stand for the assignments of the fields
 * between field_1 and field_n. Each member of *out receives the member of
 * the same name from *in.
 */
void element_by_element_copy(struct_t *in, struct_t *out){
    out->field_1 = in->field_1;
    ...
    ...
    out->field_n = in->field_n;
}

/*
 * Copy *in to *out as a sequence of 32-bit words using a plain for-loop.
 *
 * The loop runs sizeof(struct_t) / 2 times, i.e. it treats sizeof as a
 * count of 16-bit units (assumed for the C28x/CLA target - TODO confirm
 * for any other target) and expects the struct size to be a whole number
 * of 32-bit words.
 */
void memcpy(struct_t *in, struct_t *out){
    uint32_t *src = (uint32_t *)in;
    uint32_t *dst = (uint32_t *)out;  /* destination is *out, not *in */
    for (uint16_t i = 0; i < (sizeof(struct_t) / 2); i++){
        dst[i] = src[i];
    }
}

/*
 * Copy *in to *out as a sequence of 32-bit words, asking the compiler to
 * unroll the copy loop completely via the UNROLL pragma.
 *
 * The unroll factor and the loop bound are both sizeof(struct_t) / 2, i.e.
 * sizeof is treated as a count of 16-bit units (assumed for the C28x/CLA
 * target - TODO confirm for any other target).
 */
void unroll_memcpy(struct_t *in, struct_t *out){
    uint32_t *src = (uint32_t *)in;
    uint32_t *dst = (uint32_t *)out;  /* destination is *out, not *in */

    /* Unroll factor equals the trip count, so the loop is fully unrolled. */
    #pragma UNROLL(sizeof(struct_t) / 2)
    for (uint16_t i = 0; i < (sizeof(struct_t) / 2); i++){
        dst[i] = src[i];
    }
}

As expected, the element-by-element copy is the most performant, but I noticed that using the UNROLL pragma with a for-loop results in nearly the same code. The only difference seems to be that, when unrolling the loop, the CLA compiler duplicates the unnecessary MNOP instructions needed for the branch. See the sample assembly output below:

/* Assembly generation - Element by Element Copy */
        ; MAR0 assigned to dst_buff;
        
        ; Copy first element
        MMOV32    MR0,@src_buff         ; [CPU_FPU]
        MMOV32    *MAR0,MR0             ; [CPU_FPU]
	    ; Copy second element
        MMOV32    MR0,@src_buff+2       ; [CPU_FPU]
        MMOV32    *MAR0+[#2],MR0        ; [CPU_FPU]
	    ...
	    ...
	    ; Copy nth element
        MMOV32    MR0,@src_buff+n       ; [CPU_FPU]
        MMOV32    *MAR0+[#n],MR0        ; [CPU_FPU]
        
        
/* Assembly generation - Unrolled For Loop */
        ; MAR0 assigned to dst_buff;
        
        ; Copy 1st 32 bits
        MMOV32    MR0,@src_buff         ; [CPU_FPU]
        MMOV32    *MAR0,MR0             ; [CPU_FPU]
        MNOP      ; [CPU_FPU] 
        MNOP      ; [CPU_FPU] 
        MNOP      ; [CPU_FPU]
        ; Second 32 bits
        MMOV32    MR0,@src_buff+2       ; [CPU_FPU]
        MMOV32    *MAR0+[#2],MR0        ; [CPU_FPU]
        MNOP      ; [CPU_FPU] 
        MNOP      ; [CPU_FPU] 
        MNOP      ; [CPU_FPU]
        ...
	    ...
	    ; Copy nth 32 bits
        MMOV32    MR0,@src_buff+n       ; [CPU_FPU]
        MMOV32    *MAR0+[#n],MR0        ; [CPU_FPU]

So, my question is, is there any way to force the CLA compiler to remove these NOP instructions when unrolling the loop? I'm assuming the NOPs are injected in case the loop is only partially unrolled, but in our use case, it's preferable to fully unroll the loop as this needs to be optimized for speed rather than code size.

7 months ago

0 George Mock 7 months ago

TI__Guru**** 250710 points

I tried to reproduce your results. But I don't see ...

Taylor Rogers said:
the element-by-element copy is the most performant, but I noticed that using the UNROLL pragma with a for-loop results in nearly the same code.

Please attach one source file I can build down to assembly. Note it does not have to run. I only need to inspect the generated assembly. Copy and paste the text of the compiler options exactly as the compiler sees them. Do not use a screenshot. Also tell me the version of the compiler.

Thanks and regards,

-George

0 Taylor Rogers 7 months ago in reply to George Mock

Prodigy 30 points

Hi George,

Compiler version: v22.6.0.LTS

Compiler invocation:

cl2000 -v28 -ml -mt --cla_support=cla1 --float_support=fpu32 \
--idiv_support=none --isr_save_vcu_regs=off --tmu_support=tmu0 \
--vcu_support=vcu2 -O2 --opt_for_speed=2 --fp_mode=relaxed --fp_reassoc=off \
--include_path="C:/ti/ccs1200/ccs/tools/compiler/ti-cgt-c2000_22.6.0.LTS/include" \
--advice:performance=none -g --symdebug:dwarf_version=4 --c99 --relaxed_ansi \
--float_operations_allowed=all --fp_single_precision_constant --diag_warning=225 \
--diag_wrap=off --display_error_number --issue_remarks --quiet --abi=eabi \
--cla_background_task=off --cla_signed_compare_workaround=off \
--silicon_errata_fpu1_workaround=on --disable_inlining -k --parallel=8 \
--obj_directory="source/manual"  "copy_from_msgram.cla"

Not sure why, but it doesn't seem like I'm able to upload a file. I've placed the contents of my test file in the code block below instead (confirmed to compile, and I can verify that the generated assembly still shows the issue I'm discussing on my machine; hopefully on yours as well).

Setting the COPY_METHOD macro to ELEMENT_BY_ELEMENT, FOR_LOOP, UNROLL, or MANUAL_UNROLL shows the behavior I'm observing.

File: copy_from_msgram.cla

// Sample file for reproducing issues with CLA compiler for-loop UNROLLs


#include <stdint.h>
#include <float.h>


// Group of four float values; embedded in c28x_to_cla_t as member val_6.
typedef struct struct_group1 {
    float val_1;
    float val_2;
    float val_3;
    float val_4;
  } group1_t;

// Group of three float values; embedded in c28x_to_cla_t as member val_20.
typedef struct struct_group2 {
    float val_1;
    float val_2;
    float val_3;
  } group2_t;
// Sample data structure copied by copy_from_msgram(): a mix of uint16_t and
// float members plus one nested group1_t and one nested group2_t.
// The word-wise copy paths in copy_from_msgram() depend on the size of this
// type (its manual-unroll path expects sizeof == 48), so adding, removing or
// retyping members requires revisiting those paths.
typedef struct struct_c28x_to_cla {
    uint16_t val_1;
    float        val_2;
    float        val_3;
    float        val_4;
    float        val_5;
    group1_t     val_6;   // nested group of four floats
    uint16_t val_7;
    float        val_8;
    uint16_t val_9;
    float        val_10;
    float        val_11;
    float        val_12;
    float        val_13;
    uint16_t val_14;
    float        val_15;
    float        val_16;
    float        val_17;
    uint16_t val_18;
    uint16_t val_19;
    group2_t     val_20;  // nested group of three floats
} c28x_to_cla_t;

// Source buffer read by copy_from_msgram(); file-scope, so zero-initialized.
c28x_to_cla_t global_buff;


// Test Configurations
// Identifiers for the copy strategies implemented in copy_from_msgram().
#define ELEMENT_BY_ELEMENT 1  // one assignment per struct member
#define UNROLL 2              // for-loop preceded by #pragma UNROLL
#define FOR_LOOP 3            // plain for-loop over 32-bit words
#define MANUAL_UNROLL 4       // hand-written sequence of 32-bit word copies

// Strategy compiled into copy_from_msgram(). Any value other than UNROLL,
// FOR_LOOP or MANUAL_UNROLL falls through to the element-by-element path.
#define COPY_METHOD MANUAL_UNROLL

/*
 * Copy the contents of global_buff into *data_out.
 *
 * The copy strategy is chosen at compile time by COPY_METHOD:
 *   - UNROLL / FOR_LOOP : loop over the struct as uint32_t words; UNROLL
 *                         additionally places #pragma UNROLL before the loop.
 *   - MANUAL_UNROLL     : 24 explicit uint32_t assignments.
 *   - anything else     : one assignment per struct member.
 *
 * The word-wise paths use sizeof(c28x_to_cla_t) / 2 as the number of 32-bit
 * words, i.e. they treat sizeof as a count of 16-bit units (assumed for the
 * C28x/CLA target - TODO confirm before building for another target).
 *
 * NOTE(review): dst and src are not restrict-qualified. The reply in this
 * thread reports that the compiler then has to assume they may alias, and
 * that adding restrict to both declarations removes the extra MNOPs.
 *
 * data_out: destination structure; must not be NULL (it is dereferenced
 *           unconditionally).
 */
void copy_from_msgram(c28x_to_cla_t *data_out) {

#if (COPY_METHOD == UNROLL) || (COPY_METHOD == FOR_LOOP)
    // For Loop implementations
    // View both structures as arrays of 32-bit words.
    uint32_t *dst = (uint32_t *)data_out;
    uint32_t *src = (uint32_t *)&global_buff;

    #if (COPY_METHOD == UNROLL) // UNROLL
    // Unroll factor equals the loop trip count, requesting a full unroll.
    #pragma UNROLL(sizeof(c28x_to_cla_t) / 2) // Struct size is expected to always be a whole number of 32-bit words.
    #endif

    for (uint16_t i = 0; i < (sizeof(c28x_to_cla_t) / 2); i++){
        dst[i] = src[i];
    }

#elif (COPY_METHOD == MANUAL_UNROLL)
    // Manual unrolling
    // View both structures as arrays of 32-bit words.
    uint32_t *dst = (uint32_t *)data_out;
    uint32_t *src = (uint32_t *)&global_buff;

    // Expect sizeof(global_buff) == 48. So, 24 32 bit values to copy.
    // The word count is hard-coded here: it must be updated by hand if
    // c28x_to_cla_t changes size.
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[2];
    dst[3] = src[3];
    dst[4] = src[4];
    dst[5] = src[5];
    dst[6] = src[6];
    dst[7] = src[7];
    dst[8] = src[8];
    dst[9] = src[9];
    dst[10] = src[10];
    dst[11] = src[11];
    dst[12] = src[12];
    dst[13] = src[13];
    dst[14] = src[14];
    dst[15] = src[15];
    dst[16] = src[16];
    dst[17] = src[17];
    dst[18] = src[18];
    dst[19] = src[19];
    dst[20] = src[20];
    dst[21] = src[21];
    dst[22] = src[22];
    dst[23] = src[23];

#else 
    /*Element by Element Assignment */
    /* One assignment per member, in declaration order; nested groups are
       copied member by member as well. */
    data_out->val_1        = global_buff.val_1;
    data_out->val_2        = global_buff.val_2;
    data_out->val_3        = global_buff.val_3;
    data_out->val_4        = global_buff.val_4;
    data_out->val_5        = global_buff.val_5;
    data_out->val_6.val_1  = global_buff.val_6.val_1;
    data_out->val_6.val_2  = global_buff.val_6.val_2;
    data_out->val_6.val_3  = global_buff.val_6.val_3;
    data_out->val_6.val_4  = global_buff.val_6.val_4;
    data_out->val_7        = global_buff.val_7;
    data_out->val_8        = global_buff.val_8;
    data_out->val_9        = global_buff.val_9;
    data_out->val_10       = global_buff.val_10;
    data_out->val_11       = global_buff.val_11;
    data_out->val_12       = global_buff.val_12;
    data_out->val_13       = global_buff.val_13;
    data_out->val_14       = global_buff.val_14;
    data_out->val_15       = global_buff.val_15;
    data_out->val_16       = global_buff.val_16;
    data_out->val_17       = global_buff.val_17;
    data_out->val_18       = global_buff.val_18;
    data_out->val_19       = global_buff.val_19;
    data_out->val_20.val_1 = global_buff.val_20.val_1;
    data_out->val_20.val_2 = global_buff.val_20.val_2;
    data_out->val_20.val_3 = global_buff.val_20.val_3;
#endif

} /* End of function copy_from_msgram */

Thanks for looking into this with me,

- Taylor

+1 George Mock 7 months ago in reply to Taylor Rogers

TI__Guru**** 250710 points

I found a workaround. But I can only give a partial explanation for it.

Add the restrict keyword to the definition of the pointers ...

    uint32_t * restrict dst = (uint32_t *)data_out;
    uint32_t * restrict src = (uint32_t *)&global_buff;

Without that, the compiler must presume expressions like dst[0] and src[1] could reference the same address. If they refer to the same address, then the extra MNOP instructions are required to allow the write to dst[0] to complete before the read of src[1] can begin. I don't know the pipeline behavior of CLA well enough to give more specific details.

Thanks and regards,

-George

C2000™︎ microcontrollers

C2000 microcontrollers forum

TMS320F28379D: CLA Loop unrolling injects unnecessary NOP instructions