This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

MACF32 on F28335

Other Parts Discussed in Thread: CONTROLSUITE, TMS320F28335

Hi,

I need to periodically perform 128 multiply-and-accumulate operations on an F28335. The issue is that the current execution speed is much too low.

I read about the MACF32 instruction, which seems to be what I am looking for. As instructed in the ControlSuite example, I enabled fpu32 support and added "rts2800_fpu32.lib" to the file search path in the project settings. Additionally, I changed the optimization level to 3. Unfortunately, the instructions in the following loop were not converted to a MACF32 instruction in the disassembly view.

// end,i -> integer
// a     -> float
// b,c   -> float array

for (i=0; i<end; i++){
   
   a += (b[i] * c[i]);

} 

Any suggestions ? Thank you in advance.

~Brian

  • Brian Walsh said:
    Additionally I changed the optimization level to 3. Unfortunately, the instructions in the following loop did not convert to a MACF32 instruction in the disassembly view.

    I just created the following test case for a TMS320F28335 in CCS 6.1.3 using compiler v15.12.2.LTS

    /*
     * main.c
     */
    
    #define VECTOR_LENGTH 128
    float result;
    float vec_b[VECTOR_LENGTH];
    float vec_c[VECTOR_LENGTH];
    
    static float mac_test (float *const b, float *const c, const int end) /* Returns the sum of b[i] * c[i] for i in [0, end). */
    {
    	float a = 0.0f;	/* accumulator is a local variable, not a global */
    	int i;
    
    	for (i = 0; i < end; i++)
    	{
    		a += (b[i] * c[i]);	/* multiply-accumulate step; this is main.c line 17 in the assembler listing */
    	}
    
    	return a;
    }
    
    int main(void) {
    	result = mac_test (vec_b, vec_c, VECTOR_LENGTH);	/* store the sum of products of the two global vectors in the global result */
    	
    	return 0;
    }
    

    The Optimization Level was set to 3, floating point support was set to fpu32, and the Runtime support library was left at <automatic>.

    The generated assembler listing showed a MACF32 instruction generated:

    	.dwpsn	file "../main.c",line 17,column 3,is_stmt,isa 0
            ZERO      R7H                   ; [CPU_] |17| 
            RPT       #127
    ||      MACF32   R7H,R3H,*XAR4++,*XAR7++ ; [CPU_] |17| 
            ADDF32    R3H,R3H,R2H           ; [CPU_] |17| 
            ADDF32    R2H,R7H,R6H           ; [CPU_] |17| 
            MOVW      DP,#_result           ; [CPU_U] 
    

    The CCS project is attached TMS320F28335_MACF28.zip

    I haven't tested the program, but comparing it to yours may give a clue as to why you don't get the MACF32 instruction generated.

  • Hi Chester,

    thank you very much for your reply.

    Your example works! But if I do not use a separate function for the MAC, the loop is not optimized into a MACF32 instruction.

    #define VECTOR_LENGTH 128
    float result;
    float vec_b[VECTOR_LENGTH];
    float vec_c[VECTOR_LENGTH];
    
    
    int main(void) {
    
    	int i;
    
    	for (i = 0; i < VECTOR_LENGTH; i++)
    	{
    		a += (b[i] * c[i]);
    	}
    	
    	return 0;
    }
    


    What makes the difference ?

    Thank you.

    ~Brian

  • Brian Walsh said:
    What makes the difference ?

    To help investigate I enabled the source interlist "Generated interlisted assembly file (--src_interlist, -s)" which adds comments about the translated C code.

    For the code in the mac_test function, the compiler translated the loop to use an (undocumented) __parallel_mpy_add_f32 intrinsic, which was generated as a MACF32 instruction.

    I think the difference is caused by whether the MAC result is accumulated in a local variable or directly in a global variable.

    With the main function example accumulating the result in a global variable:

    #define VECTOR_LENGTH 128
    float result;
    float vec_b[VECTOR_LENGTH];
    float vec_c[VECTOR_LENGTH];
    
    int main(void) {
        int i;
    
        result = 0.0f;	/* in this variant the global itself is the accumulator */
        for (i = 0; i < VECTOR_LENGTH; i++)
        {
        	result += (vec_b[i] * vec_c[i]);	/* updates the global on every iteration (main.c line 16 in the listing) */
        }
    
    	return 0;
    }
    

    There is no MACF32 instruction generated:

    ;*** 13	-----------------------    result = 0.0F;
    ;***  	-----------------------    U$13 = &vec_c[0];
    ;***  	-----------------------    U$9 = &vec_b[0];
    ;***  	-----------------------    #pragma MUST_ITERATE(128, 128, 128)
    ;***  	-----------------------    #pragma LOOP_FLAGS(4096u)
    ;***  	-----------------------    L$1 = 127;
    ;***	-----------------------g2:
    ;*** 16	-----------------------    result += *U$9++**U$13++;
    ;*** 14	-----------------------    if ( (--L$1) != (-1) ) goto g2;
    ;*** 19	-----------------------    return 0;
    	.dwpsn	file "../main.c",line 13,column 5,is_stmt,isa 0
            ZERO      R0H                   ; [CPU_] |13| 
            MOVW      DP,#_result           ; [CPU_U] 
            MOVB      XAR6,#127             ; [CPU_] 
            MOVL      XAR4,#_vec_c          ; [CPU_U] 
            MOVL      XAR5,#_vec_b          ; [CPU_U] 
            MOV32     @_result,R0H          ; [CPU_] |13| 
    	.dwpsn	file "../main.c",line 14,column 17,is_stmt,isa 0
            RPTB      $C$L2,AR6             ; [CPU_] |14| 
            ; repeat block starts ; [] 
    $C$L1:    
    	.dwpsn	file "../main.c",line 16,column 6,is_stmt,isa 0
            MOV32     R0H,*XAR5++           ; [CPU_] |16| 
            MOV32     R1H,*XAR4++           ; [CPU_] |16| 
            MPYF32    R1H,R1H,R0H           ; [CPU_] |16| 
            MOV32     R3H,@_result          ; [CPU_] |16| 
            ADDF32    R0H,R1H,R3H           ; [CPU_] |16| 
            NOP       ; [CPU_] 
            MOV32     @_result,R0H          ; [CPU_] |16| 
            ; repeat block ends ; [] 
    $C$L2:    
    

    The main function example was modified to accumulate the result in a local variable:

    #define VECTOR_LENGTH 128
    float result;
    float vec_b[VECTOR_LENGTH];
    float vec_c[VECTOR_LENGTH];
    
    int main(void) {
        int i;
        float temp;	/* local accumulator used instead of the global */
    
        temp = 0.0f;
        for (i = 0; i < VECTOR_LENGTH; i++)
        {
        	temp += (vec_b[i] * vec_c[i]);	/* accumulate into the local (main.c line 17 in the listing) */
        }
        result = temp;	/* single store to the global after the loop */
    
    	return 0;
    }
    

    Which resulted in a MACF32 instruction being generated:

    ;***  	-----------------------    U$13 = &vec_c[0];
    ;***  	-----------------------    U$9 = &vec_b[0];
    ;*** 14	-----------------------    temp = T$2 = 0.0F;
    ;***  	-----------------------    #pragma MUST_ITERATE(128, 128, 128)
    ;***  	-----------------------    #pragma LOOP_FLAGS(4096u)
    ;***  	-----------------------    L$1 = 127;
    ;***	-----------------------g2:
    ;*** 17	-----------------------    __parallel_mpy_add_f32(*U$9++, *U$13++, &(temp), &(T$2));
    ;*** 15	-----------------------    if ( (--L$1) != (-1) ) goto g2;
    ;*** 19	-----------------------    temp += T$2;
    ;*** 19	-----------------------    result = temp;
    ;*** 21	-----------------------    return 0;
            MOV32     *SP++,R6H             ; [CPU_] 
    	.dwpsn	file "../main.c",line 14,column 5,is_stmt,isa 0
            ZERO      R2H                   ; [CPU_] |14| 
            MOVL      XAR7,#_vec_c          ; [CPU_U] 
            MOVL      XAR4,#_vec_b          ; [CPU_U] 
    	.dwpsn	file "../main.c",line 17,column 6,is_stmt,isa 0
            ZERO      R6H                   ; [CPU_] |17| 
    	.dwpsn	file "../main.c",line 14,column 5,is_stmt,isa 0
            MOV32     R3H,R2H               ; [CPU_] |14| 
            MOV32     *SP++,R7H             ; [CPU_] 
    	.dwpsn	file "../main.c",line 21,column 2,is_stmt,isa 0
            MOVB      AL,#0                 ; [CPU_] |21| 
    	.dwpsn	file "../main.c",line 17,column 6,is_stmt,isa 0
            ZERO      R7H                   ; [CPU_] |17| 
            RPT       #127
    ||      MACF32   R7H,R3H,*XAR4++,*XAR7++ ; [CPU_] |17| 
            ADDF32    R3H,R3H,R2H           ; [CPU_] |17| 
            ADDF32    R2H,R7H,R6H           ; [CPU_] |17| 
            MOVW      DP,#_result           ; [CPU_U] 
    	.dwpsn	file "../main.c",line 19,column 5,is_stmt,isa 0
            ADDF32    R3H,R3H,R2H           ; [CPU_] |19| 
            MOV32     R7H,*--SP             ; [CPU_] 
            MOV32     @_result,R3H          ; [CPU_] |19| 
            MOV32     R6H,*--SP             ; [CPU_] 
    

    To get an answer about the reason for the difference, I suggest you ask the experts on the TI C/C++ Compiler forum. My guess is that the MACF32 instruction needs to accumulate its result in a register.