This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

About writting better code for optimization

Hi there,

I'm testing the speed optimization in max level for de MSP430 Compiler 4.1, using CCS 5.4.

I coded a simple API pattern to use with GPIO. In the main loop, the code just toggles 3 leds in sequence. So I was hoping that, using max speed optimization level, the assembly code for the loop would be just XORs. But I get also in the assembly some pointer indirection overhead. I've tried to declare "const" as much as possible, but yet the compiler don't optimizes the pointer indirection.

Is there any other thing I should do to guarantee optimization?

The code below is my code example. It can be directly compiled in "main.c". It was intended for the experimenter's board. As you can see, the main loop just call the routines to toggle the leds.

So, the specific question is: Why can't the compiler optimize the function SetPins to an assembly XOR?


#include "msp430xg46x.h"
#include <stdint.h>
#include <stdlib.h>

/*
 * hal_gpio.h
 */
typedef struct Pin Pin;

typedef struct GPIO8 {
	void (* ConfigPinsAsGPIOOutput) (const Pin * const pin);
	void (* SetPins) (const Pin * const pin);
} GPIO8;

/*
 * hal_gpio.c
 */
typedef struct Pin {
	const void * hiddenPin;
} Pin;

/*
 * msp_gpio.h
 */
struct MSP_GPIO_t {
	GPIO8 GPIO8;
};

const struct MSP_GPIO_t MSP_GPIO;

/*
 * msp_gpio.c
 */
typedef struct Regs {
	volatile unsigned char * const dir;
	volatile unsigned char * const sel;
	volatile unsigned char * const out;
} Regs;

const Regs vRegs[] = {
	{
		&P2DIR,
		&P2SEL,
		&P2OUT},
	{
		&P5DIR,
		&P5SEL,
		&P5OUT},
};

typedef struct MspPin{
	const Regs * pRegs;
	uint8_t bitmask;
} MspPin;

void _ConfigPinsAsGPIOOutput(const MspPin * const pin) {
	*(pin->pRegs->dir) |= pin->bitmask;
	*(pin->pRegs->sel) &= ~(pin->bitmask);
}

void _SetPins(const MspPin * const pin) {
	*(pin->pRegs->out) ^= pin->bitmask;
}

void ConfigPinsAsGPIOOutput(const Pin * const args) {
	_ConfigPinsAsGPIOOutput(args->hiddenPin);
}
void SetPins(const Pin * const args) {
	_SetPins(args->hiddenPin);
}


const struct MSP_GPIO_t MSP_GPIO = {
	{
		&ConfigPinsAsGPIOOutput,
		&SetPins,}
};

/*
 * main.c
 */
volatile uint16_t i;

const GPIO8 * const Porta =  &MSP_GPIO.GPIO8;

const MspPin mp1 = {&vRegs[0], BIT1};
const MspPin mp2 = {&vRegs[0], BIT2};
const MspPin mp4 = {&vRegs[1], BIT1};

const Pin p1 = {&mp1};
const Pin p2 = {&mp2};
const Pin p4 = {&mp4};

int main(void)
{
	volatile uint16_t j;


	WDTCTL = WDTPW +WDTHOLD;              
	
    Porta->ConfigPinsAsGPIOOutput(&p1);
    Porta->ConfigPinsAsGPIOOutput(&p2);
    Porta->ConfigPinsAsGPIOOutput(&p4);

while(1) {
			Porta->SetPins(&p1);
			Porta->SetPins(&p2);
			Porta->SetPins(&p4);
			Porta->SetPins(&p1);
			Porta->SetPins(&p2);
			Porta->SetPins(&p4);
			Porta->SetPins(&p1);
			Porta->SetPins(&p2);
			Porta->SetPins(&p4);
			Porta->SetPins(&p1);
			Porta->SetPins(&p2);
			Porta->SetPins(&p4);
		}
}

EDIT: I've also tried inlining the functions and setting some advanced optimization options, but with no different results.
  • #include "msp430xg46x.h"
    #include <stdint.h>
    #include <stdlib.h>

    #define set_pin1 P2OUT ^= 2
    #define set_pin2 P2OUT ^= 4
    #define set_pin4 P5OUT ^= 2

    /*
     * hal_gpio.h
     */
    typedef struct Pin Pin;

    typedef struct GPIO8 {
        void (* ConfigPinsAsGPIOOutput) (const Pin * const pin);
        void (* SetPins) (const Pin * const pin);
    } GPIO8;

    /*
     * hal_gpio.c
     */
    typedef struct Pin {
        const void * hiddenPin;
    } Pin;

    /*
     * msp_gpio.h
     */
    struct MSP_GPIO_t {
        GPIO8 GPIO8;
    };

    const struct MSP_GPIO_t MSP_GPIO;

    /*
     * msp_gpio.c
     */
    typedef struct Regs {
        volatile unsigned char * const dir;
        volatile unsigned char * const sel;
        volatile unsigned char * const out;
    } Regs;

    const Regs vRegs[] = {
        {
            &P2DIR,
            &P2SEL,
            &P2OUT},
        {
            &P5DIR,
            &P5SEL,
            &P5OUT},
    };

    typedef struct MspPin{
        const Regs * pRegs;
        uint8_t bitmask;
    } MspPin;

    void _ConfigPinsAsGPIOOutput(const MspPin * const pin) {
        *(pin->pRegs->dir) |= pin->bitmask;
        *(pin->pRegs->sel) &= ~(pin->bitmask);
    }

    void _SetPins(const MspPin * const pin) {
        *(pin->pRegs->out) ^= pin->bitmask;
    }

    void ConfigPinsAsGPIOOutput(const Pin * const args) {
        _ConfigPinsAsGPIOOutput(args->hiddenPin);
    }
    void SetPins(const Pin * const args) {
        _SetPins(args->hiddenPin);
    }


    const struct MSP_GPIO_t MSP_GPIO = {
        {
            &ConfigPinsAsGPIOOutput,
            &SetPins,}
    };

    /*
     * main.c
     */
    volatile uint16_t i;

    const GPIO8 * const Porta =  &MSP_GPIO.GPIO8;

    const MspPin mp1 = {&vRegs[0], BIT1};
    const MspPin mp2 = {&vRegs[0], BIT2};
    const MspPin mp4 = {&vRegs[1], BIT1};

    const Pin p1 = {&mp1};
    const Pin p2 = {&mp2};
    const Pin p4 = {&mp4};

    int main(void)
    {
        volatile uint16_t j;


        WDTCTL = WDTPW +WDTHOLD;              
        
        Porta->ConfigPinsAsGPIOOutput(&p1);
        Porta->ConfigPinsAsGPIOOutput(&p2);
        Porta->ConfigPinsAsGPIOOutput(&p4);

    while(1) {
           set_pin1; // Porta->SetPins(&p1);
           set_pin2; // Porta->SetPins(&p2);
           set_pin4; // Porta->SetPins(&p4);
           set_pin1; // Porta->SetPins(&p1);
           set_pin2; // Porta->SetPins(&p2);
           set_pin4; // Porta->SetPins(&p4);
           set_pin1; // Porta->SetPins(&p1);
           set_pin2; // Porta->SetPins(&p2);
           set_pin4; // Porta->SetPins(&p4);
           set_pin1; // Porta->SetPins(&p1);
           set_pin2; // Porta->SetPins(&p2);
           set_pin4; // Porta->SetPins(&p4);
         }
    }

  • Really there are solutions infinitely easier to the toggling LEDs task, but that's not my point, so I'll treat your answer as a joke. Sincerely, I think it was very funny!

    The point is about the compiler not optimizing the functions to a single call to P2OUT ^= 0x02, and so on. The compiler should have no doubts on doing that, because I'm explicitly declaring the things as constants. Moreover, we are advised to thrust the compiler and to don't try to be smarter than it.

    Thanks for answering (not for the answer!) (I could'n lose the joke :P)!

  • You caught me! My response was indeed a joke.

    I think the c-compiler is a joke and the way people use and worship c is even a bigger joke. Jokes begets jokes.

  • old_cow_yellow said:
    You caught me! My response was indeed a joke.

    This whole thread is a joke :)

    First, writing library for such an easy and straightforward task as GPIO bit access is joke. Library by definition is performance-inefficient, especially when it's about (port I/O) bit manipulations.

    old_cow_yellow said:
    I think the c-compiler is a joke

    It's not. Unless only thing you make is digital 1$ wristwatches and calculators to sell zillions of them. Today uC flash memory is so dirt cheap that building anything more complex than mentioned wristwatch using assembler is more or less a joke.

  • Ilmars said:
    First, writing library for such an easy and straightforward task as GPIO bit access is joke.

    Well, GPIO port pin access is the simplest way to measure timings (didn't you just suggest it in a different thread?) and the meaning of this sample code was to test compiler optimization in general, not to optimize  I/O access.

    However, the tricky thing is that access to hardware registers is excluded form optimization, because they are declared volatile. And they have to, since reading or writing them may have side-effects and therefore the compiler must not optimize any access to them but instead perform any access in the exact order and number as written in the source code.

    So for testing optimization, use the port pins only to frame the test, and write the test so it doesn't access any hardware registers or contains other optimization blockers (like calling functions or using inline assembly)

  • Hi Jens-Michael, I appreciate very much your answer.

    The point is that, if you see the assembly of the loop part of the code, you get:

    Porta->SetPins(Led2):
     0031d0: 09CC           MOVA R9,R12
     0031d2: 1380 35B4      CALLA &0x035b4

    SetPins:
     00356a: 4C2F           MOV.W @R12,R15
     00356c: 4F2E           MOV.W @R15,R14
     00356e: 4E1E 0004      MOV.W 0x0004(R14),R14
     003572: EFDE 0002 0000 XOR.B 0x0002(R15),0x0000(R14)

    But I wanted:

    Porta->SetPins(Led2):
    SetPins:

    003572: EFDE 0002 0000 XOR.B 0x0002(R15),0x0000(R14)


    So, as I forced every part of the code to be "const", I hoped the compiler could predict the behaviour of the calls to Port->SetPins(LedX) and simplify those calls to a single XOR, without the overhead of pointer indirection and function call.

    The volatile says to the compiler that it can't simplify the register access, what is perfectly OK. But that shouldn't prohibit it to optimize such predictable function calls and predictable indirections.

    My question is: Am I missing something, in the way I could get the optimization I want, or it is an issue with the compiler?

    About my design: my final objective is not to toggle LEDs, but I want to abstract the complete hardware in an efficient way. I have already an abstraction layer, and I am refactoring it. And if the compiler can't optimize my code, it will frustrate me, because for my organization, it is really interesting to have such an abstraction layer. We haven't looked forward an operational system, but any suggestion would be also very appreciated.

  • Bruno Schneider said:

    Is there any other thing I should do to guarantee optimization?

    Yes. If you need speed, code your project directly in assembler, without using compilers. If you are know what are you doing, optimization is guaranteed.

  • OK zrno soli, I understand. I think my question was not carefully written. It should be:

    "Is there any other thing I should do in my C code to guarantee optimization?"

    And maybe I can ask: Is there any optimization option that I should try? As I've said before, I've tried some options but didn't get the expected result.

  • Bruno Schneider said:
    So, as I forced every part of the code to be "const", I hoped the compiler could predict the behaviour of the calls to Port->SetPins(LedX) and simplify those calls to a single XOR, without the overhead of pointer indirection and function call.

    Well.. Thing is that compiler does not know - other parts of code (other modules) will be calling particular function or not. So it can't omit, "optimize" any function unless you explicitly tell it to do so using inline directive:

    http://www.greenend.org.uk/rjk/tech/inline.html

    Bruno Schneider said:
    but I want to abstract the complete hardware in an efficient way

    Effective way? You are putting addresses of registers into memory (structure), then referencing those registers indirectly. Why? I do not see any point of doing that in real world application because I/O port pins are connected where they are, connections usually do not change so same code/functions rarely will access port2 and port5 registers. Then why use indirect port register addressing at all? Here I completely agree to old_cow: to abstract register access, use defines. One solution would be to define "pseudo functions". Change those:

    void _SetPins(const MspPin * const pin) {
    	*(pin->pRegs->out) ^= pin->bitmask;
    }
    void SetPins(const Pin * const args) {
    	_SetPins(args->hiddenPin);
    }
    
    to something like this:
    #define _SetPins(a) (*((a)->pRegs->out) ^= (a)->bitmask)
    #define SetPins(a) _SetPins((a)->hiddenPin)

    This hopefully will give result closer to what you expect.

    One more consideration: offset of port registers are the same for all GPIO ports, (&P1DIR - &P1SEL) == (&P2DIR - &P2SEL). Check datasheet of your part.

  • Ilmars said:
    Well.. Thing is that compiler does not know - other parts of code (other modules) will be calling particular function or not. So it can't omit, "optimize" any function unless you explicitly tell it to do so using inline directive:

    http://www.greenend.org.uk/rjk/tech/inline.html

    I tried inlining all the functions (also with __inline), but the compiler yet generates the function calls. I tried then declaring everything "static" to tell the compiler they will not be called from outside. But no effect.

    After reading the paper on the link, maybe I found an answer... Maybe the compiler can't inline the functions because I'm using references to them when I define the API. What do you think of it?

    Ilmars said:
    You are putting addresses of registers into memory (structure), then referencing those registers indirectly. Why?

    I didn't wanted to explain my code, because the point of my question is about why the compiler is not optimizing my code as expected (and maybe it is not optimizing as expected because I'm expecting something absurd from the compiler, but, if so, please tell me). But, to clarify the quoted question, I'm doing this because the call to the API function "Porta->SetPins(&p1)" needs to know what really it is going to be set (actually, toggle, sorry for this). With this structure, I need to code only one function, and not a function for each pin to be toggled (toggling is just a short example, is just one line of code, but think on maintaining repeated definitions of big functions. To maintain lots of small similar functions would be not good also.).

    Ilmars said:
    One solution would be to define "pseudo functions".

    I thought that if I declared a function "inline" it would behave the same as with #define. But as the compiler is not inlining my functions, I shall be wrong. I will try this, although I don't like much to use preprocessor macros; I think there are more polite alternatives in C, like const and inline.

    This question yet intrigues me. I'll be out on the weekend, so maybe I'll read the forum again just by sunday night.

    Thanks for the nice comments, and have a nice weekend!

  • You shall check compiler documentation. I found following:

    Functions can be inlined at --opt_level=3 if the function body is visible in the same module or if -pm is also used and the function is visible in one of the modules being compiled. Functions may be inlined at link time if the file containing the definition and the call site were both compiled with --opt_level=4.

  • Bruno Schneider said:
    "Is there any other thing I should do in my C code to guarantee optimization?"

    You can't guarantee optimization.Optimization means there is something found to be optimized. if nothing is found that can be optimized, no optimization will take place.

    You can only prohibit optimization.

    If you don't want to prohibit optimization, you must not place any optimization blockers. Optimization blockers are inline assembly, use of volatile variables (optimization cannot cross the usage of a volatile) or calling functions.
    How much each one block optimization depends on the actual compiler code and may vary even between minor compiler revisions.

  • Hi!

    Please check that you are not enabling options used to select symbolic debugging or profiling, beause you are using MSP430 CGT v4.1.x.

    From MSP430 Optimizing C/C++ Compiler v4.1 User's Guide (SLAU132G):

    --symdebug:dwarf Generates directives that are used by the C/C++ source-level Debugger and enables assembly source debugging in the assembler. The --symdebug:dwarf option's short form is -g. The --symdebug:dwarf Option disables many code generator optimizations, because they disrupt the debugger. You can use the --symdebug:dwarf option with the --opt_level (aliased as -O) option to maximize the amount of optimization that is compatible with debugging.

    Using -g generally lowers the default optimization level and you could always get yet more optimization by removing the debug switch -g.

    Starting with MSP430 CGT v4.2.x compiler optimization and generation of debug information are now fully independent. There is a wiki worth reading titled Debug versus Optimization Trade-off.

    But you should always have in mind that even optimizer can make mistakes and enabling optimization may cause defects and incorrect code. There are also known issues concerning the optimizer:

    From MSP430 CGT v4.1.7 DefectHistory.txt:

    ------------------------------------------------------------------------------
    FIXED  SDSCM00045550
    ------------------------------------------------------------------------------
    Summary            : Truncated pointer created by cast from integer constant
    Fixed in           : 4.1.5
    Severity           : S2 - Major
    Affected Component : Optimizer

    From MSP430 CGT v4.2.2 DefectHistory.txt:

    ------------------------------------------------------------------------------
    KNOWN ISSUE  SDSCM00045550
    ------------------------------------------------------------------------------

    Summary            : Compiler generates incorrect code with -o3/-o4 and
                         --opt_for_speed=4
    Affected Component : Optimizer

    ------------------------------------------------------------------------------
    KNOWN ISSUE  SDSCM00047272
    ------------------------------------------------------------------------------
    Summary            : Compiler optimization issue
    Affected Component : C/C++ Compiler (cl)

    Description: The MSP432 compiler has an issue about optimization level.

    Optimizer issues have already took me a few nights although my C code has been correctly. So be careful using it.

    Best regards,
    Christian

  • Christian Steffen said:
    But you should always have in mind that even optimizer can make mistakes and enabling optimization may cause defects and incorrect code.

    Well, the optimizer may have bugs, as all code. But assuming there is no bug, the optimizer won't cause defects or cause incorrect code. But it may break invalid assumptions of the coder.
    Like the common assumption that a for loop can be used for a delay. The debugger ensures that optimizations do not change the code behavior regarding the C language specifications (like the fact that after a for loop, the loop variable has a certain value), but not regarding side-effects of specific implementations (like the fact that looping through an empty for loop takes time)

  • I'm very grate for all your answers, which gave me very helpful guidelines!

    I'm moving the topic to the Compiler Forum, which I think suits best my problem.

    For who want to keep up with this question, it's now in  http://e2e.ti.com/support/development_tools/compiler/f/343/t/296427.aspx

**Attention** This is a public forum