This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

Speed optimization for CCS for Stellaris

Other Parts Discussed in Thread: TM4C123GH6PM

Hello,

I need to optimize the speed of code built with CCS for the Stellaris LM4F120H5QR. I set the CPU clock to 80 MHz, but I get only 2.10 MHz throughput for the following code, whereas when I use the Keil compiler for the same code I get more throughput. Is there any way to increase the speed of operation?

#include "inc/hw_types.h"
#include "inc/hw_memmap.h"
#include "driverlib/sysctl.h"
#include "driverlib/gpio.h"

/*
 * GPIO toggle-rate benchmark: configures the clock and port F, then writes
 * PF2 high and low in an endless loop. The repetition rate of that loop is
 * the figure being measured in this thread. Never returns.
 */
int main(void)
{

	// Run from the PLL (main oscillator, 16 MHz crystal) with the 2.5 divider;
	// the post states this setting gives an 80 MHz CPU clock.
	SysCtlClockSet(SYSCTL_SYSDIV_2_5|SYSCTL_USE_PLL|SYSCTL_XTAL_16MHZ|SYSCTL_OSC_MAIN);

	// Enable GPIO port F and configure PF1, PF2 and PF3 as outputs.
	SysCtlPeripheralEnable(SYSCTL_PERIPH_GPIOF);
	GPIOPinTypeGPIOOutput(GPIO_PORTF_BASE, GPIO_PIN_1|GPIO_PIN_2|GPIO_PIN_3);

	while(1)
	{
		// Turn the LED on, then straight back off: the value 4 is written
		// through the GPIO_PIN_2 mask (presumably 4 == GPIO_PIN_2 - confirm
		// against driverlib/gpio.h), followed by 0 through the same mask.
		GPIOPinWrite(GPIO_PORTF_BASE, GPIO_PIN_2, 4);
		GPIOPinWrite(GPIO_PORTF_BASE, GPIO_PIN_2, 0);

	}
}

  • Hello Dhiman

    What is the expectation that you have from the toggle rate optimization?

    Regards
    Amit
  • Hello Amit

    I am just checking the machine cycles using the GPIO read/write API for the toggle rate. I have a code segment, shown below, in which I need to complete the while loop within 7 us, so I am trying to optimize each individual line as much as possible. Currently it takes 12 us to complete the while(1) loop with the -me -O3 --opt_for_speed=5 optimization settings. Please suggest whether further speed optimization is possible.

    #include <stdint.h>
    #include <stdbool.h>
    #include "inc/hw_types.h"
    #include "inc/hw_memmap.h"
    #include "inc/hw_gpio.h"
    #include "inc/hw_adc.h"
    
    #include "driverlib/systick.h"
    #include "driverlib/interrupt.h"
    #include "driverlib/sysctl.h"
    #include "driverlib/pin_map.h"
    #include "driverlib/rom_map.h"
    #include "driverlib/gpio.h"
    #include "driverlib/adc.h"
    
    #ifdef DEBUG
    /*
     * Error hook that is only built when DEBUG is defined.
     * Presumably invoked by the driver library when one of its argument
     * checks fails (confirm against driverlib/debug.h). This implementation
     * deliberately does nothing with the reported location.
     *
     * pcFilename - source file name reported by the caller (ignored)
     * ulLine     - line number reported by the caller (ignored)
     */
    void __error__(char *pcFilename, unsigned long ulLine)
    {
        (void)pcFilename;
        (void)ulLine;
    }
    #endif
    
    
    /*
     * SysTick interrupt handler: writes 4 and then 0 to port F through the
     * GPIO_PIN_2 mask, i.e. a short pulse on PF2 each time it runs.
     * main() sets the SysTick period to SysCtlClockGet() / SYSTICKS_PER_SECOND.
     * NOTE(review): how this handler is installed in the vector table is not
     * visible in this listing - confirm in the startup file.
     */
    void SysTickIntHandler(void)
    {
    	// Rising edge of the pulse on PF2.
    	GPIOPinWrite(GPIO_PORTF_BASE, GPIO_PIN_2, 4);
    //	SysCtlDelay(1000);	(disabled: would stretch the pulse)
    	// Falling edge of the pulse on PF2.
    	GPIOPinWrite(GPIO_PORTF_BASE, GPIO_PIN_2, 0);
    
    }
    
    #define SYSTICKS_PER_SECOND 100000	//100k
    
    
    #define THRESHOLD 0
    int main(void)
    {
    	unsigned long ul_V_ADC_0=0;
    	unsigned long ul_I_ADC_1=0;
    
    	unsigned long ul_v_1=0,ul_v_2=0;
    	unsigned long ul_p_1=0,ul_p_2=0;
    	unsigned long ul_i_1,ul_i_2=0;
    	int uc_x_v=0,uc_x_p=0,uc_w=0;
    	SysCtlClockSet(SYSCTL_SYSDIV_2_5|SYSCTL_USE_PLL|SYSCTL_OSC_MAIN|SYSCTL_XTAL_16MHZ);	// 40MHz
    
    	//
    	// Enable Peripheral Clocks
    	//
    	MAP_SysCtlPeripheralEnable(SYSCTL_PERIPH_ADC0);
    	MAP_SysCtlPeripheralEnable(SYSCTL_PERIPH_ADC1);
    	MAP_SysCtlPeripheralEnable(SYSCTL_PERIPH_GPIOE);
    	SysCtlPeripheralEnable(SYSCTL_PERIPH_GPIOF);
    	GPIOPinTypeGPIOOutput(GPIO_PORTF_BASE, GPIO_PIN_1|GPIO_PIN_2|GPIO_PIN_3);
    	//
    	// Enable pin PE2 for ADC AIN1
    	//
    	MAP_GPIOPinTypeADC(GPIO_PORTE_BASE, GPIO_PIN_2);
    
    	//
    	// Enable pin PE3 for ADC AIN0
    	//
    	MAP_GPIOPinTypeADC(GPIO_PORTE_BASE, GPIO_PIN_3);
    
    	SysCtlADCSpeedSet(SYSCTL_ADCSPEED_1MSPS);				// Set Speed for both ADC
    	ADCSequenceDisable(ADC0_BASE, 3);
    	ADCSequenceDisable(ADC1_BASE, 3);
    
    	ADCPhaseDelaySet(ADC0_BASE,ADC_PHASE_0);
    
    	ADCSequenceStepConfigure(ADC0_BASE, 3, 0, ADC_CTL_CH0| ADC_CTL_IE |ADC_CTL_END);	//CH 0 -> V
    	ADCSequenceStepConfigure(ADC1_BASE, 3, 0, ADC_CTL_CH1 | ADC_CTL_IE |ADC_CTL_END);	//CH 1 -> I
    
    	ADCSequenceEnable(ADC0_BASE, 3);
    	ADCSequenceEnable(ADC1_BASE, 3);
    	ADCHardwareOversampleConfigure(ADC0_BASE,2);
    	ADCHardwareOversampleConfigure(ADC1_BASE,2);
    	/* SysTick Setup */
    	SysTickPeriodSet(SysCtlClockGet() / SYSTICKS_PER_SECOND);
    	IntMasterEnable();
    	SysTickIntEnable();
    	SysTickEnable();
    	while(1)
    	{
    
    //		ADCIntClear(ADC0_BASE, 3);
    //		ADCIntClear(ADC1_BASE, 3);
    		ADCProcessorTrigger(ADC1_BASE, (3|ADC_TRIGGER_WAIT));             // Put ADC-1 in Trigger Wait
    		ADCProcessorTrigger(ADC0_BASE, (3|ADC_TRIGGER_SIGNAL));                // Put ADC-0 in Global Trigger
    		while(!ADCIntStatus(ADC0_BASE, 3, false));            // Wait for conversion to be completed.
    		ADCIntClear(ADC0_BASE, 3);                    // Clear the ADC interrupt flag.
    		ADCIntClear(ADC1_BASE, 3);                    // Clear the ADC interrupt flag.
    		ADCSequenceDataGet(ADC0_BASE, 3, &ul_V_ADC_0);            // Read ADC Value.
    		ADCSequenceDataGet(ADC1_BASE, 3, &ul_I_ADC_1);            // Read ADC Value.
    
    		ul_v_1=ul_V_ADC_0>>3;
    		ul_i_1=ul_I_ADC_1>>3;
    		ul_p_1=ul_v_1*ul_i_1;
    
    //		ADCIntClear(ADC0_BASE, 3);
    //		ADCIntClear(ADC1_BASE, 3);
    		ADCProcessorTrigger(ADC1_BASE, (3|ADC_TRIGGER_WAIT));             // Put ADC-1 in Trigger Wait
    		ADCProcessorTrigger(ADC0_BASE, (3|ADC_TRIGGER_SIGNAL));                // Put ADC-0 in Global Trigger
    		while(!ADCIntStatus(ADC0_BASE, 3, false));            // Wait for conversion to be completed.
    		ADCIntClear(ADC0_BASE, 3);                    // Clear the ADC interrupt flag.
    		ADCIntClear(ADC1_BASE, 3);                    // Clear the ADC interrupt flag.
    		ADCSequenceDataGet(ADC0_BASE, 3, &ul_V_ADC_0);            // Read ADC Value.
    		ADCSequenceDataGet(ADC1_BASE, 3, &ul_I_ADC_1);            // Read ADC Value.
    
    		ul_v_2=ul_V_ADC_0>>3;
    		ul_i_2=ul_I_ADC_1>>3;
    		ul_p_2=ul_v_2*ul_i_2;
    
    		if(((ul_p_2+THRESHOLD)-ul_p_1)>0)
    		{
    			uc_x_p=1;
    		}
    		else
    		{
    			uc_x_p=0;
    		}
    
    		if(((ul_v_2+THRESHOLD)-ul_v_1)>0)
    		{
    			uc_x_v=1;
    		}
    		else
    		{
    			uc_x_v=0;
    		}
    		uc_w=uc_x_p^uc_x_v;
    		if(uc_w==1)		
    		{
    			uc_w=2;
    		}
    		else
    		{
    			uc_w=0;
    		}
    		GPIOPinWrite(GPIO_PORTF_BASE, GPIO_PIN_1, uc_w);
    	}
    }
  • Hello Dhiman

    At 80MHz the prefetch buffer is taken, which adds a delay of 1 clock if the instruction/data fetch is more than 4 bytes away. So at 80MHz the maximum toggle rate is what the DRM can achieve.

    Note that the while loop requires a conversion and compute. The ADC shall take 1us alone to perform a conversion. With two conversions this is already 2us of the 12us. This is then followed by polling and data read from the ADC and subsequently compute. I am not surprised by the value of 12us.

    Regards
    Amit
  • Hi Amit

    I understand there is a polling delay. My concern is whether there is any way to optimize the speed of this code segment:

    1.Should I configure the adc to another mode?

    2.or is that optimum mode of using this adc?

    3.or should I optimize this project using optimization setting of compiler?

    4.or should I use inline assembly code?

  • Hello Dhiman

    The target is 7 us out of which 2 us is for the ADC conversion. That leaves 5 us for data read from ADC and perform computation. I would suggest using inline assembly.

    EDIT: I am not sure why it is still taking ~10us to perform the operations. So would like to revisit the code when I am in office.

    Regards
    Amit

  • Might I suggest you re-evaluate your stated goal and algorithm?

    I.e., figure out what you are trying to accomplish. It looks to me as if you are looking to respond quickly to a threshold being exceeded. You could take a look at the threshold checking in the A/D module and see if that meets your needs. If not, then another approach is more likely to be fruitful.

    Something like

    • Run A/D in an interrupt with conversions started by a timer
    • on completion of conversion you get an interrupt and in that interrupt you
      • store the result
      • check and set the threshold flag, you can
        • trigger another interrupt for immediate response, or
        • set  a semaphore in your RTOS or
        • just set a variable

    There are other ways to do this, but the basic idea is to change the processing from a serial convert-then-evaluate process to an evaluate-during-conversion process.

    Robert

  • Hello Robert

    I just want to calculate power from two sample then difference in power and voltage depending on slope i am taking some logical decision. Should RTOS speed up the 'while loop' section?
  • Then you definitely need to sample on a timed basis. Speed is a secondary concern to fixed, predictable timing.

    An RTOS doesn't speed up raw processing, it organises processing so it gets done on time.

    Robert
  • Thanks Robert, thanks Amit
  • Dhiman Das said:
    I understand there is a polling delay my concern is that is there anyway to optimize the speed in this code segment..

    Some thoughts on how to speed up the code:

    1) Set the compiler optimization level to "4 Whole Program Optimizations" and "Speed vs. size trade-offs" to 5 (speed), and add the driverlib source files to the project as linked files rather than linking to the pre-compiled driverlib library. That allows the optimizer to inline the driverlib functions and thus avoid the function call overhead (as well as potentially enabling other optimizations).

    2) Execute from SRAM rather than flash, to avoid the flash wait-states. 

  • Hello Chester

    SRAM will incur single cycle wait state. May be API calls from ROM which are zero wait states.

    Regards
    Amit
  • Amit Ashara said:
    SRAM will incur single cycle wait state.

    Thank you for pointing out my mistake, I was getting confused with other microcontrollers.

    Amit Ashara said:
    May be API calls from ROM which are zero wait states.

    To investigate, timed different variations of a simple while loop using GPIOPinWrite() to toggle a GPIO pin. For all combinations:

    - Used TI ARM compiler v5.2.6

    - Optimization options of -O4 --opt_for_speed=5

    - A TM4C123GH6PM running at 80 MHz

    - TivaWare v2.1.1.71

    - A LWLA1016 running in Frequency Counter mode to measure the toggle speed (not sure of the accuracy of the frequency measurement, but should be valid for relative comparisons)

    The toggle frequency achieved for the different combinations are:

    1) 2.499975 MHz for "flash_library", with the code running in flash linking to the pre-compiled TivaWare driverlib.lib.

    2) 2.222200 MHz for "ram_library", with the code running in SRAM linking to the pre-compiled TivaWare driverlib.lib.

    3) 2.105242 MHz for "rom", with the main function running in flash calling TivaWare ROM functions.

    4) 15.999844 MHz for "flash_inlined", with the code running in flash and all TivaWare driverlib source files compiled as linked resources in the project to allow the optimizer to inline the TivaWare functions.

    5) 13.333202 MHz for "ram_inlined", with the code running in SRAM and all TivaWare driverlib source files compiled as linked resources in the project to allow the optimizer to inline the TivaWare functions.

    The projects are attached tm4c_optimisations.zip

    From this simplistic test of just toggling a GPIO pin:

    a) Code running in SRAM is slower than code running in flash.

    b) Calling ROM functions is slower than calling TivaWare functions in flash. While the ROM is zero-wait state, I am not sure of the optimization level the ROM code was compiled with.

    c) Allowing the optimizer to inline the TivaWare functions achieved a speed-up factor of 6.4 times.

    Without testing on the original poster's more complex code sequence, I am not sure whether inlining would achieve the same speed-up factor.

  • Chester Gillon said:
    SRAM will incur single cycle wait state.

  • cb1_mobile said:
    The "level of detail" is to be especially noted.   Thank you.

    Thank you for all your contributions to helping forum users as well. You have a better knowledge of hardware.

    cb1_mobile said:
    One minor suggestion - my firm has noted that - in general - GPIO pins which are "lesser function switching/capable" have achieved the highest toggle rates!   (this - possibly - due to fewer {and shorter} paths thru the MCU's interconnect maze)   Changing to such a "limited" pin may prove insightful...

    The TM4C123 and Stellaris LM4F parts have GPIOs on both the Advanced High-Performance Bus (AHB) bus and Advanced Peripheral Bus (APB). The code I used for the test accessed GPIOF on the APB bus. Whereas accessing GPIOF on the AHB bus would in theory allow faster toggle times. A further test would be required to measure the performance impact.

  • Chester Gillon said:
    accessing GPIOF on the AHB bus would in theory allow faster toggle times

    Indeed - and that's well known.

    But not my (major) point!    Many/most of the GPIO pins enjoy a complex interconnect matrix (potentially enabling such pins to provide multiple - and different - alternate functions.)   As a past IC designer - I noted that those pins w/the most limited interconnect matrix - most often realized the highest toggle rate.   It is thus required that the (speed seeking user) "comb" the MCU manual's GPIO section - searching for those GPIO with the minimum number of (alternate) functions.  (and trivially - access to the AHB bus) 

    This was my major point - and not (generally) nor (well) known.   AHB Bus (beyond theory) will enable faster GPIO bit toggles (and is well known)...

  • Hello cb1

    Yes, sir. I almost forgot that there are both AHB and APB bus apertures.

    Regards
    Amit