This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

AM1808 not delivering expected performance

Other Parts Discussed in Thread: AM1808

UPDATE: I ported Dhrystone to run bare metal, based on an example.  With optimizations and the I-cache off I got 10.2 DMIPS executing out of VRAM with the ARM clocked at 300 MHz.  Enabling the I-cache improved performance to 14.4 DMIPS.  Enabling optimizations (gcc's -O2) bumped it up to 41.7 DMIPS.  All of this is a far cry from the 1.1 MIPS/MHz expected from an ARM9EJ-S core.  I've attached the Dhrystone source files in case that helps.

/* 
 **************************************************************************** 
 * 
 *                   "DHRYSTONE" Benchmark Program 
 *                   ----------------------------- 
 *                                                                             
 *  Version:    C, Version 2.1 
 *                                                                             
 *  File:       dhry_2.c (part 3 of 3) 
 * 
 *  Date:       May 25, 1988 
 * 
 *  Author:     Reinhold P. Weicker 
 * 
 **************************************************************************** 
 */  
  
#include "dhry.h"   

  
#ifndef REG   
#define REG   
        /* REG becomes defined as empty */  
        /* i.e. no register variables   */  
#endif   
  
extern  int     Int_Glob;  
extern  char    Ch_1_Glob;  
  
  
/* Proc_6: derives an Enumeration result from Enum_Val_Par and stores it    */
/* through Enum_Ref_Par.                                                     */
/*   Enum_Val_Par   input enumeration value                                  */
/*   Enum_Ref_Par   output location; written at least once on every call     */
/* Old-style (K&R) definition with no declared return type and no return     */
/* statement. The attribute places the code in the ".vram.text" section.     */
/* Benchmark code: the statement mix is part of what Dhrystone measures.     */
__attribute__((section(".vram.text"))) Proc_6 (Enum_Val_Par, Enum_Ref_Par)  
/*********************************/  
    /* executed once */  
    /* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */  
  
Enumeration  Enum_Val_Par;  
Enumeration *Enum_Ref_Par;  
{  
  /* provisional result: echo the input */
  *Enum_Ref_Par = Enum_Val_Par;  
  if (! Func_3 (Enum_Val_Par))  
    /* then, not executed */  
    *Enum_Ref_Par = Ident_4;  
  /* The switch overrides the provisional result for every listed case    */
  /* except Ident_4. There is no default branch, so any other value       */
  /* leaves the result as set above.                                      */
  switch (Enum_Val_Par)  
  {  
    case Ident_1:   
      *Enum_Ref_Par = Ident_1;  
      break;  
    case Ident_2:   
      /* only case that reads the global Int_Glob */
      if (Int_Glob > 100)  
        /* then */  
      /* (the next line is the body of the if, despite its indentation) */
      *Enum_Ref_Par = Ident_1;  
      else *Enum_Ref_Par = Ident_4;  
      break;  
    case Ident_3: /* executed */  
      *Enum_Ref_Par = Ident_2;  
      break;  
    case Ident_4: break;  
    case Ident_5:   
      *Enum_Ref_Par = Ident_3;  
      break;  
  } /* switch */  
} /* Proc_6 */  
  
  
/* Proc_7: stores Int_1_Par_Val + 2 + Int_2_Par_Val through Int_Par_Ref.     */
/*   Int_1_Par_Val, Int_2_Par_Val   input operands                           */
/*   Int_Par_Ref                    output location for the sum              */
/* Old-style (K&R) definition with no declared return type and no return     */
/* statement. The attribute places the code in the ".vram.text" section.     */
__attribute__((section(".vram.text"))) Proc_7 (Int_1_Par_Val, Int_2_Par_Val, Int_Par_Ref)  
/**********************************************/  
    /* executed three times                                      */   
    /* first call:      Int_1_Par_Val == 2, Int_2_Par_Val == 3,  */  
    /*                  Int_Par_Ref becomes 7                    */  
    /* second call:     Int_1_Par_Val == 10, Int_2_Par_Val == 5, */  
    /*                  Int_Par_Ref becomes 17                   */  
    /* third call:      Int_1_Par_Val == 6, Int_2_Par_Val == 10, */  
    /*                  Int_Par_Ref becomes 18                   */  
One_Fifty       Int_1_Par_Val;  
One_Fifty       Int_2_Par_Val;  
One_Fifty      *Int_Par_Ref;  
{  
  One_Fifty Int_Loc;  
  
  /* the intermediate local is deliberate: it is part of the benchmark's  */
  /* statement mix                                                         */
  Int_Loc = Int_1_Par_Val + 2;  
  *Int_Par_Ref = Int_2_Par_Val + Int_Loc;  
} /* Proc_7 */  
  
  
/* Proc_8: array-access exercise. Writes a handful of elements of the two    */
/* arrays at indices derived from Int_1_Par_Val, then sets Int_Glob to 5.    */
/*   Arr_1_Par_Ref   one-dimensional array, modified in place                */
/*   Arr_2_Par_Ref   two-dimensional array, modified in place                */
/*   Int_1_Par_Val   base for the index (Int_Loc = Int_1_Par_Val + 5)        */
/*   Int_2_Par_Val   value stored into Arr_1_Par_Ref [Int_Loc]               */
/* The highest indices touched are Int_Loc+30 in Arr_1_Par_Ref and row       */
/* Int_Loc+20 in Arr_2_Par_Ref; no bounds checking is done here.             */
/* Old-style (K&R) definition with no declared return type and no return     */
/* statement. The attribute places the code in the ".vram.text" section.     */
__attribute__((section(".vram.text"))) Proc_8 (Arr_1_Par_Ref, Arr_2_Par_Ref, Int_1_Par_Val, Int_2_Par_Val)  
/*********************************************************************/  
    /* executed once      */  
    /* Int_1_Par_Val == 3 */  
    /* Int_2_Par_Val == 7 */  
Arr_1_Dim       Arr_1_Par_Ref;  
Arr_2_Dim       Arr_2_Par_Ref;  
int             Int_1_Par_Val;  
int             Int_2_Par_Val;  
{  
  REG One_Fifty Int_Index;  
  REG One_Fifty Int_Loc;  
  
  Int_Loc = Int_1_Par_Val + 5;  
  Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val;  
  Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc];  
  Arr_1_Par_Ref [Int_Loc+30] = Int_Loc;  
  /* two iterations: columns Int_Loc and Int_Loc+1 of row Int_Loc */
  for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index)  
    Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc;  
  /* this element is incremented on every call, so it accumulates across  */
  /* benchmark runs                                                        */
  Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1;  
  Arr_2_Par_Ref [Int_Loc+20] [Int_Loc] = Arr_1_Par_Ref [Int_Loc];  
  Int_Glob = 5;  
} /* Proc_8 */  
  
  
/* Func_1: compares two characters.                                          */
/*   Ch_1_Par_Val, Ch_2_Par_Val   characters to compare                      */
/* Returns Ident_1 when they differ. When they are equal, stores the         */
/* character in the global Ch_1_Glob and returns Ident_2.                    */
/* Old-style (K&R) definition, so callers pass the character arguments       */
/* under the default argument promotions. The attribute places the code in   */
/* the ".vram.text" section.                                                 */
Enumeration __attribute__((section(".vram.text"))) Func_1 (Ch_1_Par_Val, Ch_2_Par_Val)  
/*************************************************/  
    /* executed three times                                         */  
    /* first call:      Ch_1_Par_Val == 'H', Ch_2_Par_Val == 'R'    */  
    /* second call:     Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C'    */  
    /* third call:      Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C'    */  
  
Capital_Letter   Ch_1_Par_Val;  
Capital_Letter   Ch_2_Par_Val;  
{  
  Capital_Letter        Ch_1_Loc;  
  Capital_Letter        Ch_2_Loc;  
  
  /* the two local copies are deliberate benchmark statements */
  Ch_1_Loc = Ch_1_Par_Val;  
  Ch_2_Loc = Ch_1_Loc;  
  if (Ch_2_Loc != Ch_2_Par_Val)  
    /* then, executed */  
    return (Ident_1);  
  else  /* not executed */  
  {  
    /* side effect on the equal path only */
    Ch_1_Glob = Ch_1_Loc;  
    return (Ident_2);  
   }  
} /* Func_1 */  
  
  
/* Func_2: string-comparison exercise.                                       */
/*   Str_1_Par_Ref, Str_2_Par_Ref   strings to compare (not modified here)   */
/* Returns true when Ch_Loc ends up as 'R', or when strcmp reports           */
/* Str_1_Par_Ref greater than Str_2_Par_Ref; in the latter case Int_Glob is  */
/* also set to Int_Loc + 7. Returns false otherwise.                         */
/* strcmp is called without a declaration visible in this file.              */
/* NOTE(review): presumably dhry.h or a system header provides it; confirm.  */
/* The attribute places the code in the ".vram.text" section.                */
Boolean __attribute__((section(".vram.text"))) Func_2 (Str_1_Par_Ref, Str_2_Par_Ref)  
/*************************************************/  
    /* executed once */  
    /* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */  
    /* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */  
  
Str_30  Str_1_Par_Ref;  
Str_30  Str_2_Par_Ref;  
{
  REG One_Thirty        Int_Loc;  
      Capital_Letter    Ch_Loc;  
  
  Int_Loc = 2;  
  /* Int_Loc only advances when Func_1 returns Ident_1 (the characters    */
  /* differ), so the loop exits only through that branch, and Ch_Loc is   */
  /* always assigned before it is read below. If the two characters were  */
  /* equal, the loop would not terminate.                                  */
  while (Int_Loc <= 2) /* loop body executed once */  
    if (Func_1 (Str_1_Par_Ref[Int_Loc],  
                Str_2_Par_Ref[Int_Loc+1]) == Ident_1)  
      /* then, executed */  
    {  
      Ch_Loc = 'A';  
      Int_Loc += 1;  
    } /* if, while */  
  if (Ch_Loc >= 'W' && Ch_Loc < 'Z')  
    /* then, not executed */  
    Int_Loc = 7;  
  if (Ch_Loc == 'R')  
    /* then, not executed */  
    return (true);  
  else /* executed */  
  {  
    if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0)  
      /* then, not executed */  
    {  
      Int_Loc += 7;  
      Int_Glob = Int_Loc;  
      return (true);  
    }  
    else /* executed */  
      return (false);  
  } /* if Ch_Loc */  
} /* Func_2 */  
  
  
/* Func_3: returns true when Enum_Par_Val equals Ident_3, false otherwise.   */
/*   Enum_Par_Val   enumeration value to test                                */
/* No side effects. The attribute places the code in the ".vram.text"        */
/* section.                                                                  */
Boolean __attribute__((section(".vram.text"))) Func_3 (Enum_Par_Val)  
/***************************/  
    /* executed once        */  
    /* Enum_Par_Val == Ident_3 */  
Enumeration Enum_Par_Val;  
{  
  Enumeration Enum_Loc;  
  
  /* the local copy is a deliberate benchmark statement */
  Enum_Loc = Enum_Par_Val;  
  if (Enum_Loc == Ident_3)  
    /* then, executed */  
    return (true);  
  else /* not executed */  
    return (false);  
} /* Func_3 */  
/*  
 ****************************************************************************  
 *  
 *                   "DHRYSTONE" Benchmark Program  
 *                   -----------------------------  
 *                                                                              
 *  Version:    C, Version 2.1  
 *                                                                              
 *  File:       dhry_1.c (part 2 of 3)  
 *  
 *  Date:       May 25, 1988  
 *  
 *  Author:     Reinhold P. Weicker  
 *  
 ****************************************************************************  
 */   
    
#define NUMBER_OF_RUNS  1000000    
   
#include "dhry.h"    

/* BEGIN PTX INCLUDES */
#include "cp15.h"
#include "cpu.h"
#include "gpio.h"
#include "hw_syscfg0_AM1808.h"
#include "hw_tmr.h"
#include "hw_types.h"
#include "hw_uart.h"
#include "psc.h"
#include "pinmux.h"
#include "soc_AM1808.h"
#include "uart.h"
#include "uartStdio.h"

void UARTPrintf(const char *msg, ...);
#define printf UARTPrintf
/* END PTX INCLUDES */
   
/* Global Variables: */   
/* Every object defined here with the section attribute is placed in the  */
/* ".iram.data" section. Where that section lives is decided by the       */
/* linker script, which is not part of this file.                          */
   
Rec_Pointer     __attribute__((section(".iram.data"))) Ptr_Glob,   
                __attribute__((section(".iram.data"))) Next_Ptr_Glob;   
int             __attribute__((section(".iram.data"))) Int_Glob;   
Boolean         __attribute__((section(".iram.data"))) Bool_Glob;   
char            __attribute__((section(".iram.data"))) Ch_1_Glob,   
                __attribute__((section(".iram.data"))) Ch_2_Glob;   
int             __attribute__((section(".iram.data"))) Arr_1_Glob [50];   
int             __attribute__((section(".iram.data"))) Arr_2_Glob [50] [50];   
   
/* non-prototype declaration: the argument is not type-checked */
extern void     *malloc ();   
Enumeration     Func_1 ();   
/* forward declaration necessary since Enumeration may not simply be int */   
     
/* Reg records whether REG was already defined when this point is reached; */
/* do_dhry prints it in the banner.                                         */
/* NOTE(review): the #else definition of Reg carries no section attribute, */
/* unlike the #ifndef one; confirm whether that is intended.                */
#ifndef REG    
        Boolean __attribute__((section(".iram.data"))) Reg = false;   
#define REG    
        /* REG becomes defined as empty */   
        /* i.e. no register variables   */   
#else    
        Boolean Reg = true;   
#endif    
   
/* variables for time measurement: */   
   
/* NOTE(review): the name is spelled with a double 't', and no call to it  */
/* is visible in this file.                                                 */
extern float GettTimerSec();    
   
/* times in seconds; User_Time = End_Time - Begin_Time */
float           __attribute__((section(".iram.data"))) Begin_Time,   
                __attribute__((section(".iram.data"))) End_Time,   
                __attribute__((section(".iram.data"))) User_Time;   
float           __attribute__((section(".iram.data"))) Microseconds,   
                __attribute__((section(".iram.data"))) Dhrystones_Per_Second;   

/* BEGIN PTX TIMER VARS */
/* Timer counts bracketing the benchmark loop. nStop receives the TIM12    */
/* register value; both are signed int although the register is read       */
/* through HWREG.                                                           */
int __attribute__((section(".iram.data"))) nStart = 0;
int __attribute__((section(".iram.data"))) nStop = 0;
/* END PTX TIMER VARS */
   
/* end of variables for time measurement */   
   
/* do_dhry: runs the Dhrystone 2.1 benchmark and prints the results.         */
/*                                                                           */
/* Steps:                                                                    */
/*   1. Allocates two Rec_Type records with malloc and initializes them      */
/*      together with the local strings and Arr_2_Glob [8][7].               */
/*   2. Prints the banner; the run count comes from NUMBER_OF_RUNS, not      */
/*      from user input.                                                     */
/*   3. Resets timer 1 (TIM12 = 0, PRD12 = 0xFFFFFFFF) and starts it.        */
/*   4. Executes the benchmark loop Number_Of_Runs times.                    */
/*   5. Reads TIM12, converts the tick count to seconds by dividing by       */
/*      24000000.0, then prints the final variable values and the derived    */
/*      figures (microseconds per run, Dhrystones/s, Dhrystones/s / 1757).   */
/*                                                                           */
/* NOTE(review): 24000000.0 is presumably the timer input clock in Hz;       */
/* confirm against the board's clock setup.                                  */
/* NOTE(review): the malloc results are used without a NULL check.           */
/*                                                                           */
/* Takes no parameters. Old-style definition with no declared return type    */
/* and no return statement. The attribute places the code in the             */
/* ".vram.text" section. The statements inside the timed loop are the        */
/* benchmark itself and must not be altered.                                 */
__attribute__((section(".vram.text"))) do_dhry ()   
/*****/   
   
  /* main program, corresponds to procedures        */   
  /* Main and Proc_0 in the Ada version             */   
{   
        One_Fifty       Int_1_Loc;   
  REG   One_Fifty       Int_2_Loc;   
        One_Fifty       Int_3_Loc;   
  REG   char            Ch_Index;   
        Enumeration     Enum_Loc;   
        Str_30          Str_1_Loc;   
        Str_30          Str_2_Loc;   
  REG   int             Run_Index;   
  REG   int             Number_Of_Runs;   
   
  /* Initializations */   
     
// Add your UART initializing code here    
   
  Next_Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type));   
  Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type));   
   
  Ptr_Glob->Ptr_Comp                    = Next_Ptr_Glob;   
  Ptr_Glob->Discr                       = Ident_1;   
  Ptr_Glob->variant.var_1.Enum_Comp     = Ident_3;   
  Ptr_Glob->variant.var_1.Int_Comp      = 40;   
  strcpy (Ptr_Glob->variant.var_1.Str_Comp,    
          "DHRYSTONE PROGRAM, SOME STRING");   
  strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");   
   
  Arr_2_Glob [8][7] = 10;   
        /* Was missing in published program. Without this statement,    */   
        /* Arr_2_Glob [8][7] would have an undefined value.             */   
        /* Warning: With 16-Bit processors and Number_Of_Runs > 32000,  */   
        /* overflow may occur for this array element.                   */   
   
  printf ("\n");   
  printf ("Dhrystone Benchmark, Version 2.1 (Language: C)\n");   
  printf ("\n");   
  if (Reg)   
  {   
    printf ("Program compiled with 'register' attribute\n");   
    printf ("\n");   
  }   
  else   
  {   
    printf ("Program compiled without 'register' attribute\n");   
    printf ("\n");   
  }   
  printf ("Please give the number of runs through the benchmark: ");   
   
  /* the run count is fixed at compile time; it is echoed after the prompt */
  Number_Of_Runs = NUMBER_OF_RUNS;   
  printf ("%d\n", Number_Of_Runs);   
   
  printf ("\n");   
   
  printf ("Execution starts, %d runs through Dhrystone\n", Number_Of_Runs);   
   
  /***************/   
  /* Start timer */   
  /***************/   
     
// Add your timer initializing code here    
	/* BEGIN PTX ENABLE TIMER CODE */
	/* Stop and reset the timer, zero the counter and set the maximum  */
	/* period, then restart it, so the count starts from 0 (== nStart). */
	nStart = 0;
	HWREG(SOC_TMR_1_REGS + TMR_TCR) = 0;
	HWREG(SOC_TMR_1_REGS + TMR_TGCR) = 0;
	HWREG(SOC_TMR_1_REGS + TMR_TIM12) = 0;
	HWREG(SOC_TMR_1_REGS + TMR_PRD12) = 0xFFFFFFFF;

	HWREG(SOC_TMR_1_REGS + TMR_TGCR) = 0x05; // 32-bit, TIM12 out of reset
	HWREG(SOC_TMR_1_REGS + TMR_TCR) = 0x80; // Continuous Mode
	/* END PTX ENABLE TIMER CODE */

  Begin_Time = 0.0;   
   
  for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index)   
  {   
    Proc_5();   
    Proc_4();   
      /* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */   
    Int_1_Loc = 2;   
    Int_2_Loc = 3;   
    strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");   
    Enum_Loc = Ident_2;   
    Bool_Glob = ! Func_2 (Str_1_Loc, Str_2_Loc);   
      /* Bool_Glob == 1 */   
    while (Int_1_Loc < Int_2_Loc)  /* loop body executed once */   
    {   
      Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc;   
        /* Int_3_Loc == 7 */   
      Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc);   
        /* Int_3_Loc == 7 */   
      Int_1_Loc += 1;   
    } /* while */   
      /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */   
    Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc);   
      /* Int_Glob == 5 */   
    Proc_1 (Ptr_Glob);   
    for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index)   
                             /* loop body executed twice */   
    {   
      if (Enum_Loc == Func_1 (Ch_Index, 'C'))   
          /* then, not executed */   
        {   
        Proc_6 (Ident_1, &Enum_Loc);   
        strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING");   
        Int_2_Loc = Run_Index;   
        Int_Glob = Run_Index;   
        }   
    }   
      /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */   
    Int_2_Loc = Int_2_Loc * Int_1_Loc;   
    Int_1_Loc = Int_2_Loc / Int_3_Loc;   
    Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc;   
      /* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */   
    Proc_2 (&Int_1_Loc);   
      /* Int_1_Loc == 5 */   
   
  } /* loop "for Run_Index" */   
   
  /**************/   
  /* Stop timer */   
  /**************/   
   
// Add your timer stopping code here    
  nStop = HWREG(SOC_TMR_1_REGS + TMR_TIM12);
     
  /* Elapsed ticks are computed in unsigned arithmetic. nStart and nStop   */
  /* are signed int, so once the count exceeds INT_MAX (about 89 s at a    */
  /* 24 MHz tick) a signed difference would go negative. The unsigned      */
  /* difference stays correct for any interval shorter than one full       */
  /* 32-bit wrap of the counter.                                           */
  End_Time = ((float)((unsigned int) nStop - (unsigned int) nStart)) / 24000000.0;   
     
  printf ("Execution ends\n");   
  printf ("\n");   
  printf ("Final values of the variables used in the benchmark:\n");   
  printf ("\n");   
  printf ("Int_Glob:            %d\n", Int_Glob);   
  printf ("        should be:   %d\n", 5);   
  printf ("Bool_Glob:           %d\n", Bool_Glob);   
  printf ("        should be:   %d\n", 1);   
  printf ("Ch_1_Glob:           %c\n", Ch_1_Glob);   
  printf ("        should be:   %c\n", 'A');   
  printf ("Ch_2_Glob:           %c\n", Ch_2_Glob);   
  printf ("        should be:   %c\n", 'B');   
  printf ("Arr_1_Glob[8]:       %d\n", Arr_1_Glob[8]);   
  printf ("        should be:   %d\n", 7);   
  printf ("Arr_2_Glob[8][7]:    %d\n", Arr_2_Glob[8][7]);   
  printf ("        should be:   Number_Of_Runs + 10\n");   
  printf ("Ptr_Glob->\n");   
  printf ("  Ptr_Comp:          %d\n", (int) Ptr_Glob->Ptr_Comp);   
  printf ("        should be:   (implementation-dependent)\n");   
  printf ("  Discr:             %d\n", Ptr_Glob->Discr);   
  printf ("        should be:   %d\n", 0);   
  printf ("  Enum_Comp:         %d\n", Ptr_Glob->variant.var_1.Enum_Comp);   
  printf ("        should be:   %d\n", 2);   
  printf ("  Int_Comp:          %d\n", Ptr_Glob->variant.var_1.Int_Comp);   
  printf ("        should be:   %d\n", 17);   
  printf ("  Str_Comp:          %s\n", Ptr_Glob->variant.var_1.Str_Comp);   
  printf ("        should be:   DHRYSTONE PROGRAM, SOME STRING\n");   
  printf ("Next_Ptr_Glob->\n");   
  printf ("  Ptr_Comp:          %d\n", (int) Next_Ptr_Glob->Ptr_Comp);   
  printf ("        should be:   (implementation-dependent), same as above\n");   
  printf ("  Discr:             %d\n", Next_Ptr_Glob->Discr);   
  printf ("        should be:   %d\n", 0);   
  printf ("  Enum_Comp:         %d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);   
  printf ("        should be:   %d\n", 1);   
  printf ("  Int_Comp:          %d\n", Next_Ptr_Glob->variant.var_1.Int_Comp);   
  printf ("        should be:   %d\n", 18);   
  printf ("  Str_Comp:          %s\n",   
                                Next_Ptr_Glob->variant.var_1.Str_Comp);   
  printf ("        should be:   DHRYSTONE PROGRAM, SOME STRING\n");   
  printf ("Int_1_Loc:           %d\n", Int_1_Loc);   
  printf ("        should be:   %d\n", 5);   
  printf ("Int_2_Loc:           %d\n", Int_2_Loc);   
  printf ("        should be:   %d\n", 13);   
  printf ("Int_3_Loc:           %d\n", Int_3_Loc);   
  printf ("        should be:   %d\n", 7);   
  printf ("Enum_Loc:            %d\n", Enum_Loc);   
  printf ("        should be:   %d\n", 1);   
  printf ("Str_1_Loc:           %s\n", Str_1_Loc);   
  printf ("        should be:   DHRYSTONE PROGRAM, 1'ST STRING\n");   
  printf ("Str_2_Loc:           %s\n", Str_2_Loc);   
  printf ("        should be:   DHRYSTONE PROGRAM, 2'ND STRING\n");   
  printf ("\n");   
   
  /* Begin_Time is 0.0 because the timer was reset just before the loop */
  User_Time = End_Time - Begin_Time;   
   
  Microseconds = (float) User_Time * Mic_secs_Per_Second    
                      / (float) Number_Of_Runs;   
  Dhrystones_Per_Second = (float) Number_Of_Runs / (float) User_Time;   
   
  printf ("Microseconds for one run through Dhrystone: ");   
  printf ("%6.1f \n", Microseconds);   
  printf ("Dhrystones per Second:                      ");   
  printf ("%6.1f \n", Dhrystones_Per_Second);   
  printf ("Dhrystones MIPS:                            ");   
  printf ("%6.1f \n", Dhrystones_Per_Second / 1757.0);   
  printf ("\n");   
     
}   
   
   
/* Proc_1: record and pointer exercise on the record Ptr_Val_Par points to   */
/* and on the record reached through its Ptr_Comp link.                      */
/*   Ptr_Val_Par   record pointer; both it and *Ptr_Val_Par->Ptr_Comp are    */
/*                 modified                                                  */
/* Also reads the global Ptr_Glob and calls Proc_3, Proc_6 and Proc_7.       */
/* NOTE(review): structassign is not defined in this file; presumably a      */
/* macro from dhry.h that performs a struct copy. Confirm.                   */
/* Old-style (K&R) definition with no declared return type and no return     */
/* statement. The attribute places the code in the ".vram.text" section.     */
__attribute__((section(".vram.text"))) Proc_1 (Ptr_Val_Par)   
/******************/   
   
REG Rec_Pointer Ptr_Val_Par;   
    /* executed once */   
{   
  REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp;     
                                        /* == Ptr_Glob_Next */   
  /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp,    */   
  /* corresponds to "rename" in Ada, "with" in Pascal           */   
     
  structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob);    
  Ptr_Val_Par->variant.var_1.Int_Comp = 5;   
  Next_Record->variant.var_1.Int_Comp    
        = Ptr_Val_Par->variant.var_1.Int_Comp;   
  Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp;   
  /* Proc_3 may overwrite Next_Record->Ptr_Comp through the pointer */
  Proc_3 (&Next_Record->Ptr_Comp);   
    /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp   
                        == Ptr_Glob->Ptr_Comp */   
  if (Next_Record->Discr == Ident_1)   
    /* then, executed */   
  {   
    Next_Record->variant.var_1.Int_Comp = 6;   
    Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp,    
           &Next_Record->variant.var_1.Enum_Comp);   
    Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp;   
    /* Int_Comp is both input and output here: it becomes Int_Comp + 2 + 10 */
    Proc_7 (Next_Record->variant.var_1.Int_Comp, 10,    
           &Next_Record->variant.var_1.Int_Comp);   
  }   
  else /* not executed */   
    structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp);   
} /* Proc_1 */   
   
   
/* Proc_2: when Ch_1_Glob is 'A', replaces *Int_Par_Ref with                 */
/* *Int_Par_Ref + 10 - 1 - Int_Glob.                                         */
/*   Int_Par_Ref   in/out integer                                            */
/* Reads the globals Ch_1_Glob and Int_Glob.                                 */
/* NOTE(review): Enum_Loc is assigned only inside the if. When Ch_1_Glob is  */
/* not 'A', the loop condition reads it unassigned. This is left as is       */
/* because it is part of the published benchmark text.                       */
/* Old-style (K&R) definition with no declared return type and no return     */
/* statement. The attribute places the code in the ".vram.text" section.     */
__attribute__((section(".vram.text"))) Proc_2 (Int_Par_Ref)   
/******************/   
    /* executed once */   
    /* *Int_Par_Ref == 1, becomes 4 */   
   
One_Fifty   *Int_Par_Ref;   
{   
  One_Fifty  Int_Loc;     
  Enumeration   Enum_Loc;   
   
  Int_Loc = *Int_Par_Ref + 10;   
  do /* executed once */   
    if (Ch_1_Glob == 'A')   
      /* then, executed */   
    {   
      Int_Loc -= 1;   
      *Int_Par_Ref = Int_Loc - Int_Glob;   
      /* setting Ident_1 is what ends the do-while loop */
      Enum_Loc = Ident_1;   
    } /* if */   
  while (Enum_Loc != Ident_1); /* true */   
} /* Proc_2 */   
   
   
/* Proc_3: when Ptr_Glob is not Null, stores Ptr_Glob->Ptr_Comp through      */
/* Ptr_Ref_Par. Then, in every case, calls Proc_7 to set Ptr_Glob's          */
/* Int_Comp to 10 + 2 + Int_Glob.                                            */
/*   Ptr_Ref_Par   output location for a record pointer                      */
/* The Proc_7 call is outside the Null check, so Ptr_Glob must be valid      */
/* whenever this procedure is called.                                        */
/* Old-style (K&R) definition with no declared return type and no return     */
/* statement. The attribute places the code in the ".vram.text" section.     */
__attribute__((section(".vram.text"))) Proc_3 (Ptr_Ref_Par)   
/******************/   
    /* executed once */   
    /* Ptr_Ref_Par becomes Ptr_Glob */   
   
Rec_Pointer *Ptr_Ref_Par;   
   
{   
  if (Ptr_Glob != Null)   
    /* then, executed */   
    *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp;   
  Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp);   
} /* Proc_3 */   
   
   
/* Proc_4: ORs (Ch_1_Glob == 'A') into Bool_Glob and sets Ch_2_Glob to 'B'.  */
/* Operates only on globals. No declared return type and no return           */
/* statement. The attribute places the code in the ".vram.text" section.     */
__attribute__((section(".vram.text"))) Proc_4 () /* without parameters */   
/*******/   
    /* executed once */   
{   
  Boolean Bool_Loc;   
   
  Bool_Loc = Ch_1_Glob == 'A';   
  /* bitwise OR, as in the published benchmark */
  Bool_Glob = Bool_Loc | Bool_Glob;   
  Ch_2_Glob = 'B';   
} /* Proc_4 */   
   
   
/* Proc_5: resets two globals: Ch_1_Glob to 'A' and Bool_Glob to false.      */
/* No declared return type and no return statement. The attribute places     */
/* the code in the ".vram.text" section.                                     */
__attribute__((section(".vram.text"))) Proc_5 () /* without parameters */   
/*******/   
    /* executed once */   
{   
  Ch_1_Glob = 'A';   
  Bool_Glob = false;   
} /* Proc_5 */   
   
   
        /* Procedure for the assignment of structures,          */   
        /* if the C compiler doesn't support this feature       */   
#ifdef  NOSTRUCTASSIGN    
/* Fallback byte copy, compiled only when NOSTRUCTASSIGN is defined.        */
/*   d   destination, advanced one byte per iteration                       */
/*   s   source, advanced one byte per iteration                            */
/*   l   number of bytes to copy; 0 copies nothing, and a negative value    */
/*       is not guarded against                                             */
/* No declared return type and no return statement. Unlike the other        */
/* procedures in this file, it carries no section attribute.                */
/* NOTE(review): the name and signature differ from the standard library    */
/* memcpy; check for a clash before defining NOSTRUCTASSIGN.                */
memcpy (d, s, l)   
register char   *d;   
register char   *s;   
register int    l;   
{   
        while (l--) *d++ = *s++;   
}   
#endif    
0243.dhry.h

I was doing some performance testing on an important interrupt and was seeing things take about 6x the time they should.  I had replaced an AVR32 running at 48 MHz with the AM1808 running at 300 MHz.  Strangely, the time to service our most critical interrupt was identical (~3.3 µs).

Looking into this further, I discovered that executing 300 nops was taking about 6 µs.

I tried the following with an I/O toggle in place of the timer register reads as well, and the scope agreed with the timer: it really does seem to take 6 µs to execute 300 instructions.

The AM1808 is running at 300 MHz and the code was executing out of the vector RAM (0xFFFF0000 - 0xFFFF2000).  Compiler optimizations and instruction caching were both disabled.

Any suggestions as to where I've gone wrong?  I'd really like to understand why the performance of this processor @ 300MHz is so poor.

/* Timing probe: reads the timer 1 TIM12 count, executes 300 nop            */
/* instructions (30 lines of 10), then reads the count again. The elapsed   */
/* tick count is nStop - nStart. About 6 us for 300 nops is 20 ns each,     */
/* i.e. 6 core clocks per nop at 300 MHz.                                   */
nStart = HWREG(SOC_TMR_1_REGS + TMR_TIM12);

asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
nStop = HWREG(SOC_TMR_1_REGS + TMR_TIM12);