AUDIO-AM62D-EVM: Difference in execution clock cycles between AM62D and AM62A, and C7x

Part Number: AUDIO-AM62D-EVM
Other Parts Discussed in Thread: SK-AM62A-LP,

Tool/software:

Hi experts,

Following up on the original thread, I ran the same measurement application on both the SK-AM62A-LP and the AUDIO-AM62D-EVM, with the C7x clocked at 500 MHz and at 1000 MHz.
As a result, we found that the measured number of execution clock cycles differed between the two clock settings, even when running the same process on the same EVM.
Also, although we assumed that the C7x performance of the AM62A and the AM62D is the same, the results differ between the SK-AM62A-LP and the AUDIO-AM62D-EVM.

Q1: Could you please explain why the number of execution clock cycles differs with the clock frequency even when performing the same process on the same evaluation board?
Our assumption was that the measured number of clock cycles would be the same for the same process, even if the startup frequency was different.

Q2: Can you please explain why there is a difference between SK-AM62A-LP and AM62D-EVM even when using the same SDK binary?

We perform a simple data copy process as shown below and measure the count value using the CCS Profile clock.

/* Benchmark buffers for the copy tests below. */
#include "copy_buffer.h"

/* double4 source/destination buffers; test_func02 passes these to copy_double4(). */
double4 gd_src_lch_d4[SAMPLE_NUM_QUATER];
double4 gd_src_rch_d4[SAMPLE_NUM_QUATER];
double4 gd_dst_lch_d4[SAMPLE_NUM_QUATER];
double4 gd_dst_rch_d4[SAMPLE_NUM_QUATER];

/*
 * Buffers placed in the ".sdram" section via DATA_SECTION.
 * Of these, only sdram_buffer01/02/03/04/06/13 are referenced by the test
 * functions visible in this file (test_func03 and test_func04).
 *
 * NOTE(review): the pragma is used here without a symbol argument. In the TI
 * compiler documentation that is the C++ form; the C form is
 * DATA_SECTION(symbol, "section"). Confirm this file is built as C++, or that
 * the buffers really end up in .sdram (check the linker map).
 */
#pragma DATA_SECTION(".sdram");
float sdram_buffer01[768 * 1024];
#pragma DATA_SECTION(".sdram");
float sdram_buffer02[768 * 1024];
#pragma DATA_SECTION(".sdram");
float sdram_buffer03[768 * 1024];
#pragma DATA_SECTION(".sdram");
float sdram_buffer04[3 * 1024 * 1024];
#pragma DATA_SECTION(".sdram");
float sdram_buffer05[4][4000 * 96];
#pragma DATA_SECTION(".sdram");
float sdram_buffer06[3 * 1024 * 1024];
#pragma DATA_SECTION(".sdram");
float sdram_buffer07[2][12][4096];
#pragma DATA_SECTION(".sdram");
float sdram_buffer08[9600];
#pragma DATA_SECTION(".sdram");
float sdram_buffer09[9600];
#pragma DATA_SECTION(".sdram");
float sdram_buffer10[2][8][32768];
#pragma DATA_SECTION(".sdram");
float sdram_buffer11[4096];
#pragma DATA_SECTION(".sdram");
float sdram_buffer12[4096];
#pragma DATA_SECTION(".sdram");
float sdram_buffer13[768 * 1024];
#pragma DATA_SECTION(".sdram");
float sdram_buffer14[768 * 1024];

/*
 * copy_float - copy SAMPLE_NUM float samples per channel.
 *
 * Copies src_lch into dst_lch and src_rch into dst_rch, one element of each
 * channel per loop iteration. All four pointers are restrict-qualified and
 * are passed through ALIGNED_ARRAY() before the loop.
 */
void copy_float( float *restrict src_lch, float *restrict src_rch, float *restrict dst_lch, float *restrict dst_rch )
{
	int idx;

	ALIGNED_ARRAY(src_lch);
	ALIGNED_ARRAY(src_rch);
	ALIGNED_ARRAY(dst_lch);
	ALIGNED_ARRAY(dst_rch);

	/* Left element first, then right, for each sample index. */
	for (idx = 0; idx < SAMPLE_NUM; ++idx) {
		dst_lch[idx] = src_lch[idx];
		dst_rch[idx] = src_rch[idx];
	}
}

/*
 * copy_float_size - copy `size` float samples per channel.
 *
 * Same as copy_float() but with a caller-supplied element count instead of
 * SAMPLE_NUM. Copies src_lch into dst_lch and src_rch into dst_rch; nothing
 * is copied when size is zero or negative.
 */
void copy_float_size( float *restrict src_lch, float *restrict src_rch, float *restrict dst_lch, float *restrict dst_rch, int size )
{
	int idx;

	ALIGNED_ARRAY(src_lch);
	ALIGNED_ARRAY(src_rch);
	ALIGNED_ARRAY(dst_lch);
	ALIGNED_ARRAY(dst_rch);

	/* Left element first, then right, for each sample index. */
	for (idx = 0; idx < size; ++idx) {
		dst_lch[idx] = src_lch[idx];
		dst_rch[idx] = src_rch[idx];
	}
}

/*
 * copy_double - copy SAMPLE_NUM double samples per channel.
 *
 * Copies src_lch into dst_lch and src_rch into dst_rch, one element of each
 * channel per loop iteration. All four pointers are restrict-qualified and
 * are passed through ALIGNED_ARRAY() before the loop.
 */
void copy_double( double *restrict src_lch, double *restrict src_rch, double *restrict dst_lch, double *restrict dst_rch )
{
	int idx;

	ALIGNED_ARRAY(src_lch);
	ALIGNED_ARRAY(src_rch);
	ALIGNED_ARRAY(dst_lch);
	ALIGNED_ARRAY(dst_rch);

	/* Left element first, then right, for each sample index. */
	for (idx = 0; idx < SAMPLE_NUM; ++idx) {
		dst_lch[idx] = src_lch[idx];
		dst_rch[idx] = src_rch[idx];
	}
}

/*
 * copy_double4 - copy SAMPLE_NUM_QUATER double4 vectors per channel.
 *
 * Copies src_lch into dst_lch and src_rch into dst_rch, one double4 element
 * of each channel per loop iteration. All four pointers are
 * restrict-qualified and are passed through ALIGNED_ARRAY() before the loop.
 */
void copy_double4( double4 *restrict src_lch, double4 *restrict src_rch, double4 *restrict dst_lch, double4 *restrict dst_rch )
{
	int idx;

	ALIGNED_ARRAY(src_lch);
	ALIGNED_ARRAY(src_rch);
	ALIGNED_ARRAY(dst_lch);
	ALIGNED_ARRAY(dst_rch);

	/* Left vector first, then right, for each vector index. */
	for (idx = 0; idx < SAMPLE_NUM_QUATER; ++idx) {
		dst_lch[idx] = src_lch[idx];
		dst_rch[idx] = src_rch[idx];
	}
}

/*
 * test_func01 - run copy_double() 100 times on the gd_src_* / gd_dst_*
 * buffers (declared outside this listing).
 */
void test_func01( void )
{
	int pass;

	for (pass = 0; pass != 100; ++pass) {
		copy_double(gd_src_lch, gd_src_rch, gd_dst_lch, gd_dst_rch);
	}
}

/*
 * test_func02 - run copy_double4() 100 times on the gd_*_d4 buffers.
 */
void test_func02( void )
{
	int pass;

	for (pass = 0; pass != 100; ++pass) {
		copy_double4(gd_src_lch_d4, gd_src_rch_d4, gd_dst_lch_d4, gd_dst_rch_d4);
	}
}

/*
 * test_func03 - run copy_float_size() 10 times, copying all of
 * sdram_buffer04 into sdram_buffer06.
 *
 * The same source buffer is passed for both source channels and the same
 * destination buffer for both destination channels.
 *
 * NOTE(review): copy_float_size() declares dst_lch and dst_rch as restrict
 * and stores through both, yet both point at sdram_buffer06 here. That looks
 * like a violation of the restrict contract (C11 6.7.3.1), so the compiler is
 * free to schedule the stores however it likes. Confirm this is intended for
 * the measurement before comparing cycle counts across compilers/SDKs.
 */
void test_func03( void )
{
	int pass;
	int elem_count = sizeof(sdram_buffer04) / sizeof(sdram_buffer04[0]);

	for (pass = 0; pass != 10; ++pass) {
		copy_float_size(sdram_buffer04, sdram_buffer04, sdram_buffer06, sdram_buffer06, elem_count);
	}
}

/*
 * test_func04 - run two copy_float_size() calls 10 times each pass.
 *
 * Each call copies two sdram_buffer01-sized source buffers into two adjacent,
 * non-overlapping regions at the start of a larger destination buffer:
 *   sdram_buffer01, sdram_buffer02 -> sdram_buffer04[0..n), sdram_buffer04[n..2n)
 *   sdram_buffer03, sdram_buffer13 -> sdram_buffer06[0..n), sdram_buffer06[n..2n)
 * where n is the element count of sdram_buffer01.
 */
void test_func04( void )
{
	int pass;
	int elem_count = sizeof(sdram_buffer01) / sizeof(sdram_buffer01[0]);

	for (pass = 0; pass != 10; ++pass) {
		copy_float_size(sdram_buffer01, sdram_buffer02, &sdram_buffer04[0], &sdram_buffer04[elem_count], elem_count);
		copy_float_size(sdram_buffer03, sdram_buffer13, &sdram_buffer06[0], &sdram_buffer06[elem_count], elem_count);
	}
}

mcasp_playback_custom02.zip

The project is based on mcu_plus_sdk_am62ax_10_00_00_14. The results were the same with v11.1. At C7x = 1000 MHz, the AUDIO-AM62D-EVM took 1.15x as many cycles as the SK-AM62A-LP for test_func03 and 1.06x as many for test_func04.

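Measured cycle counts (CCS Profile clock):

| Test        | C7x clock | SK-AM62A-LP | AUDIO-AM62D-EVM |
|-------------|-----------|-------------|-----------------|
| test_func03 | 1000 MHz  | 368680012   | 425624480       |
| test_func03 | 500 MHz   | 206590725   | 225976894       |
| test_func04 | 1000 MHz  | 461088870   | 488769638       |
| test_func04 | 500 MHz   | 250512996   | 261783175       |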

Best regards,
O.H

  • Hi O.H,
    Thank you for creating a separate thread. I am trying to reproduce this on my setup and will get back to you with my findings.


    Thanks,
    Shreyansh

  • Hi Shreyansh,

    Thank you for your support. The following is additional information.

    Q3: Why does the number of cycles differ depending on the SDK even when performing the same process?

    Details are below.

    The number of cycles differs depending on the SDK used. For a given SDK, the number of cycles is the same on both EVMs.

    | SDK                              | CGT                    | Cycles at 1000 MHz (first loop → stable from the second loop onwards) |
    |----------------------------------|------------------------|------------------------------------------------------------------------|
    | mcu_plus_sdk_am62ax_10_00_00_14  | ti-cgt-c7000_4.1.1.LTS | 8931 → 1242                                                            |
    | freertos_sdk_am62dx_10_01_00_36  | ti-cgt-c7000_4.1.1.LTS | 15545 → 3794                                                           |
    | freertos_sdk_am62dx_11_01_00_16  | ti-cgt-c7000_5.0.0.LTS | 18093 → 5050                                                           |

    The following "test_func05()" was added to the project shared last time.

    /* Per-sample gain step added to / subtracted from val_now in test_func05 (approximately 1/960). */
    #define TEMP_MACRO01	0.00104166666667f
    /* Final gain value of each of the 4 channels, written at the end of each channel pass. */
    float g_temp_float01[4]; 
    /* Per-channel input buffers read by test_func05. */
    double gd_temp_buf01[4][SAMPLE_NUM];
    double gd_temp_buf02[4][SAMPLE_NUM];
    /* Accumulators: every channel of gd_temp_buf01 / gd_temp_buf02 is added into these. */
    double gd_temp_buf03[SAMPLE_NUM];
    double gd_temp_buf04[SAMPLE_NUM];
    /* Per-channel selector: 0 takes the ramp-up/unity branch, non-zero the ramp-down branch. */
    unsigned short g_input[4];
    
    /*
     * test_func05 - accumulate 4 channels of gd_temp_buf01 / gd_temp_buf02 into
     * gd_temp_buf03 / gd_temp_buf04, with an optional per-sample gain ramp.
     *
     * For each channel the gain val_now starts at 1.0f and one of three loops is
     * selected from g_input[ch] and val_now:
     *   - g_input[ch] == 0 and val_now <  1.0f : ramp up by TEMP_MACRO01 per sample, clamped at 1.0f
     *   - g_input[ch] == 0 and val_now >= 1.0f : plain accumulation (no gain multiply)
     *   - g_input[ch] != 0 and val_now >  0.0f : ramp down by TEMP_MACRO01 per sample, clamped at 0.0f
     * The final val_now of each channel is stored in g_temp_float01[ch].
     *
     * As written, g_input[] is zeroed on entry and val_now is reset to 1.0f for
     * every channel, so only the plain accumulation loop can execute; both ramp
     * loops are unreachable unless g_input[] is modified by something outside
     * this function while it runs.
     * NOTE(review): presumably this is a reduced version of production code kept
     * in this shape for the cycle-count comparison - confirm before simplifying.
     */
    void test_func05( void )
    {
    	float val_now;
    	int ch, num;
    
    	/* Force every channel onto the g_input[ch] == 0 path. */
    	g_input[0] = 0;
    	g_input[1] = 0;
    	g_input[2] = 0;
    	g_input[3] = 0;
    
    	for(ch=0; ch<4; ch++) {
    		val_now = 1.0f;
    		if (g_input[ch] == 0) {
    			/* Always false here: val_now was just set to 1.0f. */
    			if (val_now < 1.0f) {
    					/* Ramp-up: raise the gain each sample, clamp at 1.0f, then scale and accumulate. */
    					for(num=0; num<SAMPLE_NUM; num++) {
    					val_now = val_now + TEMP_MACRO01;
    					if (val_now > 1.0f) {
    						val_now = 1.0f;
    					}
    					gd_temp_buf03[num] += (double)val_now*gd_temp_buf01[ch][num];
    					gd_temp_buf04[num] += (double)val_now*gd_temp_buf02[ch][num];
    				}
    			}
    			else {
    				/* Unity gain: accumulate the channel without a multiply. This is the loop that runs. */
    				for(num=0; num<SAMPLE_NUM; num++) {
    					gd_temp_buf03[num] += gd_temp_buf01[ch][num];
    					gd_temp_buf04[num] += gd_temp_buf02[ch][num];
    				}
    			}
    		}
    		else {
    			if (val_now > 0.0f) {
    				/* Ramp-down: lower the gain each sample, clamp at 0.0f, then scale and accumulate. */
    				for(num=0; num<SAMPLE_NUM; num++) {
    					val_now = val_now - TEMP_MACRO01;
    					if (val_now < 0.0f) {
    						val_now = 0.0f;
    					}
    					gd_temp_buf03[num] += (double)val_now*gd_temp_buf01[ch][num];
    					gd_temp_buf04[num] += (double)val_now*gd_temp_buf02[ch][num];
    				}
    			}
    		}
    		/* Record the gain this channel ended on. */
    		g_temp_float01[ch] = val_now;
    
    	}
    }
    

    Our ultimate goal is to obtain results with AM62D that are equivalent to those of AM62A.

    Best regards,
    O.H

  • Hi O.H,

    Thank you for sharing the project. I am looking into the differences. 

    Thanks,
    Shreyansh