Disqualified loop: Loop contains control code

Francisco Igual

Hello,

I'm trying to optimize a small loop, but the compiler (version 7.3.1) reports it as a disqualified loop ("Loop contains control code"). I would like to understand why this is happening, if anyone can help. Here is the loop:

// Inner loop of the kernel: each iteration loads four floats from ptr_a and
// four from ptr_x, multiplies the x values by temp_v2, adds the products to
// the a values and stores the four results back through ptr_a.
// NOTE(review): i__, i__2, temp_v2 and the vector temporaries are declared
// outside this excerpt; the step of 4 presumably requires i__2 to be a
// multiple of 4 -- confirm against the caller.
for (i__ = 0; i__ < i__2; i__ += 4) { // Loop to vectorize (i__ += 4)
        // Load elements of a (two 8-byte loads packed into one 128-bit value)
        a_v    = _dto128( _memd8_const( (void*) ptr_a ),
                          _memd8_const( (void*) (ptr_a + 2)) );

        // Load elements of x
        x_v    = _dto128( _memd8_const((void*) ptr_x),
                          _memd8_const((void*) (ptr_x + 2) ) );

        // Multiply by temp and accumulate
        temp_vp = _qmpysp( x_v, temp_v2 );

        // The add is done on the two 64-bit halves separately.
        temp_a_upper = _daddsp( _hif2_128( temp_vp ), _hif2_128( a_v ) );
        temp_a_lower = _daddsp( _lof2_128( temp_vp ), _lof2_128( a_v ) );

        // Write the four updated values back over the elements just read.
        *ptr_a     = _lof2( temp_a_upper );
        *(ptr_a+1) = _hif2( temp_a_upper );
        *(ptr_a+2) = _lof2( temp_a_lower );
        *(ptr_a+3) = _hif2( temp_a_lower );

        // Advance both pointers to the next group of four elements.
        ptr_a += 4;
        ptr_x += 4;
}

And here is the fragment of the .asm file, if it helps:

$C$DW$L$my_sger_$4$E:
;*----------------------------------------------------------------------------*
;*   SOFTWARE PIPELINE INFORMATION
;*      Disqualified loop: Loop contains control code
;*----------------------------------------------------------------------------*
$C$L2:
$C$DW$L$my_sger_$5$B:
           LDNDW   .D1T1   *+A20(8),A5:A4    ; |62|
           LDNDW   .D1T1   *A20,A7:A6        ; |62|
           LDNDW   .D2T2   *+B10(8),B5:B4    ; |57|
           LDNDW   .D2T2   *B10,B9:B8        ; |57|
           ADD     .L1     4,A3,A3           ; |53|
           CMPLT   .L1     A3,A21,A0         ; |53|
           QMPYSP .M1     A7:A6:A5:A4,A11:A10:A9:A8,A7:A6:A5:A4 ; |66|
           NOP             3
           DADDSP .L1X    A7:A6,B9:B8,A7:A6 ; |71|

           DADDSP .L2X    A5:A4,B5:B4,B5:B4 ; |73|
|| [ A0]   ADD     .L1     4,A26,A4
|| [ A0]   B       .S1     $C$L5             ; |53|

   [ A0]   LDW     .D1T1   *A4,A5

   [ A0]   B       .S1     $C$L2
||         STW     .D2T1   A6,*B10           ; |71|

           STW     .D2T2   B5,*+B10(12)      ; |74|
           STW     .D2T2   B4,*+B10(8)       ; |73|

           STW     .D2T1   A7,*+B10(4)       ; |72|
||         ADDK    .S2     16,B10            ; |76|

           ; BRANCHCC OCCURS {$C$L5}         ; |53|

Can anybody help me with this? Thanks.

Fran

over 13 years ago

0 Archaeologist over 13 years ago

TI__Guru* 84285 points

We'll need a more complete test case. Please provide a compilable function which demonstrates the problem. Be sure to include the exact command-line option used.

When I wrap your code in the following function, the compiler is able to software pipeline with the options -o -mv6600:

#include <c6x.h>

void func(double *ptr_a, double *ptr_x, int i__2, __x128_t temp_v2)
{
    int i__;
    __x128_t a_v, x_v, temp_vp; 
    __float2_t temp_a_upper, temp_a_lower;
   [.. your code here..]
}

0 Francisco Igual over 13 years ago in reply to Archaeologist

Prodigy 60 points

Attached is the complete source file. The problematic loop is nested inside an outer one.

Thanks,

Fran

Fullscreen 3113.my_sger.c Download

#include "c6x.h"

/*
 * my_sger_ - single-precision rank-1 update kernel written with TI C66x
 * vector intrinsics.
 *
 * For each j in [0, *n) the routine replicates (*alpha) * y[j] across a
 * 128-bit vector, multiplies it with four consecutive elements of x, adds
 * the products to four consecutive elements of a, and stores the four sums
 * back into a. The inner loop repeats this in steps of four up to *m.
 *
 * Parameters:
 *   m     - pointer to the inner-loop bound (elements processed per j).
 *   n     - pointer to the outer-loop bound (number of y elements used).
 *   alpha - pointer to the scalar applied to every y element.
 *   x     - input vector, re-read from x[0] for every j.
 *   incx  - not read by this implementation.
 *   y     - input vector, read with unit stride.
 *   incy  - not read by this implementation.
 *   a     - array updated in place.
 *   lda   - not read by this implementation.
 *
 * Returns: always 0.
 *
 * NOTE(review): the inner loop advances by 4 and touches 4 elements per
 * iteration, so when *m is not a multiple of 4 the last iteration accesses
 * elements beyond index *m - 1 of x and of the current part of a.
 * Presumably callers guarantee *m % 4 == 0 -- confirm.
 * NOTE(review): ptr_a is set once before the outer loop and only ever
 * incremented, so successive j iterations update back-to-back regions of a;
 * this looks like it assumes a leading dimension equal to *m and unit
 * strides for x and y -- verify against callers, since incx, incy and lda
 * are ignored.
 */
int my_sger_(int *m, int *n, float *alpha, float * restrict x,
			 int *incx, float * restrict y, int *incy, float * restrict a, int *lda)
{
	/* System generated locals */
	int n_, m_;

	/* Local variables */
	int i, j, jy;

	__x128_t alpha_v, yscaled_v, temp_vp, y_v, a_v, x_v;
	__float2_t temp_a_lower, temp_a_upper;

	float * ptr_a;
	float * ptr_x;

	float value_y;
	float value_alpha = *alpha;

	/* Index of the next y element; advanced by one per outer iteration. */
	jy = 0;

	/* Copy the loop bounds into locals once, before the loops. */
	n_ = *n;
	m_ = *m;

	/* ptr_a walks through a continuously; it is not reset per j. */
	ptr_a = &a[0];

	/* Four copies of alpha packed into one 128-bit value. */
	alpha_v = _fto128( value_alpha, value_alpha, value_alpha, value_alpha );

	for (j = 0; j < n_; j++) {

		/* Replicate y[j] four times and pre-multiply by alpha (loop-invariant for i). */
		value_y   = y[jy++];
		y_v       = _fto128( value_y, value_y, value_y, value_y );
		yscaled_v = _qmpysp( alpha_v, y_v );

		/* x is traversed from the start for every j. */
		ptr_x = &x[0];

		for (i = 0; i < m_; i += 4) {  // Vectorized (i += 4)

			// Load elements of a
			a_v    = _dto128( _memd8_const( (void*) ptr_a ),
					          _memd8_const( (void*) (ptr_a + 2)) );


			// Load elements of x
			x_v    = _dto128( _memd8_const((void*) ptr_x),
					          _memd8_const((void*) (ptr_x + 2) ) );

			// Multiply by yscaled and accumulate
			temp_vp = _qmpysp( x_v, yscaled_v );

			// The add is performed on the upper and lower 64-bit halves separately.
			temp_a_upper = _daddsp( _hif2_128( temp_vp ), _hif2_128( a_v ) );
			temp_a_lower = _daddsp( _lof2_128( temp_vp ), _lof2_128( a_v ) );

			// Store the four sums back over the elements of a that were loaded above.
			*ptr_a     = _lof2( temp_a_upper );
			*(ptr_a+1) = _hif2( temp_a_upper );
			*(ptr_a+2) = _lof2( temp_a_lower );
			*(ptr_a+3) = _hif2( temp_a_lower );

			// Advance to the next group of four elements.
			ptr_a += 4;
			ptr_x += 4;

		}
	}

	return 0;

}

0 Archaeologist over 13 years ago in reply to Francisco Igual

TI__Guru* 84285 points

When I compile your test case, the loop is not software pipelined, but for a different reason than shown in your post. What are your command-line options?

0 Francisco Igual over 13 years ago in reply to Archaeologist

Prodigy 60 points

Thanks for the prompt answer.

Attached is the .asm file I get, and here is the command line (the same result for version 7.3.1):

"/opt/ti/C6000CGT7.4.0B2/bin/cl6x" -mv6600 --abi=eabi -O2 --display_error_number --diag_warning=225 --openmp --gen_profile_info -k -z -m"Optimize_SGER.map" -i"/opt/ti/C6000CGT7.4.0B2/lib" -i"/opt/ti/C6000CGT7.4.0B2/include" -i"/opt/ti/edma3_lld_02_11_05_02/packages/ti/sdo/edma3/rm/lib/c6678-evm/66/release" -i"/opt/ti/edma3_lld_02_11_05_02/packages/ti/sdo/edma3/drv/lib/66/release" -i"/opt/ti/edma3_lld_02_11_05_02/packages/ti/sdo/edma3/drv/sample/lib/c6678-evm/66/release" -i"/home/dsps/projects/blasLevel3_Aug2012/lib/C66/" --reread_libs --warn_sections --rom_model -o "Optimize_SGER.out" -l"./configPkg/linker.cmd" "./src/sger.obj" "./src/my_sger.obj" "./src/main.obj" "./src/static/xerbla.obj" "./src/util/z_div.obj" "./src/util/r_imag.obj" "./src/util/r_cnjg.obj" "./src/util/d_imag.obj" "./src/util/d_cnjg.obj" "./src/util/c_div.obj" -ledma3_lld_drv.ae66 -ledma3_lld_rm.ae66 -ledma3_lld_drv_sample.ae66 -l"libc.a"

Thanks,

Fran

6644.my_sger.asm

0 Archaeologist over 13 years ago in reply to Francisco Igual

TI__Guru* 84285 points

It is the option --gen_profile_info which causes the compiler to insert control code into the innermost loop, preventing software pipelining. If you remove that option, it will pipeline just fine. I don't know whether this is expected behavior. I've submitted SDSCM00045395 to gather more information about this issue.

0 Francisco Igual over 13 years ago in reply to Archaeologist

Prodigy 60 points

Thanks, that solved the problem!

Code Composer Studio™︎

Code Composer Studio forum

Disqualified loop: Loop contains control code