I am running the attached code on a DSP core (C66) on the AM5728.
This code was purchased from TI. We have unit test code that uses this function that was developed on a 6482 and is my working baseline. I am now porting this code to the AM5728. The function in question is LTELIB_genRefSignals_shft(). For a given set of inputs,
numSubCarrAlloc = 36
N_ZC = 31
u = 1
v = 0
alpha = 10
I should get an output vector in OutputSequence[].
That output vector changes depending on the optimizer setting. Opt_level = off, 0, 1 works, Opt_level = 2,3 doesn't. The sequence changes, quite dramatically actually.
Working vector: 0 32766 -32537 3864 5508 -32300 21343 24860 30322 -12407 31089 10332
-25906 -20049 5509 32288 27192 -18257 -18711 -26875 -31955 7138 -17778 27490
-6582 32064 -7659 31818 -20456 25540 -32673 1668 -11891 -30483 31384 -9277
Broken vector: 0 32766 -32537 3864 17796 -32768 21343 24860 -30322 -12407 31089 10332
0 0 3 1 32767 -14864 0 0 -31955 7138 -17778 27490
-6582 32064 -7659 31818 -32768 26573 -32673 1668 -11891 -30483 31384 -9277
Sure, I can turn the optimizer off for this file, but we have dozens of files in this library that use intrinsics and could potentially be broken without my knowing it. I am hoping to identify the offending code so that, through inspection, we can find other possible offenders.
/**
* \file LTELIB_genUL_PUSCH_demodRS.c
*
* \brief Functions to generate UL PUSCH demodulator reference signal
*
* Copyright (c) Texas Instruments Incorporated 2007
*
* Use of this software is controlled by the terms and conditions found in the
* license agreement under which this software has been supplied or provided.
*
*/
#include "swpform.h"
#include "LTELIB_genUL_PUSCH_demodRS.h"
#include "table_short_PUSCH_ref_sig.h"
#include "table_sin_cos.h"
#ifdef _TMS320C6X
#include "C6x.h"
#endif
#define RESETINTV 16 /* 0 means never, 1 means always */
/**
* \fn LTELIB_genRefSignals(
* OUT cplx16_t OutputSequence[RESTRICT],
* IN Uint16 numSubCarrAlloc,
* IN Uint16 N_ZC,
* IN Uint16 u,
* IN Uint16 v);
*
* \brief Generates the UL PUSCH demodulator reference signal excluding the shift with the factor alpha *
*
* \param[out] OutputSequence
* Pointer to a vector where the uplink demodulation reference signal is stored. The length of
* the vector is the numSubCarrAlloc entries.
*
* \param[in] numSubCarrAlloc
* Number of subcarriers allocated to the allocation.
*
*
* \param[in] N_ZC
* Number of Zadoff-Chu sequence elements; related to numSubCarrAlloc as the largest prime number
* less than numSubCarrAlloc.
*
*
* \param[in] u
* Base sequence group number .
*
* \param[in] v
* Base sequence number within the group.
*
* \pre none
*
* \post none
*
* \sa none
*
*/
void LTELIB_genRefSignals(
OUT cplx16_t OutputSequence[RESTRICT],
IN Uint16 numSubCarrAlloc,
IN Uint16 N_ZC,
IN Uint16 u,
IN Uint16 v)
{
/*
* Implementation outline:
* 1. derive the root index q from u, v and N_ZC;
* 2. compute a normalised fixed-point reciprocal of N_ZC (kept in a1, exponent in expn);
* 3. look up the starting phasor d in the coarse/fine cos-sin tables;
* 4. generate entries 1 .. N_ZC/2 by repeated complex multiplies, writing each value to
* index i, to the mirrored index N_ZC-1-i and, while i + N_ZC < numSubCarrAlloc,
* to the extension index i + N_ZC.
* Every RESETINTV entries c and d are recomputed directly from the tables instead of
* being carried forward (presumably to limit accumulated rounding error).
*/
Int32 expn1,expn;
Int32 normal;
Uint32 a, b;
Int32 i;
Uint32 xr, mask;
Int32 c,d,k;
Uint32 a2;
Uint32 a1;
Int32 cscoartab, csfinetab;
Uint32 CosSinIndx;
Int32 rc;
Uint32 ttmp1, ttmp2;
Uint64 lltemp;
Int32 * RESTRICT tempPtr1, * RESTRICT tempPtr2;
Uint16 q;
Int32 qbar;
// qbar=_mpyhl(ONE_OVER_31,u+1)+1;
// qbar=_mpylir(u+1,ONE_OVER_31)+1;
/* qbar = N_ZC * (u + 1) * ONE_OVER_31 in fixed point; u + 1 is normalised first. */
/* ONE_OVER_31 is defined outside this file (presumably a fixed-point 1/31 - confirm). */
u++;
normal = _norm(u);
qbar = _mpyhir(u<<normal,ONE_OVER_31)<<1;
qbar = _mpylir(N_ZC,qbar);
// q= (Uint16) ( (qbar + 0x00004000)>>15); /* Add 1/2 and do floor operator */
q = (Uint16) ( (qbar + (1 << (normal-15)))>>(normal-14)); /* Add 1/2 and do floor operator */
// qbar >>= 14; /* Shift by 14 because I'm doing floor operation after multiplying by 2 */
qbar >>= (normal-15); /* Shift by normal-15 because I'm doing floor operation after multiplying by 2 */
/* The parity of the shifted qbar selects the sign with which v is applied to q. */
if( (qbar & 0x00000001) == 0) /* even ? */
{
q += v;
}
else
{
q -= v;
}
/* Reciprocal of N_ZC: 15 conditional-subtract steps of 0x80000000 by the normalised divisor. */
normal = _norm(N_ZC);
a = (N_ZC << normal) & 0x7fff0000;
b = 0x80000000; /* dividend = 1 */
#ifdef _TMS320C6X
#pragma MUST_ITERATE(15,15);
#endif
for( i = 15; i > 0; i-- )
{
b = _subc(b,a); /* divide */
}
/* Work with remainder to improve precision */
xr = ( b & 0x7fff ) << ( normal - 14 );
expn = _norm(xr) + 1;
xr <<= expn;
b = ( b & 0xffff0000 ) >> 1;
expn1 = _norm(b);
b = ( b << expn1 ) & 0xffff0000;
/* Second division pass on the remainder of the first pass. */
/* NOTE: this count-down loop leaves i == 0; the tail loop at the end of the function relies on that. */
#ifdef _TMS320C6X
#pragma MUST_ITERATE(15,15);
#endif
for( i = 15; i > 0; i-- )
{
b = _subc(b,a); /* divide */
}
/* Align the second-pass bits below the first-pass bits and merge them into xr. */
b = _sshvl( b & 0xffff, normal + expn - expn1 - 28 );
xr |= b;
a1 = xr; /* kept for the direct recomputations inside the main loop */
/* q * a1 as a 64-bit product, re-aligned by expn into one 32-bit word used as the table index. */
lltemp = _mpy32u(xr, q);
ttmp1 = _hill(lltemp) << ( 33 - expn);
ttmp2 = _loll(lltemp) >> (expn - 1);
xr = ttmp1 | ttmp2;
/* Top HALFTABLERESOL bits index the coarse table, the next HALFTABLERESOL bits the fine table. */
CosSinIndx = xr >> 32 - HALFTABLERESOL;
cscoartab = _amem4( (Int32 *)CosSinCoarse + CosSinIndx );
mask = 0xffffffff>>32 - HALFTABLERESOL;
CosSinIndx = (xr >> ( 32-(HALFTABLERESOL << 1 ) ) ) & mask;
csfinetab = _amem4( (Int32 *)CosSinFine + CosSinIndx );
/* d = complex product of the coarse and fine entries; which 16-bit halves are negated depends on endianness. */
#ifdef _LITTLE_ENDIAN
d = _ssub2(0, _cmpyr1(cscoartab, csfinetab));
#else
d = _cmpyr1(cscoartab, csfinetab);
d = _packhl2(d, _ssub2(0, d));
#endif
k = d; /* constant factor applied to d on every recursion step */
c = 0x7fff0000; /* starting value: 0x7fff in the upper half-word, 0 in the lower */
/* Entries 0, N_ZC-1 and N_ZC all receive the starting value. */
_mem4(&OutputSequence[0]) = c;
_mem4(&OutputSequence[N_ZC - 1])= c;
_mem4(&OutputSequence[N_ZC]) = c;
rc = 0;
/* Process whole groups of RESETINTV entries while they fit into the first half of the sequence. */
while(( RESETINTV + rc ) <= ( N_ZC>>1 ))
{
tempPtr1 = (Int32 *) &OutputSequence[1 + rc]; /* ascending write position */
tempPtr2 = (Int32 *) &OutputSequence[N_ZC - 2 - rc]; /* mirrored, descending write position */
#ifdef _TMS320C6X
#pragma MUST_ITERATE(RESETINTV - 1);
#endif
for( i = 1 + rc; i < RESETINTV + rc; i++)
{
/* Recursion step: advance the output value by d, then advance d by k. */
c = _cmpyr1(c, d);
d = _cmpyr1(k, d);
if( i + N_ZC < numSubCarrAlloc )
{
_amem4( tempPtr1 + N_ZC) = c; /* extension beyond N_ZC */
}
_amem4(tempPtr1++) = c;
_amem4(tempPtr2--) = c;
}
/* Here i == RESETINTV + rc. Recompute d directly from the tables using q * (i + 1). */
a2 = q * (i + 1);
lltemp = _mpy32u(a2, a1);
ttmp1 = _hill(lltemp) << ( 33 - expn);
ttmp2 = _loll(lltemp) >> (expn - 1);
xr = ttmp1 | ttmp2;
csfinetab = _amem4( (Int32 *)CosSinFine + ((xr >> (32 - (HALFTABLERESOL << 1)) ) & mask ) );
cscoartab = _amem4( (Int32 *)CosSinCoarse + (xr >> 32 - HALFTABLERESOL) );
#ifdef _LITTLE_ENDIAN
d = _ssub2(0, _cmpyr1(cscoartab, csfinetab));
#else
d = _cmpyr1(cscoartab, csfinetab);
d = _packhl2(d, _ssub2(0, d));
#endif
/* Recompute c directly using q * i * (i + 1); the alignment shift is one bit less than for d. */
a2 *= i;
lltemp = _mpy32u(a2, a1);
ttmp1 = _hill(lltemp) << ( 32 - expn);
ttmp2 = _loll(lltemp) >> (expn);
xr = ttmp1 | ttmp2;
csfinetab = _amem4( (Int32 *)CosSinFine + (( xr >> (32 - (HALFTABLERESOL << 1))) & mask ));
cscoartab = _amem4( (Int32 *)CosSinCoarse + (xr >> (32 - HALFTABLERESOL)));
#ifdef _LITTLE_ENDIAN
c = _ssub2(0, _cmpyr1(cscoartab, csfinetab));
#else
c = _cmpyr1(cscoartab, csfinetab);
c = _packhl2(c, _ssub2(0, c));
#endif
/* Store the directly computed entry i, its mirror, and its extension copy if in range. */
_amem4(&OutputSequence[i]) = c;
_amem4(&OutputSequence[N_ZC - i - 1]) = c;
if(i + N_ZC < numSubCarrAlloc)
{
_amem4( &OutputSequence[ i + N_ZC ] ) = c;
}
rc += RESETINTV;
}
/* Tail: remaining entries up to N_ZC/2 by recursion. If the while loop never ran, i is still 0 */
/* from the division loop above, so this loop starts at entry 1. */
for( i++; i <= (N_ZC >> 1); i++ )
{
c = _cmpyr1(c, d);
d = _cmpyr1(k, d);
_amem4(&OutputSequence[i]) = c;
_amem4(&OutputSequence[N_ZC - i - 1]) = c;
if( i + N_ZC < numSubCarrAlloc )
{
_amem4( &OutputSequence[ i + N_ZC ] ) = c;
}
}
return;
}
/**
* \fn void LTELIB_genRefSignals_shft(
* OUT cplx16_t OutputSequence[RESTRICT],
* IN Uint16 numSubCarrAlloc,
* IN Uint16 N_ZC,
* IN Uint16 u,
* IN Uint16 v,
* IN Uint16 alpha);
*
* \brief Generates the UL PUSCH demodulator reference signal including the shift with the factor alpha
*
* \param[out] OutputSequence
* Pointer to a vector where the uplink demodulation reference signal is stored. The length of
* the vector is numSubCarrAlloc entries.
*
* \param[in] numSubCarrAlloc
* Number of subcarriers allocated to the allocation.
*
*
* \param[in] N_ZC
* Number of Zadoff-Chu elements in sequence; related to numSubCarrAlloc as the largest prime number
* less than numSubCarrAlloc.
*
*
* \param[in] u
* Base sequence group number .
*
* \param[in] v
* Base sequence number within the group.
*
* \param[in] alpha
* Cyclic shift in the time domain. Alpha takes integer values between 0 and 11. The cyclic shift is actually
* calculated as exp(j*2*pi*alpha*index/12).
*
* \pre none
*
* \post none
*
* \sa none
*
*/
void LTELIB_genRefSignals_shft(
OUT cplx16_t OutputSequence[RESTRICT],
IN Uint16 numSubCarrAlloc,
IN Uint16 N_ZC,
IN Uint16 u,
IN Uint16 v,
IN Uint16 alpha)
{
    /*
     * Implementation outline:
     *  1. derive the root index q from u, v and N_ZC;
     *  2. compute a normalised fixed-point reciprocal of N_ZC (a1, exponent expn);
     *  3. look up the starting phasor d in the coarse/fine cos-sin tables;
     *  4. build the 12-entry cyclic-shift table ShiftSequence[] from alpha;
     *  5. generate entries 1 .. N_ZC/2 by repeated complex multiplies and store each one,
     *     multiplied by the ShiftSequence entry of its output index modulo 12, at index i,
     *     at the mirrored index N_ZC-1-i and (while i + N_ZC < numSubCarrAlloc) at i + N_ZC.
     *
     * Every MUST_ITERATE pragma below must state the true trip count of its loop: the
     * optimizer relies on it at -O2/-O3 and may generate wrong code if it is inaccurate.
     */
    Int32 expn1,expn;
    Int32 normal, normal1;
    Uint32 a, b;
    Int32 i;
    Uint32 xr, mask;
    Int32 c,d,k;
    Uint32 a2;
    Uint32 a1;
    Int32 cscoartab, csfinetab;
    Uint32 CosSinIndx;
    Int32 rc;
    Int32 z, s;
    Uint32 ShiftExp;
    Uint32 ttmp1, ttmp2;
    Uint64 lltemp;
    Int32 * RESTRICT tempPtr1, * RESTRICT tempPtr2;
    Int32 MkeMdl12, MkeMdl12h, MkeMdl12o;
    Int32 ShiftSequence[12];
    Uint16 q;
    Int32 qbar;

    /* ---- Root index q -------------------------------------------------------------- */
    /* qbar = N_ZC * (u + 1) * ONE_OVER_31 in fixed point; u + 1 is normalised first.   */
    /* ONE_OVER_31 is defined outside this file (presumably a fixed-point 1/31).        */
    u++;
    normal = _norm(u);
    qbar = _mpyhir(u << normal, ONE_OVER_31) << 1;
    qbar = _mpylir(N_ZC, qbar);
    q = (Uint16) ( (qbar + (1 << (normal-15)))>>(normal - 14)); /* Add 1/2 and do floor operator */
    qbar >>= (normal - 15); /* Shift by normal-15 because I'm doing floor operation after multiplying by 2 */
    /* The parity of the shifted qbar selects the sign with which v is applied to q. */
    if( (qbar & 0x00000001) == 0) /* even ? */
    {
        q += v;
    }
    else
    {
        q -= v;
    }

    /* ---- Reciprocal of N_ZC -------------------------------------------------------- */
    normal = _norm(N_ZC);
    a = (N_ZC << normal) & 0x7fff0000;
    b = 0x80000000; /* dividend = 1 */
#ifdef _TMS320C6X
#pragma MUST_ITERATE(15,15);
#endif
    for(i = 15; i > 0; i--)
    {
        b = _subc(b,a); /* divide */
    }
    /* Work with remainder to improve precision */
    xr = ( b & 0x7fff ) << (normal - 14);
    expn = _norm(xr) + 1;
    xr <<= expn;
    b = ( b & 0xffff0000 ) >> 1;
    expn1 = _norm(b);
    b = (b << expn1) & 0xffff0000;
#ifdef _TMS320C6X
#pragma MUST_ITERATE(15,15);
#endif
    for( i = 15; i > 0; i--)
    {
        b = _subc(b,a); /* divide */
    }
    /* Align the second-pass bits below the first-pass bits and merge them into xr. */
    b = _sshvl( b & 0xffff, normal + expn - expn1 - 28);
    xr |= b;
    a1 = xr; /* kept for the direct recomputations inside the main loop */

    /* ---- Starting phasor d --------------------------------------------------------- */
    /* q * a1 as a 64-bit product, re-aligned by expn into one 32-bit table index word. */
    lltemp = _mpy32u(xr, q);
    ttmp1 = _hill(lltemp) << ( 33 - expn);
    ttmp2 = _loll(lltemp) >> (expn - 1);
    xr = ttmp1 | ttmp2;
    /* Top HALFTABLERESOL bits index the coarse table, the next HALFTABLERESOL bits the fine table. */
    CosSinIndx = xr >> (32 - HALFTABLERESOL);
    cscoartab = _amem4((Int32 *)CosSinCoarse + CosSinIndx);
    mask = 0xffffffff >> (32 - HALFTABLERESOL);
    CosSinIndx = (xr >> (32-(HALFTABLERESOL<<1))) & mask;
    csfinetab = _amem4( (Int32 *)CosSinFine+CosSinIndx );
#ifdef _LITTLE_ENDIAN
    d = _ssub2(0, _cmpyr1(cscoartab, csfinetab));
#else
    d = _cmpyr1(cscoartab, csfinetab);
    d = _packhl2(d, _ssub2(0, d));
#endif
    k = d;          /* constant factor applied to d on every recursion step */
    c = 0x7fff0000; /* starting value: 0x7fff in the upper half-word, 0 in the lower */

    /* ---- Cyclic-shift table ShiftSequence[0..11] ----------------------------------- */
    /* Entry 1 is computed from alpha itself; entries 2..11 from alpha * i in the loop. */
    normal1 = _norm(alpha);
    ShiftExp = alpha << normal1;
    /* ONE_OVER_THREE together with the shift gives the division by 12 in Q32
       (presumably 1/3 times an implicit 1/4 - confirm against the constant's definition). */
    ShiftExp = _mpyhir( ShiftExp, ONE_OVER_THREE ) << (30 - normal1);
    CosSinIndx = ShiftExp >> (32 - HALFTABLERESOL); /* Since alpha is Q32 and normalized to 2*pi it can be used directly as index */
    cscoartab = _amem4( (Int32 *)CosSinCoarse + CosSinIndx);
    CosSinIndx = (ShiftExp >> (32-(HALFTABLERESOL<<1)) ) & mask;
    csfinetab = _amem4( (Int32 *)CosSinFine + CosSinIndx );
#ifdef _LITTLE_ENDIAN
    z = _packh2( _sshvl( (_mpy(cscoartab,csfinetab) - _mpyh(cscoartab,csfinetab)), 1),
                 _sshvl( (_mpyhl(cscoartab,csfinetab) + _mpylh(cscoartab,csfinetab)), 1) );
#else
    z = _packh2( _sshvl( (_mpyh(cscoartab,csfinetab) - _mpy(cscoartab,csfinetab)), 1),
                 _sshvl( (_mpyhl(cscoartab,csfinetab) + _mpylh(cscoartab,csfinetab)), 1) );
#endif
    ShiftSequence[0]= 0x7fff0000;
    ShiftSequence[1]= z;
    /* FIX: the loop runs for i = 2..11, i.e. exactly 10 iterations. The pragma used to
       promise 11, which let the optimizer (-O2/-O3) build code for a trip count the loop
       does not have and corrupted the table. */
#ifdef _TMS320C6X
#pragma MUST_ITERATE(10,10);
#endif
    for( i = 2; i < 12; i++ )
    {
        s = alpha * i;
        normal1 = _norm(s);
        ShiftExp = s << normal1;
        ShiftExp = _mpyhir( ShiftExp, ONE_OVER_THREE ) << (30 - normal1); /* same scaling as for entry 1 */
        CosSinIndx = ShiftExp >> (32 - HALFTABLERESOL); /* Since alpha is Q32 and normalized to 2*pi it can be used directly as index */
        cscoartab = _amem4( (Int32 *)CosSinCoarse + CosSinIndx);
        CosSinIndx = (ShiftExp >> (32-(HALFTABLERESOL<<1)) ) & mask;
        csfinetab = _amem4( (Int32 *)CosSinFine + CosSinIndx );
#ifdef _LITTLE_ENDIAN
        z = _packh2( _sshvl( (_mpy(cscoartab,csfinetab) - _mpyh(cscoartab,csfinetab)), 1),
                     _sshvl( (_mpyhl(cscoartab,csfinetab) + _mpylh(cscoartab,csfinetab)), 1) );
#else
        z = _packh2( _sshvl( (_mpyh(cscoartab,csfinetab) - _mpy(cscoartab,csfinetab)), 1),
                     _sshvl( (_mpyhl(cscoartab,csfinetab) + _mpylh(cscoartab,csfinetab)), 1) );
#endif
        ShiftSequence[i] = z;
    }

    /* Must stay: when N_ZC/2 < RESETINTV the while loop below never runs and the tail
       loop starts from i + 1, which has to be entry 1. */
    i = 0;

    /* ---- Modulo-12 indices into ShiftSequence -------------------------------------- */
    MkeMdl12 = 1;            /* index for output entry i, starting at entry 1 */
    MkeMdl12h = N_ZC - 2;    /* index for mirrored entry N_ZC-1-i, starting at N_ZC-2 */
    while( MkeMdl12h > 0 )
    {
        MkeMdl12h -= 12;
    }
    MkeMdl12h += 12;         /* now in 1..12; 12 only if N_ZC - 2 is a multiple of 12 */
    MkeMdl12o = MkeMdl12h + 3; /* index for extension entry N_ZC + 1 */
    if( MkeMdl12o > 11 )
    {
        MkeMdl12o -= 12;
    }

    /* Entries 0, N_ZC-1 and N_ZC use the starting value c with their own shift factor. */
    _amem4(&OutputSequence[0]) = _cmpyr1(ShiftSequence[0], c);
    if( (MkeMdl12h + 1) < 12)
    {
        _amem4(&OutputSequence[N_ZC - 1]) = _cmpyr1(ShiftSequence[MkeMdl12h + 1], c);
    }
    else
    {
        _amem4(&OutputSequence[N_ZC - 1]) = _cmpyr1(ShiftSequence[MkeMdl12h - 11], c);
    }
    if(( MkeMdl12h + 2 ) < 12)
    {
        _amem4( &OutputSequence[ N_ZC ] ) = _cmpyr1(ShiftSequence[MkeMdl12h + 2], c);
    }
    else
    {
        _amem4( &OutputSequence[ N_ZC ] ) = _cmpyr1(ShiftSequence[MkeMdl12h - 10], c);
    }

    /* ---- Main generation: whole groups of RESETINTV entries ------------------------ */
    rc = 0;
    while( (RESETINTV + rc) <= ( N_ZC >> 1 ) )
    {
        tempPtr1 = (Int32 *) &OutputSequence[1 + rc];        /* ascending write position */
        tempPtr2 = (Int32 *) &OutputSequence[N_ZC - 2 - rc]; /* mirrored, descending write position */
#ifdef _TMS320C6X
#pragma MUST_ITERATE(RESETINTV-1,RESETINTV-1);
#endif
        for( i = 1 + rc; i < RESETINTV + rc; i++ )
        {
            /* Recursion step: advance the unshifted value by d, then advance d by k. */
            c = _cmpyr1(c, d);
            d = _cmpyr1(k, d);
            if( i + N_ZC < numSubCarrAlloc )
            {
                _amem4( tempPtr1 + N_ZC) = _cmpyr1(ShiftSequence[MkeMdl12o++], c);
                if(MkeMdl12o == 12)
                {
                    MkeMdl12o = 0;
                }
            }
            _amem4(tempPtr1++) = _cmpyr1(ShiftSequence[MkeMdl12++], c);
            if(MkeMdl12 == 12)
            {
                MkeMdl12 = 0;
            }
            _amem4(tempPtr2--) = _cmpyr1(ShiftSequence[MkeMdl12h--], c);
            if( MkeMdl12h < 0 )
            {
                MkeMdl12h = 11;
            }
        }
        /* Here i == RESETINTV + rc. Recompute d directly from the tables using q * (i + 1). */
        a2 = q * (i + 1);
        lltemp = _mpy32u(a2, a1);
        ttmp1 = _hill(lltemp) << ( 33 - expn);
        ttmp2 = _loll(lltemp) >> (expn - 1);
        xr = ttmp1 | ttmp2;
        csfinetab = _amem4( (Int32 *)CosSinFine + ((xr >> (32 - (HALFTABLERESOL << 1)) ) & mask ) );
        cscoartab = _amem4( (Int32 *)CosSinCoarse + (xr >> (32 - HALFTABLERESOL)) );
#ifdef _LITTLE_ENDIAN
        d = _ssub2(0, _cmpyr1(cscoartab, csfinetab));
#else
        d = _cmpyr1(cscoartab, csfinetab);
        d = _packhl2(d, _ssub2(0, d));
#endif
        /* Recompute c directly using q * i * (i + 1); the alignment shift is one bit less than for d. */
        a2 *= i;
        lltemp = _mpy32u(a2, a1);
        ttmp1 = _hill(lltemp) << ( 32 - expn);
        ttmp2 = _loll(lltemp) >> (expn);
        xr = ttmp1 | ttmp2;
        csfinetab = _amem4( (Int32 *)CosSinFine + (( xr >> (32 - (HALFTABLERESOL << 1))) & mask ));
        cscoartab = _amem4( (Int32 *)CosSinCoarse + (xr >> (32 - HALFTABLERESOL)));
#ifdef _LITTLE_ENDIAN
        c = _ssub2(0, _cmpyr1(cscoartab, csfinetab));
#else
        c = _cmpyr1(cscoartab, csfinetab);
        c = _packhl2(c, _ssub2(0, c));
#endif
        /* Store the directly computed entry i: extension copy (if in range), entry, mirror. */
        if(i + N_ZC < numSubCarrAlloc)
        {
            _amem4( &OutputSequence[ i + N_ZC ] ) = _cmpyr1(ShiftSequence[MkeMdl12o++], c);
            if(MkeMdl12o == 12)
            {
                MkeMdl12o = 0;
            }
        }
        _amem4(&OutputSequence[i]) = _cmpyr1(ShiftSequence[MkeMdl12++], c);
        if(MkeMdl12 == 12)
        {
            MkeMdl12 = 0;
        }
        _amem4(&OutputSequence[N_ZC - i - 1]) = _cmpyr1(ShiftSequence[MkeMdl12h--], c);
        if(MkeMdl12h < 0)
        {
            MkeMdl12h = 11;
        }
        rc += RESETINTV;
    }

    /* ---- Tail: remaining entries up to N_ZC/2 by recursion ------------------------- */
    for( i++; i <= ( N_ZC >> 1); i++ )
    {
        c = _cmpyr1(c, d);
        d = _cmpyr1(k, d);
        if(i + N_ZC < numSubCarrAlloc)
        {
            _amem4( &OutputSequence[ i + N_ZC ] ) = _cmpyr1(ShiftSequence[MkeMdl12o++], c);
            if( MkeMdl12o == 12 )
            {
                MkeMdl12o = 0;
            }
        }
        _amem4(&OutputSequence[i]) = _cmpyr1(ShiftSequence[MkeMdl12++], c);
        if( MkeMdl12 == 12 )
        {
            MkeMdl12 = 0;
        }
        _amem4(&OutputSequence[N_ZC - i - 1]) = _cmpyr1(ShiftSequence[MkeMdl12h--], c);
        if( MkeMdl12h < 0 )
        {
            MkeMdl12h = 11;
        }
    }
    return;
}
/**
* \fn LTELIB_genRefSignals_short(
* OUT cplx16_t OutputSequence[RESTRICT],
* IN Uint16 numSubCarrAlloc,
* IN Uint16 SeqIndx);
*
* \brief Generates the UL PUSCH demodulator reference signal for lengths 12 or 24.
*
* \param[out] OutputSequence
* Pointer to a vector where the uplink demodulation reference signal is stored. The length of
* the vector is numSubCarrAlloc entries.
*
* \param[in] numSubCarrAlloc
* Number of subcarriers allocated to the allocation (possible values are 12 or 24).
*
* \param[in] SeqIndx
* Index into the table containing the sequences.
*
*
* \pre none
*
* \post none
*
* \sa none
*
*/
void LTELIB_genRefSignals_short(
OUT cplx16_t OutputSequence[RESTRICT],
IN Uint16 numSubCarrAlloc,
IN Uint16 SeqIndx)
{
    /*
     * Copies one stored reference sequence into OutputSequence using aligned 8-byte
     * accesses: 6 double-words for length 12, 12 double-words for length 24.
     * Any other numSubCarrAlloc leaves OutputSequence untouched.
     */
    Int64 * RESTRICT seqWords;
    Int32 w;

    switch( numSubCarrAlloc )
    {
    case 12:
        /* Each length-12 table entry occupies 6 double-words. */
        seqWords = ((Int64 *) refSeqLen12) + SeqIndx * 6;
#ifdef _TMS320C6X
#pragma MUST_ITERATE(6,6);
#endif
        for( w = 0; w < 6; w++ )
        {
            _amem8( (Int64 *)OutputSequence + w) = _amem8(seqWords + w);
        }
        break;

    case 24:
        /* Each length-24 table entry occupies 12 double-words. */
        seqWords = ((Int64 *) refSeqLen24) + SeqIndx * 12;
#ifdef _TMS320C6X
#pragma MUST_ITERATE(12, 12);
#endif
        for( w = 0; w < 12; w++ )
        {
            _amem8( (Int64 *)OutputSequence + w) = _amem8(seqWords + w);
        }
        break;

    default:
        /* Unsupported length: nothing is written. */
        break;
    }
    return;
}
/**
* \fn LTELIB_genRefSignals_short_shft(
* OUT cplx16_t OutputSequence[RESTRICT],
* IN Uint16 numSubCarrAlloc,
* IN Uint16 SeqIndx,
* IN Uint16 alpha);
*
* \brief Generates the UL PUSCH demodulator reference signal for lengths 12 or 24 with cyclic shift to separate streams.
*
* \param[out] OutputSequence
* Pointer to a vector where the uplink demodulation reference signal is stored. The length of
* the vector is numSubCarrAlloc entries.
*
* \param[in] numSubCarrAlloc
* Number of subcarriers allocated to the allocation (possible values are 12 or 24).
*
* \param[in] SeqIndx
* Index into the table containing the sequences.
*
* \param[in] alpha
* Cyclic shift in the time domain. Alpha takes integer values between 0 and 11. The cyclic shift is actually
* calculated as exp(j*2*pi*alpha*index/12).
*
* \pre none
*
* \post none
*
* \sa none
*
*/
void LTELIB_genRefSignals_short_shft(
OUT cplx16_t OutputSequence[RESTRICT],
IN Uint16 numSubCarrAlloc,
IN Uint16 SeqIndx,
IN Uint16 alpha)
{
    /*
     * Builds the 12-entry cyclic-shift table ShiftSequence[] from alpha, then writes
     * each stored reference-sequence entry multiplied by the shift factor of its index
     * modulo 12. Lengths 12 and 24 are handled; any other numSubCarrAlloc leaves
     * OutputSequence untouched.
     *
     * Every MUST_ITERATE pragma below must state the true trip count of its loop: the
     * optimizer relies on it at -O2/-O3 and may generate wrong code if it is inaccurate.
     */
    Int32 Count;
    Int32 * RESTRICT tempPtr1;
    Int32 ShiftSequence[12];
    Int32 cscoartab, csfinetab;
    Uint32 CosSinIndx;
    Int32 z,s,i;
    Uint32 ShiftExp;
    Int32 normal1;
    Uint32 mask;

    /* ---- Shift factor for index 1 (computed from alpha itself) --------------------- */
    normal1 = _norm(alpha);
    ShiftExp = alpha << normal1;
    /* ONE_OVER_THREE together with the shift gives the division by 12 in Q32
       (presumably 1/3 times an implicit 1/4 - confirm against the constant's definition). */
    ShiftExp = _mpyhir( ShiftExp, ONE_OVER_THREE ) << (30 - normal1);
    /* Top HALFTABLERESOL bits index the coarse table, the next HALFTABLERESOL bits the fine table. */
    CosSinIndx = ShiftExp >> (32 - HALFTABLERESOL); /* Since alpha is Q32 and normalized to 2*pi it can be used directly as index */
    cscoartab = _amem4( (Int32 *)CosSinCoarse + CosSinIndx);
    mask = 0xffffffff >> (32 - HALFTABLERESOL);
    CosSinIndx = (ShiftExp >> (32 - (HALFTABLERESOL << 1)) ) & mask;
    csfinetab = _amem4( (Int32 *)CosSinFine + CosSinIndx );
#ifdef _LITTLE_ENDIAN
    z = _packh2( _sshvl( (_mpy(cscoartab,csfinetab) - _mpyh(cscoartab,csfinetab)), 1),
                 _sshvl( (_mpyhl(cscoartab,csfinetab) + _mpylh(cscoartab,csfinetab)), 1) );
#else
    z = _packh2( _sshvl( (_mpyh(cscoartab,csfinetab) - _mpy(cscoartab,csfinetab)), 1),
                 _sshvl( (_mpyhl(cscoartab,csfinetab) + _mpylh(cscoartab,csfinetab)), 1) );
#endif
    ShiftSequence[0]= 0x7fff0000;
    ShiftSequence[1]= z;

    /* ---- Shift factors for indices 2..11 ------------------------------------------- */
    /* FIX: the loop runs for i = 2..11, i.e. exactly 10 iterations. The pragma used to
       promise 11, which let the optimizer (-O2/-O3) build code for a trip count the loop
       does not have. */
#ifdef _TMS320C6X
#pragma MUST_ITERATE(10,10);
#endif
    for( i = 2; i < 12; i++ )
    {
        s = alpha * i;
        normal1 = _norm(s);
        ShiftExp = s << normal1;
        ShiftExp = _mpyhir( ShiftExp, ONE_OVER_THREE ) << (30 - normal1); /* same scaling as for index 1 */
        CosSinIndx = ShiftExp >> (32 - HALFTABLERESOL); /* Since alpha is Q32 and normalized to 2*pi it can be used directly as index */
        cscoartab = _amem4( (Int32 *)CosSinCoarse + CosSinIndx);
        CosSinIndx = (ShiftExp >> (32 - (HALFTABLERESOL << 1)) )&mask;
        csfinetab = _amem4( (Int32 *)CosSinFine + CosSinIndx );
#ifdef _LITTLE_ENDIAN
        z = _packh2( _sshvl( (_mpy(cscoartab,csfinetab) - _mpyh(cscoartab,csfinetab)), 1),
                     _sshvl( (_mpyhl(cscoartab,csfinetab) + _mpylh(cscoartab,csfinetab)), 1) );
#else
        z = _packh2( _sshvl( (_mpyh(cscoartab,csfinetab) - _mpy(cscoartab,csfinetab)), 1),
                     _sshvl( (_mpyhl(cscoartab,csfinetab) + _mpylh(cscoartab,csfinetab)), 1) );
#endif
        ShiftSequence[i] = z;
    }

    /* ---- Apply the shift to the stored sequence ------------------------------------ */
    if( numSubCarrAlloc == 12 )
    {
        /* Each length-12 table entry occupies 12 32-bit words. */
        tempPtr1 = ((Int32 *) refSeqLen12) + SeqIndx * 12;
#ifdef _TMS320C6X
#pragma MUST_ITERATE(12,12);
#endif
        for( Count = 0; Count < 12; Count++ )
        {
            _amem4( OutputSequence + Count) = _cmpyr1(ShiftSequence[Count], _amem4(tempPtr1 + Count));
        }
    }
    else if( numSubCarrAlloc == 24 )
    {
        /* Each length-24 table entry occupies 24 32-bit words; the shift table repeats every 12. */
        tempPtr1 = ((Int32 *) refSeqLen24) + SeqIndx * 24;
#ifdef _TMS320C6X
#pragma MUST_ITERATE(12,12);
#endif
        for( Count = 0; Count < 12; Count++ )
        {
            _amem4( OutputSequence + Count) = _cmpyr1(ShiftSequence[Count], _amem4(tempPtr1 + Count));
        }
#ifdef _TMS320C6X
#pragma MUST_ITERATE(12,12);
#endif
        for( Count = 12; Count < 24; Count++ )
        {
            _amem4( OutputSequence+Count) = _cmpyr1(ShiftSequence[Count - 12], _amem4(tempPtr1 + Count));
        }
    }
    return;
}