Hi,
I want to use C66x intrinsics to optimise my code.
Below is the C code that I want to optimise using DSP intrinsics.
I am new to DSP intrinsics, so I do not yet know which intrinsic to use for the logic below.
/* Test input: 40 bytes holding the values 1..40. */
const uint8_t src[40] = {
     1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
    11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
    21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
    31, 32, 33, 34, 35, 36, 37, 38, 39, 40
};
/* Offset in bytes added to a line pointer to reach the following line. */
uint32_t width = 8;
/* Per-sample weights; each one multiplies a single source byte.
 * NOTE(review): the names suggest interpolation coefficients -- confirm. */
uint32_t axay1_6 = 112345;
uint32_t axay2_6 = 123456;
uint32_t axay3_6 = 134567;
uint32_t axay4_6 = 145678;
C code:
uint8_t const *cLine = src;
uint8_t const *nLine = cLine + width;
uint32_t res = 0;
const uint32_t a1 = (*cLine++) * axay1_6;
const uint32_t a3 = (*nLine++) * axay3_6;
res = a1 + a3;
const uint32_t a2 = (*cLine) * axay2_6;
const uint32_t a4 = (*nLine) * axay4_6;
res += a2 + a4;
C66x intrinsics:
const uint8_t *Ix00, *Ix01, *Iy00,*Iy01; uint32_t in1,in2; uint64_t l1, l2; __x128_t axay1_6 = _dup32_128(axay1_6); //112345 112345 112345 112345 __x128_t axay2_6 = _dup32_128(axay2_6); //123456 123456 123456 123456 __x128_t axay3_6 = _dup32_128(axay3_6); //134567 134567 134567 134567 __x128_t axay4_6 = _dup32_128(axay4_6); //145678 145678 145678 145678 Ix00 = src ; Ix01 = Ix00 + 1 ; Iy00 = src + width; Iy01 = Iy00 + 1; int64_t I_00 = _mem8_const(Ix00); //00 01 02 03 04 05 06 07 int64_t I_01 = _mem8_const(Ix01); //01 02 03 04 05 06 07 08 int64_t I_10 = _mem8_const(Iy00); //10 11 12 13 14 15 16 17 int64_t I_11 = _mem8_const(Iy01); //11 12 13 14 15 16 17 18 in1 = _loll(I_00); //00 01 02 03 l1 = _unpkbu4(in1); //00 01 02 03 (16x4) in2 = _hill(I_00); //04 05 06 07 l2 = _unpkbu4(in2); //04 05 06 07 (16x4)
Here I want an __x128_t register holding four 32-bit values that contain the data "00 01 02 03".
Then I can multiply one __x128_t register by another and get an __x128_t result. At present I am planning to use _qmpy32.
I am new to C66x DSP intrinsics.
Can you tell me which intrinsic is suitable for getting an __x128_t register holding four 32-bit values with 00 01 02 03?
(That is, how do I convert 16-bit values to 32-bit values using DSP intrinsics?)