Part Number: TDA2
Hi,
I am trying to implement a simple mean filter with a 3x3 window. When I use npt(), I get the same result as the host emulation (Visual Studio, vcop.h library), but when I change npt() to deinterleave()/interleave(), the result is different.
__vptr_uint16 bufx,
__vptr_uint8 output,
w = 640;
npt loops:
/* npt() variant: for every I1 in [0, blockH) and every I2 in
 * [0, w / VCOP_SIMD_WIDTH), loads three vectors from bufx, sums them,
 * scales the sum and stores one vector to output. */
for (int I1 = 0; I1 < blockH; I1++)
{
for (int I2 = 0; I2 < w / VCOP_SIMD_WIDTH; I2++)
{
/* Offset into output, scaled by sizeof(*output): I1 steps by w elements,
 * I2 steps by VCOP_SIMD_WIDTH elements. */
__agen indexOut = I1 * w * sizeof(*output) + I2 * sizeof(*output) * VCOP_SIMD_WIDTH;
/* Offset into bufx, scaled by sizeof(*bufx): I1 steps by bufWidth elements
 * (bufWidth is defined outside this excerpt), I2 by VCOP_SIMD_WIDTH elements. */
__agen index = I1 * bufWidth * sizeof(*bufx) + I2 * sizeof(*bufx) * VCOP_SIMD_WIDTH;
/* Three loads at the same index from bases bufx, bufx + 2 and bufx + 4. */
Vec10 = (bufx)[index].npt();
Vec20 = (bufx + 2)[index].npt();
Vec30 = (bufx + 4)[index].npt();
/* Accumulate the three loaded vectors into Vec10. */
Vec10 += Vec20 + Vec30;
/* Scale the sum by VecMul, then shift by Vshift (both defined outside this
 * excerpt). NOTE(review): presumably a fixed-point divide for the mean,
 * with Vshift selecting a right shift -- confirm. */
Vec20 = Vec10 * VecMul;
Vec30 = Vec20 << Vshift;
/* Store the result vector at the output offset. */
output[indexOut].npt() = Vec30;
}
}
deinterleave(), interleave() loops:
/* deinterleave()/interleave() variant: for every I1 in [0, blockH) and every
 * I2 in [0, w / VCOP_2SIMD_WIDTH), loads three vector pairs from bufx, sums
 * them pairwise, scales each sum and stores the pair to output. */
for (int I1 = 0; I1 < blockH; I1++)
{
for (int I2 = 0; I2 < w / VCOP_2SIMD_WIDTH; I2++)
{
/* Offset into output, scaled by sizeof(*output): I1 steps by w elements,
 * I2 steps by VCOP_2SIMD_WIDTH elements. */
__agen indexout = I1 * w * sizeof(*output) + I2 * sizeof(*output) * VCOP_2SIMD_WIDTH;
/* Offset into bufx, scaled by sizeof(*bufx): I1 steps by bufWidth elements
 * (bufWidth is defined outside this excerpt), I2 by VCOP_2SIMD_WIDTH elements. */
__agen index0 = I1 * bufWidth * sizeof(*bufx) + I2 * sizeof(*bufx) * VCOP_2SIMD_WIDTH;
/* Three paired loads at the same index from bases bufx,
 * bufx + sizeof(*bufx) and bufx + 2 * sizeof(*bufx).
 * NOTE(review): if sizeof(*bufx) is 2 (bufx is declared __vptr_uint16),
 * these bases equal bufx + 2 and bufx + 4 -- confirm.
 * NOTE(review): presumably deinterleave() splits alternating elements
 * between the two destination vectors -- verify against the Kernel-C docs. */
(Vec10, Vec11) = (bufx)[index0].deinterleave();
(Vec20, Vec21) = (bufx + sizeof(*bufx))[index0].deinterleave();
(Vec30, Vec31) = (bufx + 2 * sizeof(*bufx))[index0].deinterleave();
/* Accumulate each half of the pair separately. */
Vec10 += Vec20 + Vec30;
Vec11 += Vec21 + Vec31;
/* Scale and shift each half. VecMul1 and Vshift1 are not shown in this
 * excerpt. NOTE(review): they presumably must hold the same values as
 * VecMul and Vshift for both halves to be scaled alike -- confirm they
 * are initialized. */
Vec20 = Vec10 * VecMul;
Vec21 = Vec11 * VecMul1;
Vec30 = Vec20 << Vshift;
Vec31 = Vec21 << Vshift1;
/* Store the two result vectors as a pair at the output offset. */
output[indexout].interleave() = (Vec30, Vec31);
}
}
Where is the problem? Thank you for your help.