Hi,
Sample snippet:
// First repetition of the loop nest: I1 runs dstBlockHeight times and
// I2 runs dstBlockWidth / VCOP_SIMD_WIDTH times.
for (int I1 = 0; I1 < dstBlockHeight; I1++)
{
for (int I2 = 0; I2 < dstBlockWidth / VCOP_SIMD_WIDTH; I2++)
{
// Source offset: I1 scaled by srcBlockWidth * ELEMINSIZE, plus I2 scaled by
// ELEMINSIZE * VCOP_SIMD_WIDTH.
AddrSrc = I1 * srcBlockWidth * ELEMINSIZE + I2 * ELEMINSIZE * VCOP_SIMD_WIDTH;
// The two loads read from `in` displaced by 5 and 6 times ELEMINSIZE.
VInA = (in + 5 * ELEMINSIZE)[AddrSrc];
VInB = (in + 6 * ELEMINSIZE)[AddrSrc];
...
...
}
}
// Second repetition of the loop nest: same loop bounds and AddrSrc formula;
// only the base displacements of the two loads (60 and 61) are different.
for (int I1 = 0; I1 < dstBlockHeight; I1++)
{
for (int I2 = 0; I2 < dstBlockWidth / VCOP_SIMD_WIDTH; I2++)
{
AddrSrc = I1 * srcBlockWidth * ELEMINSIZE + I2 * ELEMINSIZE * VCOP_SIMD_WIDTH;
// The two loads read from `in` displaced by 60 and 61 times ELEMINSIZE.
VInA = (in + 60 * ELEMINSIZE)[AddrSrc];
VInB = (in + 61 * ELEMINSIZE)[AddrSrc];
...
...
}
}
// Third repetition of the loop nest: same loop bounds and AddrSrc formula;
// only the base displacements of the two loads (75 and 76) are different.
for (int I1 = 0; I1 < dstBlockHeight; I1++)
{
for (int I2 = 0; I2 < dstBlockWidth / VCOP_SIMD_WIDTH; I2++)
{
AddrSrc = I1 * srcBlockWidth * ELEMINSIZE + I2 * ELEMINSIZE * VCOP_SIMD_WIDTH;
// The two loads read from `in` displaced by 75 and 76 times ELEMINSIZE.
VInA = (in + 75 * ELEMINSIZE)[AddrSrc];
VInB = (in + 76 * ELEMINSIZE)[AddrSrc];
...
...
}
}
The above loop nest may be repeated any number of times; the repetition count is calculated at runtime.
Expected snippet:
// Sketch of a generalized kernel: wraps the loop nest in a foreach over I0
// so that it is repeated once per object instead of being written out by hand.
// NOTE(review): dstBlockHeight, dstBlockWidth, srcBlockWidth and AddrSrc are
// used below but are not declared or passed as parameters in this sketch.
void vcop_kernel_sample
(
__vptr_uint8_arr in,
__vptr_uint16_arr out,
// NOTE(review): the comma after the last parameter is followed directly by ')'
// and will likely be rejected by the compiler.
unsigned int num_objects,
)
{
// NOTE(review): Vin1/Vin2/Vin3 are declared here, but the visible loads below
// assign to VInA/VInB.
__vector Vin1, Vin2, Vin3;
// Presumably repeats the body num_objects times with 5 as the upper bound --
// TODO confirm the meaning of the third foreach argument.
foreach (I0, num_objects, 5)
{
for (int I1 = 0; I1 < dstBlockHeight; I1++)
{
for (int I2 = 0; I2 < dstBlockWidth / VCOP_SIMD_WIDTH; I2++)
{
AddrSrc = I1 * srcBlockWidth * ELEMINSIZE + I2 * ELEMINSIZE * VCOP_SIMD_WIDTH;
// The visible loads do not reference I0, so every foreach repetition reads
// from the same displacements (5 and 6).
VInA = (in + 5 * ELEMINSIZE)[AddrSrc]; // LOAD
VInB = (in + 6 * ELEMINSIZE)[AddrSrc];
...
...
}
}
}
}
Here I am trying to generalize the VCOP kernel code by using a foreach loop.
The problem with the above approach is that the load operations must read from different positions on each iteration (e.g. offsets 5 and 6 on the first iteration, 60 and 61 on the second iteration, and so on), whereas the foreach body above always uses the same offsets.
Is there any way to use an array of offsets (indexed by the foreach iteration) in the load operations? Please suggest a solution for the above scenario.