Part Number: PROCESSOR-SDK-J721S2
Other Parts Discussed in Thread: TDA4VH
PROCESSOR-SDK-RTOS-J721S2 Version: 09.00.00.02
Hi,
We implemented a transpose function on the DSP core using the C7x intrinsics as shown below:
void Transpose(
uint32_t* restrict inPtr,
uint32_t* restrict outPtr,
uint32_t inRows,
uint32_t inCols)
{
uint32_t rows;
uint32_t remainRows;
uint32_t blocks;
uint32_t maxVecLen = c7x::element_count_of<uint16>::value;
if (inRows <= maxVecLen)
{
rows = inRows;
blocks = 1;
remainRows = 0;
}
else
{
rows = maxVecLen;
blocks = inRows / maxVecLen;
remainRows = inRows % maxVecLen;
}
__SE_TEMPLATE_v1 seTemplate = __gen_SE_TEMPLATE_v1();
seTemplate.ELETYPE = __SE_ELETYPE_32BIT;
seTemplate.VECLEN = __SE_VECLEN_16ELEMS;
seTemplate.TRANSPOSE = __SE_TRANSPOSE_32BIT;
seTemplate.DIMFMT = __SE_DIMFMT_3D;
seTemplate.ICNT0 = inCols;
seTemplate.ICNT1 = rows;
seTemplate.ICNT2 = blocks;
seTemplate.DIM1 = inCols;
seTemplate.DIM2 = inCols * maxVecLen;
__SE0_OPEN((void *)inPtr, seTemplate);
__vpred vPred = __mask_int((uint32_t)rows);
for(int32_t blk = 0; blk < blocks; blk++)
{
for(int32_t col = 0; col < inCols; col++)
{
uint32_t outPtrOffset = (col * inRows) + (blk * maxVecLen);
uint16 vIn = __SE0ADV(uint16);
__vstore_pred(vPred, (uint16*)&outPtr[outPtrOffset], vIn);
}
}
__SE0_CLOSE();
if (0 < remainRows)
{
seTemplate = __gen_SE_TEMPLATE_v1();
seTemplate.ELETYPE = __SE_ELETYPE_32BIT;
seTemplate.VECLEN = __SE_VECLEN_16ELEMS;
seTemplate.TRANSPOSE = __SE_TRANSPOSE_32BIT;
seTemplate.DIMFMT = __SE_DIMFMT_3D;
seTemplate.ICNT0 = inCols;
seTemplate.ICNT1 = remainRows;
seTemplate.ICNT2 = 1;
seTemplate.DIM1 = inCols;
seTemplate.DIM2 = 0;
__SE0_OPEN((void *)&inPtr[blocks * maxVecLen * inCols], seTemplate);
vPred = __mask_int((uint32_t)remainRows);
for(int32_t col = 0; col < inCols; col++)
{
uint32_t outPtrOffset = (col * inRows) + (blocks * maxVecLen);
uint16 vIn = __SE0ADV(uint16);
__vstore_pred(vPred, (uint16*)&outPtr[outPtrOffset], vIn);
}
__SE0_CLOSE();
}
}
This code was tested on version: 09.00.00.02 for both:
- PROCESSOR-SDK-RTOS-J721S2
- PROCESSOR-SDK-RTOS-J721E
When executed in a loop on the TIVX_CPU_ID_DSP_C7_1 core, with inPtr and outPtr located in L2 SRAM, this function works correctly on J721E; however, it randomly hangs on J721S2.
Is there any difference between the two products, or a bug within our code, that could lead to this behaviour?
Regards.