I have written some code for the MMA to simultaneously calculate the per-channel sum and sum of squares of an input image, as an exercise in learning the MMA.
Everything appears to work until the input data contains negative values, at which point the sum-of-squares output unexpectedly comes out negative - which should be impossible.
Can you help me understand what's going wrong?
Here is my initialisation code:
void mmaSumSumSqInit(const TIDL_bufParams3D_t *srcAddr, t_mmaSumSqConfig *config) { const uint32_t matrixColumns = __MMA_B_COLS(sizeof(int8_t)); const uint32_t matrixRowsPerRow = DIV_ROUND_UP(srcAddr->dim_x, matrixColumns); const uint32_t transfersPerRow = sizeof(int32_t) / sizeof(int8_t); config->numChannels = (uint32_t)srcAddr->dim_z; config->matrixRowsPerChannel = (uint32_t)srcAddr->dim_y * matrixRowsPerRow; config->seTemplate = __gen_SE_TEMPLATE_v1(); config->seTemplate.ELETYPE = __SE_ELETYPE_8BIT; config->seTemplate.VECLEN = c7x::se_veclen_from_traits<matrixColumns>::value; config->seTemplate.DIMFMT = __SE_DIMFMT_3D; config->seTemplate.ICNT0 = srcAddr->dim_x; config->seTemplate.ICNT1 = srcAddr->dim_y; config->seTemplate.ICNT2 = srcAddr->dim_z; config->seTemplate.DIM1 = srcAddr->stride_y / sizeof(int8_t); config->seTemplate.DIM2 = srcAddr->stride_z / sizeof(int8_t); config->mmaConfig = __gen_HWA_CONFIG_REG_v1(); config->mmaConfig.A_ATYPE = __MMA_A_CONFIG_ATYPE_INT8; config->mmaConfig.B_BTYPE = __MMA_B_CONFIG_SIZE8; config->mmaConfig.B_ORDER = __MMA_B_CONFIG_COL; // Load B down columns instead of across rows config->mmaConfig.B_BSWPER = config->matrixRowsPerChannel; // Switch B to load between channels config->mmaConfig.B_BRSTPER = 1; // Always load first column of B only config->mmaConfig.C_ATYPE = __MMA_C_CONFIG_ATYPE_SA; config->mmaConfig.C_BTYPE = __MMA_C_CONFIG_BTYPE_INT8; config->mmaConfig.C_OPERATION0 = __MMA_C_CONFIG_MUL; // C = AxB config->mmaConfig.C_OP0PER = 2; // Do this once per channel for sum and sumSq config->mmaConfig.C_OPERATION1 = __MMA_C_CONFIG_MULPLUS; // C += AxB config->mmaConfig.C_OP1PER = 2 * (config->matrixRowsPerChannel - 1); // Do this for the remaining ops per channel config->mmaConfig.C_BSWPER = 2 * config->matrixRowsPerChannel; // Switch B to read between channels config->mmaConfig.C_CRSWPER = 2 * config->matrixRowsPerChannel; // Switch C to read between channels config->mmaConfig.C_CWSWPER = 2 * config->matrixRowsPerChannel; // 
Switch C to write between channels config->mmaConfig.C_CRRSTPER = 2; // Alternate C read row between sum and sumSq config->mmaConfig.C_CWRSTPER = 2; // Alternate C write row between sum and sumSq config->mmaConfig.X_XTYPE = __MMA_X_CONFIG_XTYPE_INT32; config->mmaConfig.X_CTYPE = __MMA_X_CONFIG_CTYPE_INT32; config->mmaConfig.X_CSWPER = 2 * transfersPerRow; // Switch C to transfer between channels config->mmaConfig.X_CRRSTPER = 2 * transfersPerRow; // Alternate C transfer row between sum and sumSq config->mmaOffset = __gen_HWA_OFFSET_REG(); }
And here is my execution code:
void mmaSumSumSqExec(const t_mmaSumSqConfig *config, const int8_t *data, int32_t *sums, int32_t *sumSquares) { __SE0_OPEN((void *)data, config->seTemplate); __HWAOPEN(config->mmaConfig, config->mmaOffset, __MMA_OPEN_FSM_RESET); for(uint32_t ch = 0; ch < config->numChannels; ch++) { for(uint32_t i = 0; i < config->matrixRowsPerChannel; i++) { // A (row) = [1,1,...,1,1], B (col) = [in0,in1,...,in62,in63] __HWALDAB(1, __SE0(uchar64)); // C row 0 col 0 = Sum(1 x in0 + ... + 1 x in63) __HWAOP(__MMA_A_LDA); // A (row) = [in0,in1,...,in62,in63] __HWALDA(__SE0ADV(uchar64)); // C row 1 col 0 = Sum(in0 x in0 + ... + in63 x in63) __HWAOP(__MMA_A_LDA); } // Transfer C row 0 cols 0-15 to X __HWAXFER(__MMA_XFER_SRC_C); int16 sum = __as_int16(__HWARCV(0)); // TODO: There might be a way to use offset config to avoid having // to transfer remainder of row and discard it, but if we don't for now // the subsequent channel (on this C bank) will start at the wrong column // Discard C row 0 cols 16-63 __HWAXFER(__MMA_XFER_SRC_C); (void)__HWARCV(0); __HWAXFER(__MMA_XFER_SRC_C); (void)__HWARCV(0); __HWAXFER(__MMA_XFER_SRC_C); (void)__HWARCV(0); // Transfer C row 1 cols 0-15 to X __HWAXFER(__MMA_XFER_SRC_C); int16 sumSq = __as_int16(__HWARCV(0)); // Discard C row 1 cols 16-63 __HWAXFER(__MMA_XFER_SRC_C); (void)__HWARCV(0); __HWAXFER(__MMA_XFER_SRC_C); (void)__HWARCV(0); __HWAXFER(__MMA_XFER_SRC_C); (void)__HWARCV(0); sums[ch] = sum.s[0]; sumSquares[ch] = sumSq.s[0]; } __HWACLOSE(0); __SE0_CLOSE(); }
With positive input values, everything works correctly:
[ 29.829513] logs[933]: [C7x_1 ] 43.745416 s: Input row: 90,90,90,90,90,90,90,90,90,89,89,89,89,89,89,89,89,89,89,89,89,88,89,89,89,89,90,89,89,90,89,89,89,88,88,89,89,89,89,89,89,89,89,88,88,88,8, [ 29.829723] logs[933]: [C7x_1 ] 43.745518 s: Status: 0x00010000000100000200000002000000020000000200000000020000000200000000000000000800000008000000000000000000000000000000000000000000 [ 29.829830] logs[933]: [C7x_1 ] 43.745606 s: Status: 0x00010000000104000200000002000000020000000200000000020000000200000000000000000800000008000000000000000000000000000000000000000000 [ 29.829898] logs[933]: [C7x_1 ] 43.745694 s: Status: 0x00010000000104000100000001000000010000000100000000010000000100000000000101000800000008000000000000000000000000000000000000000000 [ 29.829988] logs[933]: [C7x_1 ] 43.745782 s: Status: 0x00010000000104000100000001000000010000000100000000010000000100000000000101000800000008000000000000000000000000000000000000000000 [ 29.830062] logs[933]: [C7x_1 ] 43.745869 s: Status: 0x00010000000104000100000002000000020000000200000000020000000200000000380000000800000008000000000000000000000000000000000000000000 [ 29.830137] logs[933]: [C7x_1 ] 43.745957 s: Status: 0x00010000000104000100000002000000020000000200000000020000000200000000380000000700000007000000000100000000000000000000000000000000 [ 29.830201] logs[933]: [C7x_1 ] 43.746049 s: Status: 0x00010000000104000100000002000000020000000200000000020000000200000000380000000300000003000000000500000000000000000000000000000000 [ 29.830268] logs[933]: [C7x_1 ] 43.746086 s: MMA Ch0 S/SumSq: 5682 (1632)/504494 (7b2ae) [ 29.830331] logs[933]: [C7x_1 ] 43.746107 s: Ref Ch0 S/SumSq: 5682 (1632)/504494 (7b2ae)
But with negative input values, we get a negative sum of squares(!):
[ 29.799393] logs[933]: [C7x_1 ] 43.712012 s: Input row: 0,-128,0,127,-128,0,-128,0,0,-128,-128,-128,127,-128,0,127,-128,0,-128,-128,0,-128,-128,0,-128,-128,-128,0,-128,0,-128,-128,-128,-128,0,0,-, [ 29.799576] logs[933]: [C7x_1 ] 43.712120 s: Status: 0x00010000000100000200000002000000020000000200000000020000000200000000000000000800000008000000000000000000000000000000000000000000 [ 29.799694] logs[933]: [C7x_1 ] 43.712210 s: Status: 0x00010000000104000200000002000000020000000200000000020000000200000000000000000800000008000000000000000000000000000000000000000000 [ 29.799765] logs[933]: [C7x_1 ] 43.712300 s: Status: 0x00010000000104000100000001000000010000000100000000010000000100000000000101000800000008000000000000000000000000000000000000000000 [ 29.799833] logs[933]: [C7x_1 ] 43.712388 s: Status: 0x00010000000104000100000001000000010000000100000000010000000100000000000101000800000008000000000000000000000000000000000000000000 [ 29.799900] logs[933]: [C7x_1 ] 43.712476 s: Status: 0x00010000000104000100000002000000020000000200000000020000000200000000380000000800000008000000000000000000000000000000000000000000 [ 29.799980] logs[933]: [C7x_1 ] 43.712564 s: Status: 0x00010000000104000100000002000000020000000200000000020000000200000000380000000700000007000000000100000000000000000000000000000000 [ 29.800083] logs[933]: [C7x_1 ] 43.712656 s: Status: 0x00010000000104000100000002000000020000000200000000020000000200000000380000000300000003000000000500000000000000000000000000000000 [ 29.800149] logs[933]: [C7x_1 ] 43.712695 s: MMA Ch0 S/SumSq: -3591 (fffff1f9)/-460537 (fff8f907) [ 29.800213] logs[933]: [C7x_1 ] 43.712718 s: Ref Ch0 S/SumSq: -3591 (fffffffffffff1f9)/686343 (a7907)
Note, however, that the HWASTATUS output is identical in both cases.
Furthermore, on host emulation, the MMA gives the correct result for the same input:
Status: 0x00010000000100000200000002000000020000000200000000020000000200000000000000000800000008000000000000000000000000000000000000000000 Status: 0x00010000000104000200000002000000020000000200000000020000000200000000000000000800000008000000000000000000000000000000000000000000 Status: 0x00010000000104000100000001000000010000000100000000010000000100000000000101000800000008000000000000000000000000000000000000000000 Status: 0x00010000000104000100000001000000010000000100000000010000000100000000000101000800000008000000000000000000000000000000000000000000 Status: 0x00010000000104000000000002000000020000000200000000020000000200000000380000000800000008000000000000000000000000000000000000000000 Status: 0x00010000000104000000000002000000020000000200000000020000000200000000380000000700000007000000000100000000000000000000000000000000 Status: 0x00010000000104000000000002000000020000000200000000020000000200000000380000000300000003000000000500000000000000000000000000000000 MMA Ch 0 S/SumSq: -5369 (ffffeb07)/686343 (a7907) Ref Ch0 S/SumSq: -5369 (ffffffffffffeb07)/686343 (a7907)
Can you please help explain what's going on here?
Many Thanks,
Ross