// 4 element accumulation for interleaved 2 chans static int inline accumInter2Ch4Elements(const signed short *p_rowIn, const signed char *p_coef, int *restrict p_conOut1) { int convOut, convOut1; long long in0, out0, out1, coef0; long long *restrict p_srcA = (long long*)p_rowIn; long long *restrict p_srcB = (long long*)p_coef; in0 = _amem8(p_srcA); coef0 = _amem8(p_srcB); out0 = _ddotp4(_loll(in0), _loll(coef0)); out1 = _ddotp4(_hill(in0), _hill(coef0)); convOut = _loll(out0) + _loll(out1); convOut1 = _hill(out0) + _hill(out1); *p_conOut1 = convOut1; return convOut; } // to do the convolution operations without padding, the padding should be done before the convolution void conv2D_inner_loop(const signed short *restrict p_freqTimeChan, const unsigned int inCols, const signed char *restrict p_coef, const unsigned int freqKernelSize, signed short *restrict p_out, const unsigned int freqStride) { unsigned int inFreq, outFreq = 0; int convOut, convOut1, convOut2, convOut3; int convOutCh1, convOut1Ch1, convOut2Ch1, convOut3Ch1; const signed short *restrict p_rowIn, *restrict p_rowIn1, *restrict p_rowIn2, *restrict p_rowIn3; const unsigned int out_shift = 15; convOut1 = convOut2 = convOut3 = convOut = 0; convOut1Ch1 = convOut2Ch1 = convOut3Ch1 = convOutCh1 = 0; p_rowIn = p_freqTimeChan + inCols * outFreq; p_rowIn1 = p_freqTimeChan + inCols * (outFreq + 1 * freqStride); p_rowIn2 = p_freqTimeChan + inCols * (outFreq + 2 * freqStride); p_rowIn3 = p_freqTimeChan + inCols * (outFreq + 3 * freqStride); for (inFreq = 0; inFreq < freqKernelSize * inCols; inFreq+=4) { int cvOutCh1, cvOut1Ch1, cvOut2Ch1, cvOut3Ch1; convOut += accumInter2Ch4Elements(p_rowIn, p_coef, &cvOutCh1); convOut1 += accumInter2Ch4Elements(p_rowIn1, p_coef, &cvOut1Ch1); convOut2 += accumInter2Ch4Elements(p_rowIn2, p_coef, &cvOut2Ch1); convOut3 += accumInter2Ch4Elements(p_rowIn3, p_coef, &cvOut3Ch1); convOutCh1 = convOutCh1 + cvOutCh1; convOut1Ch1 = convOut1Ch1 + cvOut1Ch1; convOut2Ch1 = convOut2Ch1 + cvOut2Ch1; convOut3Ch1 = convOut3Ch1 + cvOut3Ch1; p_coef+=8; p_rowIn+=4; p_rowIn1+=4; p_rowIn2+=4; p_rowIn3+=4; } _amem4(&p_out[0]) = _spack2( (convOutCh1 >> out_shift), (convOut >> out_shift)); _amem4(&p_out[1]) = _spack2( (convOut1Ch1 >> out_shift), (convOut1 >> out_shift)); _amem4(&p_out[2]) = _spack2( (convOut2Ch1 >> out_shift), (convOut2 >> out_shift)); _amem4(&p_out[2]) = _spack2( (convOut3Ch1 >> out_shift), (convOut3 >> out_shift)); }