/*
 * One accumulation step for the two-channel interleaved convolution.
 *
 * Reads 8 bytes from p_rowIn (four 16-bit samples) and 8 bytes from p_coef
 * (eight 8-bit coefficients) using aligned 64-bit loads, so both pointers
 * must be 8-byte aligned. _ddotp4 is applied once to the low 32-bit halves
 * and once to the high 32-bit halves of the two loaded values; the low
 * result words are summed into the return value and the high result words
 * are summed into *p_conOut1.
 *
 * NOTE(review): per TI's intrinsic documentation _ddotp4 multiplies two
 * packed 16-bit values by four packed 8-bit values and yields two 32-bit
 * dot products - confirm against the compiler guide for the target device.
 *
 * p_rowIn    input samples, 8 bytes are read
 * p_coef     coefficients, 8 bytes are read
 * p_conOut1  receives the sum of the high result words
 * returns    the sum of the low result words
 */
static int inline accumInter2Ch4Elements(const signed short *p_rowIn, const signed char *p_coef, int *restrict p_conOut1)
{
    long long *restrict p_samples = (long long*)p_rowIn;
    long long *restrict p_weights = (long long*)p_coef;

    /* Aligned 64-bit loads of the sample group and the coefficient group. */
    long long samples = _amem8(p_samples);
    long long weights = _amem8(p_weights);

    /* One double dot product per 32-bit half of the loaded data. */
    long long dotLow = _ddotp4(_loll(samples), _loll(weights));
    long long dotHigh = _ddotp4(_hill(samples), _hill(weights));

    *p_conOut1 = _hill(dotLow) + _hill(dotHigh);
    return _loll(dotLow) + _loll(dotHigh);
}

/*
 * Convolution inner loop producing four output positions, each with two
 * interleaved channels. No padding is applied here: the input must already
 * be padded by the caller before this function is invoked.
 *
 * The four positions start at rows 0, 1, 2 and 3 times freqStride of the
 * input (each row being inCols samples long). For every position the loop
 * walks freqKernelSize * inCols samples in steps of 4, consuming 8
 * coefficients per step; all four positions share the same coefficients.
 *
 * p_freqTimeChan  input samples; rows are inCols elements apart. The helper
 *                 uses aligned 64-bit loads, so every row start must be
 *                 8-byte aligned.
 * inCols          number of 16-bit elements per input row
 * p_coef          coefficients, 8 bytes consumed per loop step (8-byte
 *                 aligned); 2 * freqKernelSize * inCols bytes are read
 * freqKernelSize  kernel extent in rows; freqKernelSize * inCols is
 *                 expected to be a multiple of 4
 * p_out           output buffer. Must be 4-byte aligned and hold 8 shorts:
 *                 four packed pairs, one pair per output position.
 * freqStride      row distance between successive output positions
 *
 * Each accumulator is shifted right by 15 and the two channels of a
 * position are packed into one 32-bit word by _spack2 (first argument in
 * the upper halfword, second in the lower halfword, with 16-bit saturation
 * per TI's intrinsic documentation).
 */
void conv2D_inner_loop(const signed short *restrict p_freqTimeChan, const unsigned int inCols, const signed char *restrict p_coef,
    const unsigned int freqKernelSize, signed short *restrict p_out, const unsigned int freqStride)
{
    const unsigned int out_shift = 15;
    const unsigned int rowStep = inCols * freqStride;       /* elements between output positions */
    const unsigned int kernelElems = freqKernelSize * inCols; /* samples accumulated per position */
    unsigned int inFreq;
    int convOut = 0, convOut1 = 0, convOut2 = 0, convOut3 = 0;
    int convOutCh1 = 0, convOut1Ch1 = 0, convOut2Ch1 = 0, convOut3Ch1 = 0;
    const signed short *restrict p_rowIn = p_freqTimeChan;
    const signed short *restrict p_rowIn1 = p_freqTimeChan + 1 * rowStep;
    const signed short *restrict p_rowIn2 = p_freqTimeChan + 2 * rowStep;
    const signed short *restrict p_rowIn3 = p_freqTimeChan + 3 * rowStep;

    for (inFreq = 0; inFreq < kernelElems; inFreq += 4)
    {
        int cvOutCh1, cvOut1Ch1, cvOut2Ch1, cvOut3Ch1;

        /* The same 8 coefficients are applied to all four output positions. */
        convOut  += accumInter2Ch4Elements(p_rowIn,  p_coef, &cvOutCh1);
        convOut1 += accumInter2Ch4Elements(p_rowIn1, p_coef, &cvOut1Ch1);
        convOut2 += accumInter2Ch4Elements(p_rowIn2, p_coef, &cvOut2Ch1);
        convOut3 += accumInter2Ch4Elements(p_rowIn3, p_coef, &cvOut3Ch1);
        convOutCh1  += cvOutCh1;
        convOut1Ch1 += cvOut1Ch1;
        convOut2Ch1 += cvOut2Ch1;
        convOut3Ch1 += cvOut3Ch1;

        /* 4 samples (8 bytes) and 8 coefficients (8 bytes) consumed per step. */
        p_coef += 8;
        p_rowIn += 4;
        p_rowIn1 += 4;
        p_rowIn2 += 4;
        p_rowIn3 += 4;
    }

    /*
     * Every packed result is a 32-bit word, i.e. two shorts, written with an
     * aligned 4-byte store. Successive results therefore have to be placed
     * two short elements apart; stepping the index by one would overlap the
     * stores and break the 4-byte alignment the store requires.
     */
    _amem4(&p_out[0]) = _spack2((convOutCh1 >> out_shift), (convOut >> out_shift));
    _amem4(&p_out[2]) = _spack2((convOut1Ch1 >> out_shift), (convOut1 >> out_shift));
    _amem4(&p_out[4]) = _spack2((convOut2Ch1 >> out_shift), (convOut2 >> out_shift));
    _amem4(&p_out[6]) = _spack2((convOut3Ch1 >> out_shift), (convOut3 >> out_shift));
}