static void inline _smpy2_hi_lo (int src1, int src2, int *restrict out_hi, int *restrict out_lo) { long long out = _smpy2ll(src1, src2); *out_hi = _hill(out); *out_lo = _loll(out); } void ym_mult_q15(short *restrict pSrcA, short *restrict pSrcB, short *restrict pDst, unsigned int blockSize) { unsigned int blkCnt; /* loop counters */ int inA1, inA2, inB1, inB2; /* temporary input variables */ long long inA, inB; long long *restrict p_srcA = (long long*)pSrcA; long long *restrict p_srcB = (long long*)pSrcB; long long *restrict p_dst = (long long*)pDst; _nassert(((int)pSrcA & 0x07) == 0); _nassert(((int)pSrcB & 0x07) == 0); _nassert(((int)pDst & 0x07) == 0); _nassert((int)blockSize > 0); _nassert((int)(blockSize & 7) == 0); /* loop Unrolling */ blkCnt = blockSize >> 3U; while (blkCnt > 0U) { int out_lo1, out_hi1, out_hi2, out_lo2; /* read two samples at a time from sourceA */ inA = _amem8(p_srcA); p_srcA++; inA1 = _loll(inA); inA2 = _hill(inA); /* read two samples at a time from sourceB */ inB = _amem8(p_srcB); p_srcB++; inB1 = _loll(inB); inB2 = _hill(inB); /* multiply mul = sourceA * sourceB */ _smpy2_hi_lo(inA1, inB1, &out_hi1, &out_lo1); _smpy2_hi_lo(inA2, inB2, &out_hi2, &out_lo2); /* store the result */ _amem8(p_dst) = _itoll(_packh2(out_hi2, out_lo2), _packh2(out_hi1, out_lo1)); p_dst++; inA = _amem8(p_srcA); p_srcA++; inA1 = _loll(inA); inA2 = _hill(inA); /* read two samples at a time from sourceB */ inB = _amem8(p_srcB); p_srcB++; inB1 = _loll(inB); inB2 = _hill(inB); /* multiply mul = sourceA * sourceB */ _smpy2_hi_lo(inA1, inB1, &out_hi1, &out_lo1); _smpy2_hi_lo(inA2, inB2, &out_hi2, &out_lo2); /* store the result */ _amem8(p_dst) = _itoll(_packh2(out_hi2, out_lo2), _packh2(out_hi1, out_lo1)); p_dst++; /* Decrement the blockSize loop counter */ blkCnt--; } }