Hello all.
I am optimizing the following function in assembly; in the assembly version the height and width are fixed at 2. I have compared the cycle counts: the assembly function takes significantly fewer cycles than the C function when called for a 2x2 block. The cycle counts were taken in CCS 3.3 using the clock in the Profile menu. The assembly code targets the DaVinci DM6467.
/*
 * Chroma motion compensation with 1/8-sample bilinear interpolation.
 *
 * The low three bits of mvx/mvy select the fractional position and the
 * remaining bits select the integer sample offset into src. Every output
 * sample is a weighted sum of a 2x2 source neighbourhood; the four weights
 * always add up to 64, hence the "+ 32" rounding term and the ">> 6".
 *
 * Each output row reads i_width + 1 samples from two consecutive source rows.
 */
void mc_chroma( unsigned char *src , int32_t i_src_stride , unsigned char *dst , int32_t i_dst_stride, int32_t mvx , int32_t mvy , int32_t i_width , int32_t i_height )
{
    /* Fractional part of the motion vector, in eighths of a sample. */
    const int32_t frac_x = mvx & 0x07;
    const int32_t frac_y = mvy & 0x07;

    /* Bilinear weights: top-left, top-right, bottom-left, bottom-right. */
    const int32_t w_tl = (8 - frac_x) * (8 - frac_y);
    const int32_t w_tr = frac_x * (8 - frac_y);
    const int32_t w_bl = (8 - frac_x) * frac_y;
    const int32_t w_br = frac_x * frac_y;

    /* Upper source row of the current output row, after the integer offset. */
    const unsigned char *top = src + (mvy >> 3) * i_src_stride + (mvx >> 3);
    int32_t row = 0;

    while( row < i_height )
    {
        const unsigned char *bottom = top + i_src_stride;
        int32_t col;

        for( col = 0; col < i_width; ++col )
        {
            const int32_t acc = w_tl * top[col]    + w_tr * top[col + 1]
                              + w_bl * bottom[col] + w_br * bottom[col + 1];
            dst[col] = (unsigned char)( ( acc + 32 ) >> 6 );
        }

        dst += i_dst_stride;
        top = bottom;   /* the lower row becomes the upper row of the next line */
        ++row;
    }
}
The assembly for the function is as follows:
;;********************************************************************
;; _asm_mc_chroma
;;
;; Hand-scheduled version of the C routine mc_chroma for rows that are
;; two pixels wide: every output row is written with one halfword store.
;;
;; Registers on entry, as implied by the in-line comments and by the
;; parameter order of the C prototype (TODO confirm against the ABI used):
;;   A4  = src            B4  = i_src_stride
;;   A6  = dst            B6  = i_dst_stride
;;   A8  = mvx            B8  = mvy
;;   A10 = i_width        B10 = i_height (copied to A0 as the loop counter)
;;   B3  = return address (branched to when the loop counter reaches 0)
;;
;; Working registers:
;;   A31/B31/A30/B30 = cA/cB/cC/cD replicated into all four bytes
;;   A17:A16, B17:B16, A19:A18, B19:B18, A27:A26 = source rows 1..5
;;   A3  = source stride * 2 during the loads, later 0x00200020 (+32 twice)
;;   A9  = integer source offset first, later i_dst_stride >> 1 (STH index)
;;
;; Notes:
;;   * Five 8-byte source rows are loaded up front; the loop itself issues
;;     no loads. NOTE(review): confirm behaviour for heights other than 2.
;;   * dst is written with STH, so it is presumably expected to be halfword
;;     aligned with an even i_dst_stride - verify against the callers.
;;   * The byte extraction by ">> 8" presumably relies on little-endian
;;     data layout - TODO confirm.
;;   * Instruction order and packet boundaries are part of the schedule
;;     (multiply, load and branch results are consumed a fixed number of
;;     cycles after issue); do not reorder without re-checking latencies.
;;
;; Changes relative to the previous revision:
;;   * A31 (d8x) was read but never written and the first execute packet
;;     started with "||"; AND.D1 now computes d8x in that packet.
;;   * Added the missing X cross-path suffix on three instructions that
;;     read a register from the opposite register file.
;;********************************************************************
        .global _asm_mc_chroma
        .text
_asm_mc_chroma:
        AND.D1      A8,0x07,A31             ;; d8x = mvx & 0x07 (was missing)
||      AND.L2      B8,0x07,B31             ;; d8y = mvy & 0x07
||      SHR.S2      B8,3,B9                 ;; mvy >> 3
||      SHR.S1      A8,3,A9                 ;; mvx >> 3
||      CMPGT.L1    A10,4,A0                ;; A0 = (i_width > 4); overwritten below before use
||      MVK.D2      0,B2                    ;; B2 = 0; overwritten below before use

        SUB.S1      8,A31,A30               ;; 8 - d8x
||      SUB.S2      8,B31,B30               ;; 8 - d8y
||      MPY.M2      B9,B4,B9                ;; (mvy >> 3) * i_src_stride
||      MV.D1X      B4,A3                   ;; A-side copy of the source stride
||[!A0] CMPEQ.L2X   A10,4,B2                ;; B2 = (i_width == 4); not read in the visible code

        MPY.M2X     A31,B30,B29             ;; cB = d8x * (8 - d8y)
||      MPY.M1X     A30,B30,A29             ;; cA = (8 - d8x) * (8 - d8y)

        ADD.L1X     B9,A9,A9                ;; (mvy >> 3) * i_src_stride + (mvx >> 3)
||      ADD.D1      A3,A3,A3                ;; A3 = 2 * source stride
||      MPY.M2X     A31,B31,B28             ;; cD = d8x * d8y

        MPY.M1X     A30,B31,A28             ;; cC = (8 - d8x) * d8y
||      ADD.D1      A4,A9,A4                ;; src += integer offset
||      ADD.L1      A4,A9,A2                ;; copy of the adjusted source pointer (A2 is zeroed later)
||      MVKL.S1     0x01010101,A24          ;; low half of the byte-replication constant
||      MVKL.S2     0x01010101,B24
||      MV.L2       B4,B2                   ;; B2 = source stride; not read in the visible code

        MVKH.S1     0x01010101,A24          ;; A24 = 0x01010101
||      MVKH.S2     0x01010101,B24          ;; B24 = 0x01010101
||      MV.L1       A6,A1                   ;; copy of dst; not read in the visible code
||      LDNDW.D1    *A4++(A3),A17:A16       ;; load row 1 (8 bytes), advance two rows
||      ADD.L2X     A4,B4,B5                ;; B5 = src + stride -> row 2
||      ADD.D2      B4,B4,B4                ;; B4 = 2 * source stride

        MV.L1X      B10,A0                  ;; A0 = i_height, the loop counter
||      LDNDW.D2    *B5++(B4),B17:B16       ;; load row 2 (8 bytes)
||      MPY32.M1    A24,A29,A31             ;; A31 = cA,cA,cA,cA
||      MPY32.M2    B24,B29,B31             ;; B31 = cB,cB,cB,cB

        LDNDW.D1    *A4++(A3),A19:A18       ;; load row 3 (8 bytes)
||      MPY32.M1    A24,A28,A30             ;; A30 = cC,cC,cC,cC
||      MPY32.M2    B24,B28,B30             ;; B30 = cD,cD,cD,cD

        LDNDW.D2    *B5++(B4),B19:B18       ;; load row 4 (8 bytes)
||      SHR.S1X     B6,2,A5                 ;; i_dst_stride >> 2; not read in the visible code

;;;;;;;;;;;;;;;;;;;;;;;; 2x2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        LDNDW.D1    *A4++(A3),A27:A26       ;; load row 5 (8 bytes)

        SHR.S1      A16,8,A17               ;; row 1 shifted one byte: the x+1 samples

CHROMA_W2:
        MPYUS4.M1   A16,A31,A21:A20         ;;1 row1[x]   * cA
||      MPYUS4.M2X  A17,B31,B21:B20         ;;1 row1[x+1] * cB
||      MVKL.S1     0x00200020,A3           ;; low half of the packed rounding term 32:32
||      SHR.S2      B16,8,B17               ;; row 2 shifted one byte: the x+1 samples

        MPYUS4.M1X  B16,A30,A23:A22         ;;1 row2[x]   * cC
||      MPYUS4.M2   B17,B30,B23:B22         ;;1 row2[x+1] * cD
||      MVKH.S1     0x00200020,A3           ;; A3 = 0x00200020 (the stride in A3 is no longer needed)

        MPYUS4.M1X  B16,A31,A21:A20         ;;2 row2[x]   * cA
||      MPYUS4.M2   B17,B31,B21:B20         ;;2 row2[x+1] * cB
||      SHR.S1      A18,8,A19               ;; row 3 shifted one byte: the x+1 samples

        MPYUS4.M1   A18,A30,A23:A22         ;;2 row3[x]   * cC
||      MPYUS4.M2X  A19,B30,B23:B22         ;;2 row3[x+1] * cD

        ADD2.D1X    A20,B20,A24             ;;1 cA and cB products summed
||      MVK.S1      0,A2                    ;; A2 = 0, the upper source of SPACKU4

        ADD2.D2X    A22,B22,B24             ;;1 cC and cD products summed
||      ADD2.D1     A24,A3,A24              ;;1 add the rounding term 32
||      SHR.S1X     B6,1,A9                 ;; A9 = i_dst_stride >> 1: STH index in halfwords

LOOP_chroma_Hi4:
        ADD2.D1X    A24,B24,A24             ;;1 full weighted sum of the first output row
||      SHR.S1      A26,8,A27               ;; row 5 shifted one byte: the x+1 samples
||      SUB.L1      A0,2,A0                 ;; two output rows per pass
||      SHR.S2      B18,8,B19               ;; row 4 shifted one byte: the x+1 samples

        ADD2.L1     A20,A22,A28             ;;2 A-side products summed
||      ADD2.D2     B20,B22,B27             ;;2 B-side products summed
||      SHR2.S1     A24,6,A24               ;;1 >> 6
||      MPYUS4.M1   A18,A31,A21:A20         ;;3 row3[x]   * cA
||      MPYUS4.M2X  A19,B31,B21:B20         ;;3 row3[x+1] * cB
||[A0]  B.S2        LOOP_chroma_Hi4         ;; rows left: loop again after the packets below

        ADD2.D1     A28,A3,A28              ;;2 add the rounding term 32
||      SPACKU4.S1  A2,A24,A25              ;;1 saturate and pack the 16-bit results to bytes
||      MPYUS4.M1X  B18,A30,A23:A22         ;;3 row4[x]   * cC
||      MPYUS4.M2   B19,B30,B23:B22         ;;3 row4[x+1] * cD
||[!A0] B.S2        B3                      ;; no rows left: return after the packets below

        STH.D1      A25,*A6++[A9]           ;;1 store two pixels, dst += i_dst_stride
||      ADD2.L2X    A28,B27,B27             ;;2 full weighted sum of the second output row
||      MPYUS4.M1X  B18,A31,A21:A20         ;;4 row4[x]   * cA
||      MPYUS4.M2   B19,B31,B21:B20         ;;4 row4[x+1] * cB

        SHR2.S2     B27,6,B27               ;;2 >> 6
||      MPYUS4.M1   A26,A30,A23:A22         ;;4 row5[x]   * cC
||      MPYUS4.M2X  A27,B30,B23:B22         ;;4 row5[x+1] * cD

        SPACKU4.S1X A2,B27,A27              ;;2 saturate and pack the 16-bit results to bytes
||      ADD2.D2X    A20,B20,B25             ;;1 next pass: cA and cB products summed

        STH.D1      A27,*A6++[A9]           ;;2 store two pixels, dst += i_dst_stride
||      ADD2.D2X    A22,B22,B24             ;;1 next pass: cC and cD products summed
||      ADD2.L1X    B25,A3,A24              ;;1 next pass: add the rounding term 32

        NOP                                 ;; last cycle before the return branch takes effect
        .end