When compiling the code below with the -O3 option (compiler version 7.3.5), I still see 6 NOPs in the assembly code for the loop.
/*
 * For each of nofLoops iterations: read one 64-bit word from x1 and from x2
 * and two consecutive 64-bit words from multArr, run two _cmatmpy operations
 * against the 128-bit value built from (x2 word, x1 word), shift the four
 * 32-bit halves of the retained products with _sshvr by shrVal, pack them
 * pairwise with _spack2 and store the resulting 64-bit word to y.
 *
 * x1, x2   : inputs, advanced by one 64-bit word per iteration.
 * multArr  : input, advanced by two 64-bit words per iteration.
 * y        : output, advanced by one 64-bit word per iteration.
 * nofLoops : iteration count; the MUST_ITERATE pragma promises the compiler
 *            a count between 6 and 600 that is a multiple of 6.
 * shrVal   : shift amount handed unchanged to every _sshvr call.
 *
 * All memory accesses go through _amem8, so the buffers are presumably
 * expected to be 8-byte aligned -- TODO confirm against the callers.
 */
void function(long long unsigned * restrict x1,
long long unsigned * restrict x2,
long long unsigned * restrict multArr,
long long unsigned * restrict y,
int nofLoops,
char shrVal) {
    int iter;

    #pragma MUST_ITERATE(6, 600, 6)
    for (iter = 0; iter < nofLoops; ++iter) {
        /* Fetch this iteration's operands. */
        long long unsigned int in1 = _amem8(x1);
        long long unsigned int in2 = _amem8(x2);
        long long unsigned int coefA = _amem8(multArr);
        long long unsigned int coefB = _amem8(multArr + 1);

        /*
         * Two separate _cmatmpy calls with different first operands; only the
         * low 64 bits of the first result and the high 64 bits of the second
         * are kept.
         */
        long long unsigned int pairLo = _lo128(_cmatmpy(coefA, _llto128(in2, in1)));
        long long unsigned int pairHi = _hi128(_cmatmpy(coefB, _llto128(in2, in1)));

        /*
         * NOTE(review): in the accompanying listing each _sshvr shows up as
         * an SSHVR on .M1/.M2, the same units that issue CMATMPY, and .M is
         * the resource the pipeliner reports as its bound.
         */
        unsigned int sh0 = _sshvr(_loll(pairLo), shrVal);
        unsigned int sh1 = _sshvr(_hill(pairLo), shrVal);
        unsigned int sh2 = _sshvr(_loll(pairHi), shrVal);
        unsigned int sh3 = _sshvr(_hill(pairHi), shrVal);

        /* Pack the four shifted values into one 64-bit output word. */
        unsigned int packedLo = _spack2(sh1, sh0);
        unsigned int packedHi = _spack2(sh3, sh2);
        _amem8(y) = _itoll(packedHi, packedLo);

        /* Advance every stream for the next iteration. */
        x1 += 1;
        x2 += 1;
        multArr += 2;
        y += 1;
    }
}
Loop unrolling does not improve the cycle count. Can anyone help me improve the cycle count for this loop? The assembly code is below:
1423 ;*----------------------------------------------------------------------------*
1424 ;* SOFTWARE PIPELINE INFORMATION
1430 ;* Loop Unroll Multiple : 2x
1431 ;* Known Minimum Trip Count : 3
1432 ;* Known Maximum Trip Count : 300
1433 ;* Known Max Trip Count Factor : 3
1434 ;* Loop Carried Dependency Bound(^) : 0
1435 ;* Unpartitioned Resource Bound : 6
1436 ;* Partitioned Resource Bound(*) : 6
1437 ;* Resource Partition:
1438 ;* A-side B-side
1439 ;* .L units 0 0
1440 ;* .S units 2 2
1441 ;* .D units 5 5
1442 ;* .M units 6* 6*
1443 ;* .X cross paths 2 2
1444 ;* .T address paths 5 5
1445 ;* Long read paths 0 0
1446 ;* Long write paths 0 0
1447 ;* Logical ops (.LS) 6 6 (.L or .S unit)
1448 ;* Addition ops (.LSD) 0 0 (.L or .S or .D unit)
1449 ;* Bound(.L .S .LS) 4 4
1450 ;* Bound(.L .S .D .LS .LSD) 5 5
1451 ;*
1452 ;* Searching for software pipeline schedule at ...
1453 ;* ii = 6 Did not find schedule
1454 ;* ii = 7 Schedule found with 4 iterations in parallel
1455 ;* Done
1456 ;*
1457 ;* Loop will be splooped
1458 ;* Collapsed epilog stages : 0
1459 ;* Collapsed prolog stages : 0
1460 ;* Minimum required memory pad : 0 bytes
1461 ;*
1462 ;* For further improvement on this loop, try option -mh32
1463 ;*
1464 ;* Minimum safe trip count : 1 (after unrolling)
1465 ;*----------------------------------------------------------------------------*
1466 00000120 $C$L12: ; PIPED LOOP PROLOG
1468
1469 00000120 0303a001 SPLOOPD 7 ;28 ; (P)
1470 00000124 0d110059 || ADD .L1 8,A4,A26
1471 00000128 06e013a2 || MVC .S2X A24,ILC
1472
1473 ;** --------------------------------------------------------------------------*
1474 0000012c $C$L13: ; PIPED LOOP KERNEL
1478 0000012c 2ce7 SPMASK L1,L2
1479 00000130 0d911059 || ADD .L1X 8,B4,A27
1480 0000012e 4e47 || MV .L2 B4,B26
1481 00000134 0b685765 || LDDW .D1T1 *A26++(16),A23:A22 ; |213| (P) <0,0>
1482 00000138 006457e6 || LDDW .D2T2 *B25++(16),B1:B0 ; |213| (P) <0,0>
1483
1484 00000140 096857e7 LDDW .D2T2 *B26++(16),B19:B18 ; |215| (P) <0,1>
1485 00000144 0e6c5764 || LDDW .D1T1 *A27++(16),A29:A28 ; |215| (P) <0,1>
1487 00000148 2ce6 SPMASK L2
1488 0000014a 7dc7 || MV .L2X A3,B27
1489
1490 0000014c 2fe7 SPMASK L1,S1,L2,S2
1491 0000014e 9dc7 || MV .L2X A3,B28
1492 00000150 cdc6 || MV .L1 A3,A30
1493 00000152 0dae || ADD .S1 8,A3,A3
1494 00000154 0d800c52 || ADDK .S2 24,B27
1495
1496 00000158 000f0001 SPMASK L1,L2
1497 00000160 0ca01fd9 || MV .L1X B8,A25
1498 00000164 0c200fdb || MV .L2 B8,B24
1499 00000168 020c9765 || LDDW .D1T1 *A3++(32),A5:A4 ; |231| (P) <0,4>
1500 0000016c 046c97e6 || LDDW .D2T2 *B27++(32),B9:B8 ; |231| (P) <0,4>
1501
1502 00000170 00230001 SPMASK S2
1503 00000174 0e000852 || ADDK .S2 16,B28
1504
1505 00000178 0a789765 LDDW .D1T1 *A30++(32),A21:A20 ; |231| (P) <0,6>
1506 0000017c 087097e7 || LDDW .D2T2 *B28++(32),B17:B16 ; |231| (P) <0,6>
1507 00000180 09002d9b || DMV .L2 B1,B0,B19:B18 ; |213| (P) <0,6>
1508 00000184 025aed99 || DMV .L1 A23,A22,A5:A4 ; |213| (P) <0,6>
1509 00000188 044a6ef3 || DMV .S2 B19,B18,B9:B8 ; |215| (P) <0,6>
1510 0000018c 0473aef0 || DMV .S1 A29,A28,A9:A8 ; |215| (P) <0,6>
1511
1512 00000190 2f66 SPMASK S1,S2
1513 00000194 0c1911a1 || ADD .S1X 8,B6,A24
1514 00000192 af4f || MV .S2 B6,B29
1515 00000198 03212d9b || DMV .L2 B9,B8,B7:B6 ; |223| (P) <0,7>
1516 000001a0 05212d98 || DMV .L1 A9,A8,A11:A10 ; |223| (P) <0,7>
1517
1518 000001a4 0410ad99 DMV .L1 A5,A4,A9:A8 ; |223| (P) <0,8>
1519 000001a8 024a6d9a || DMV .L2 B19,B18,B5:B4 ; |223| (P) <0,8>
1520
1521 000001ac 0318045b DADD .L2 0,B7:B6,B7:B6 ; |231| (P) <0,9>
1522 000001b0 02100863 || DADD .S2 0,B5:B4,B5:B4 ; |231| (P) <0,9>
1523 000001b4 09280459 || DADD .L1 0,A11:A10,A19:A18 ; |231| (P) <0,9>
1524 000001b8 08200861 || DADD .S1 0,A9:A8,A17:A16 ; |231| (P) <0,9>
1525 000001bc 12211201 || CMATMPY .M1X B9:B8,A11:A10:A9:A8,A7:A6:A5:A4 ; |231| (P) <0,9>
1526 000001c0 1a109202 || CMATMPY .M2X A5:A4,B7:B6:B5:B4,B23:B22:B21:B20 ; |231| (P) <0,9>
1527
1528 000001c4 00006000 NOP 4
1529
1530 000001c8 14421201 CMATMPY .M1X B17:B16,A19:A18:A17:A16,A11:A10:A9:A8 ; |231| (P) <0,14>
1531 000001cc 14509202 || CMATMPY .M2X A21:A20,B7:B6:B5:B4,B11:B10:B9:B8 ; |231| (P) <0,14>
1532
1533 000001d0 0fdf06b3 SSHVR .M2 B23,B24,B31 ; |231| (P) <0,15>
1534 000001d4 0f9f26b0 || SSHVR .M1 A7,A25,A31 ; |231| (P) <0,15>
1535
1536 000001d8 00000000 NOP 1
1537
1538 000001dc 001b26b1 SSHVR .M1 A6,A25,A0 ; |231| (P) <0,17>
1539 000001e0 0f5b06b2 || SSHVR .M2 B22,B24,B30 ; |231| (P) <0,17>
1541 000001e4 00000000 NOP 1
1542
1543 000001e8 0b2726b1 SSHVR .M1 A9,A25,A22 ; |231| (P) <0,19>
1544 000001ec 0a2306b2 || SSHVR .M2 B8,B24,B20 ; |231| (P) <0,19>
1545
1546 000001f0 042326b1 SSHVR .M1 A8,A25,A8 ; |231| (P) <0,20>
1547 000001f4 022706b2 || SSHVR .M2 B9,B24,B4 ; |231| (P) <0,20>
1548
1549 000001f8 08fbecb3 SPACK2 .S2 B31,B30,B17 ; |231| <0,21>
1550 000001fc 0883ecb0 || SPACK2 .S1 A31,A0,A17 ; |231| <0,21>
1551
1552 00000200 0822ccb1 SPACK2 .S1 A22,A8,A16 ; |231| <0,22>
1553 00000204 08508cb2 || SPACK2 .S2 B4,B20,B16 ; |231| <0,22>
1556
1557 00000208 04c34001 SPKERNEL 2,3
1558 0000020c 087457c7 || STDW .D2T2 B17:B16,*B29++(16) ; |231| <0,23>
1559 00000210 08605744 || STDW .D1T1 A17:A16,*A24++(16) ; |231| <0,23>
1560
1562 ;** --------------------------------------------------------------------------*
1563 00000214 $C$L14: ; PIPED LOOP EPILOG
1564 ;** --------------------------------------------------------------------------*