I believe the C66 compiler (version 7.4.2) is making some poor choices with respect to MPYSPDP and MPYSP2DP. This might be a carry-over from the odd way the C67 compiler handles the following:
float x, y;
double z0 = x * y; // uses MPYSP and SPDP
double z1 = (double) x * y; // uses MPYSP2DP
Here's a stripped-down example that demonstrates the problem; it could probably be reduced further.
// One update step of a second-order filter section, evaluated in double
// precision (the name suggests a direct-form-II-transposed biquad).
//
//   result = b0 * in + w1
//   w1out  = b1 * in + w0 - a1 * result
//   w0out  = b2 * in      - a2 * result
//
// Parameters:
//   in              input sample
//   b0, b1, b2      coefficients applied to the input
//   a1, a2          coefficients applied to the result (subtracted)
//   w0, w1          state values coming into this step
//   w0out, w1out    receive the updated state values
// Returns `result` converted from double to float.
//
// NOTE(review): FUNC_INTERRUPT_THRESHOLD and `restrict` are TI compiler
// extensions; presumably the -1 threshold tells the compiler it need not keep
// the generated loops interruptible - confirm against the toolchain manual.
#pragma FUNC_INTERRUPT_THRESHOLD(-1)
inline float DF2TFilter(double in, double b0, double b1, double b2, double a1, double a2,
double w0, double w1,
double& restrict w0out, double& restrict w1out)
{
// Output of this step; also fed back into both state updates below.
double result = b0 * in + w1;
// Next w1: b1*in + w0 - a1*result, accumulated one term per statement.
double acc1 = b1 * in;
acc1 += w0;
acc1 -= a1 * result;
w1out = acc1;
// Next w0: b2*in - a2*result (neither incoming state value is used here).
double acc0 = b2 * in;
acc0 -= a2 * result;
w0out = acc0;
// Implicit double -> float narrowing on return.
return result;
}
// Runs `num` second-order filter sections, one per loop iteration, each on its
// own input sample, coefficient set and state pair.
//
// For section i the loop reads
//   coeffs[6*i + 0..4]   as b0, b1, b2, a1, a2 (the sixth float of each group
//                        is never read),
//   inputs[i]            as the input sample,
//   states[2*i + 0..1]   as the incoming state w0, w1,
// widens everything to double, calls DF2TFilter, and writes
//   outputs[i]           the filter result as a float,
//   states[2*i + 0..1]   the updated state.
//
// Parameters:
//   outputs  destination for `num` float results
//   inputs   `num` float input samples
//   coeffs   6 floats per section
//   states   2 doubles per section, updated in place
//   num      number of sections; asserted to be > 0
// Returns 0 unconditionally. The original fell off the end of this non-void
// function, which is undefined behaviour in C++; no caller can have relied on
// the value.
//
// NOTE(review): `restrict`, std::_nassert and FUNC_INTERRUPT_THRESHOLD are TI
// compiler extensions; the _nassert calls presumably pass the trip-count and
// 8-byte alignment facts to the optimizer - confirm against the toolchain
// manual.
#pragma FUNC_INTERRUPT_THRESHOLD(-1)
int f(float* restrict outputs,
const float* restrict inputs,
const float* restrict coeffs,
double* restrict states,
int num)
{
// Separate read and write views of the same state array.
// NOTE(review): two restrict-qualified pointers to the same storage -
// presumably deliberate so state loads and stores are not treated as
// dependent across iterations; confirm.
const double* restrict inStates = states;
double* restrict outStates = states;
std::_nassert(num > 0);
std::_nassert((intptr_t(coeffs) % 8) == 0);
std::_nassert((intptr_t(inputs) % 8) == 0);
std::_nassert((intptr_t(outputs) % 8) == 0);
for (int i = 0; i < num; ++i)
{
// Coefficient group for this section (stride of 6 floats).
const float* restrict c = &coeffs[6 * i];
// Explicit float -> double widening of every operand before the call.
double b0 = c[0];
double b1 = c[1];
double b2 = c[2];
double a1 = c[3];
double a2 = c[4];
double in = inputs[i];
// Incoming state pair for this section.
const double* restrict s = &inStates[2 * i];
double w0 = s[0];
double w1 = s[1];
double ow0, ow1;
outputs[i] = DF2TFilter(in, b0, b1, b2, a1, a2, w0, w1, ow0, ow1);
outStates[2 * i + 0] = ow0;
outStates[2 * i + 1] = ow1;
}
// Give the declared int return type a defined value.
return 0;
}
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop found in file : ../main.cpp
;* Loop source line : 42
;* Loop opening brace source line : 43
;* Loop closing brace source line : 58
;* Known Minimum Trip Count : 16
;* Known Maximum Trip Count : 16
;* Known Max Trip Count Factor : 16
;* Loop Carried Dependency Bound(^) : 5
;* Unpartitioned Resource Bound : 10
;* Partitioned Resource Bound(*) : 12
;* Resource Partition:
;* A-side B-side
;* .L units 1 0
;* .S units 2 2
;* .D units 5 4
;* .M units 6 4
;* .X cross paths 4 3
;* .T address paths 6 5
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 3 3 (.L or .S unit)
;* Addition ops (.LSD) 1 0 (.L or .S or .D unit)
;* Bound(.L .S .LS) 3 3
;* Bound(.L .S .D .LS .LSD) 4 3
;*
;* Searching for software pipeline schedule at ...
;* ii = 12 Schedule found with 3 iterations in parallel
;*
;* Register Usage Table:
;* +-----------------------------------------------------------------+
;* |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;* |00000000001111111111222222222233|00000000001111111111222222222233|
;* |01234567890123456789012345678901|01234567890123456789012345678901|
;* |--------------------------------+--------------------------------|
;* 0: | ** *** ** *** | * **** * |
;* 1: | ** ** ** *** | * **** ** |
;* 2: | * ** ******* | **** * **** |
;* 3: | * * ** ******* | **** * **** |
;* 4: | * ** ******* | **** * **** |
;* 5: | * ** ******* | **** * **** |
;* 6: | **** ******* | **** * ** |
;* 7: | ***** ******* | ****** * ** |
;* 8: | ***** ******* | **** * |
;* 9: | ** ** *** | **** * |
;* 10: | ** ** *** | ****** * ** |
;* 11: | ****** ** *** | **** * **** |
;* +-----------------------------------------------------------------+
;*
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* Minimum safe trip count : 1
;* Min. prof. trip count (est.) : 3
;*
;* Mem bank conflicts/iter(est.) : { min 0.000, est 0.000, max 0.000 }
;* Mem bank perf. penalty (est.) : 0.0%
;*
;*
;* Total cycles (est.) : 24 + min_trip_cnt * 12 = 216
;*----------------------------------------------------------------------------*
;* SETUP CODE
;*
;* MV B9,A17
;* ADD 8,A17,A17
;* MV A21,B7
;* ADDK 36,B7
;* MV A21,A16
;* ADDK 28,A16
;* ADDK 24,A21
;* MV A20,B6
;* ADD 8,A20,A20
;*
;* SINGLE SCHEDULED ITERATION
;*
;* $C$C57:
;* 0 LDW .D1T1 *A21++(24),A3 ; |16|
;* 1 LDW .D2T2 *B8++,B16 ; |55|
;* 2 NOP 3
;* 5 SPDP .S1 A3,A5:A4 ; |16|
;* 6 MV .D1X B16,A3 ; |55| Define a twin register
;* 7 MPYSPDP .M1 A3,A5:A4,A19:A18 ; |16|
;* 8 LDNDW .D1T2 *A16++(24),B17:B16 ; |56|
;* 9 NOP 3
;* 12 SPDP .S2 B16,B19:B18 ; |55|
;* 13 DADD .L1X 0,B17:B16,A5:A4 ; |56| Define a twin register
;* 14 LDDW .D1T1 *A20++(16),A7:A6 ; |16|
;* 15 MPYSPDP .M1X A5,B19:B18,A9:A8 ; |56|
;* 16 MPYSPDP .M2 B16,B19:B18,B19:B18 ; |57|
;* 17 LDDW .D2T2 *B6++(16),B21:B20 ; |57|
;* 18 LDNDW .D2T1 *B7++(24),A7:A6 ; |56|
;* 19 NOP 1
;* 20 FADDDP .L1 A7:A6,A19:A18,A5:A4 ; |16|
;* 21 NOP 2
;* 23 DPSP .L1 A5:A4,A3 ; |55|
;* || DADD .S2X 0,A7:A6,B5:B4 ; |56| Define a twin register
;* || MPYSPDP .M1 A7,A5:A4,A7:A6 ; |56|
;* || FADDDP .L2 B21:B20,B19:B18,B21:B20 ; |57|
;* 24 MPYSPDP .M2X B4,A5:A4,B5:B4 ; |57|
;* 25 NOP 2
;* 27 STW .D1T1 A3,*A22++ ; |55|
;* 28 NOP 2
;* 30 FSUBDP .L1 A9:A8,A7:A6,A5:A4 ; |56|
;* 31 FSUBDP .L2 B21:B20,B5:B4,B5:B4 ; |57|
;* 32 NOP 1
;* 33 STDW .D2T1 A5:A4,*B9++(16) ; |56|
;* 34 STDW .D1T2 B5:B4,*A17++(16) ; |57|
;* || SPBR $C$C57
;* 35 NOP 1
;* 36 ; BRANCHCC OCCURS {$C$C57} ; |42|
;*----------------------------------------------------------------------------*
As you can see, even though the arguments to DF2TFilter are double, the compiler treats them as single precision and emits the mixed-precision MPYSPDP multiply. Even the superfluous `double b0 = c[0];` (and likewise for the other coefficients) doesn't help. This yields an ii of 12.
Here's some linear assembly that uses SPDP and the fast DP operations (FMPYDP/FADDDP/FSUBDP) and achieves an ii of 6.
; fasm(outputs, inputs, coeffs, states, num)
;
; Linear-assembly version of the per-section filter loop. Each pass through
; `inner` handles one section:
;
;   result = b0 * in + w1                  -> stored (as float) to *outputs++
;   w1'    = b1 * in + w0 - a1 * result    -> stored to states (second double)
;   w0'    = b2 * in      - a2 * result    -> stored to states (first double)
;
; Per section the loop consumes three double-words (six floats) from coeffs,
; one float from inputs and two doubles from states, then writes the two
; updated doubles back to states and one float to outputs.
;
; NOTE(review): the loads place a1/a2 in the first double-word, b0/b1 in the
; second and b2 (plus an ignored float) in the third, which is a different
; coefficient order from the C++ function f (b0, b1, b2, a1, a2). Which float
; lands in which half of a register pair also depends on endianness - confirm.
;
; The loop is bottom-tested (SUB then conditional branch), so num must be at
; least 1 on entry.
;
; Change from the original: the instruction "MV thisState, outputState" was
; removed. thisState was never written and outputState never read, so it was a
; dead read of an undefined register. The unused register declarations
; thisCoeff, thisState, outputState and output were removed with it.

        .global fasm
fasm    .cproc  outputs, inputs, coeffs, states, num
        ; NOTE(review): .no_mdep presumably tells the assembly optimizer that
        ; no memory dependences exist in this procedure - confirm in TI docs.
        .no_mdep

        .reg    input
        .reg    coeffB0L, coeffB0H, coeffB1L, coeffB1H
        .reg    coeffB2L, coeffB2H
        .reg    coeffA1L, coeffA1H, coeffA2L, coeffA2H
        .reg    inH, inL
        .reg    coeffA1f, coeffA2f
        .reg    coeffB0f, coeffB1f
        .reg    coeffB2f, ignore
        .reg    w0L, w0H
        .reg    w1L, w1H
        .reg    filterOutput
        .reg    temp0L, temp0H, temp1L, temp1H
        .reg    resultL, resultH
        .reg    inStates

        ; Separate read pointer over the state array; `states` itself is used
        ; as the write pointer at the bottom of the loop.
        MV      states, inStates

inner:  .trip   1
        ; Load the six single-precision coefficients, two per double-word.
        LDDW    *coeffs++, coeffA2f:coeffA1f
        LDDW    *coeffs++, coeffB1f:coeffB0f
        LDDW    *coeffs++, ignore:coeffB2f

        ; Widen each coefficient to double precision.
        SPDP    coeffA1f, coeffA1H:coeffA1L
        SPDP    coeffA2f, coeffA2H:coeffA2L
        SPDP    coeffB0f, coeffB0H:coeffB0L
        SPDP    coeffB1f, coeffB1H:coeffB1L
        SPDP    coeffB2f, coeffB2H:coeffB2L

        ; Load and widen the input sample.
        LDW     *inputs++, input
        SPDP    input, inH:inL

        ; Load the incoming state pair (w0 first, then w1).
        LDDW    *inStates++, w0H:w0L
        LDDW    *inStates++, w1H:w1L

        ; result = b0 * in + w1
        FMPYDP  coeffB0H:coeffB0L, inH:inL, resultH:resultL
        FADDDP  resultH:resultL, w1H:w1L, resultH:resultL

        ; w1 = b1 * in + w0 - a1 * result
        FMPYDP  coeffB1H:coeffB1L, inH:inL, w1H:w1L
        FADDDP  w1H:w1L, w0H:w0L, w1H:w1L
        FMPYDP  coeffA1H:coeffA1L, resultH:resultL, temp1H:temp1L
        FSUBDP  w1H:w1L, temp1H:temp1L, w1H:w1L

        ; w0 = b2 * in - a2 * result
        FMPYDP  coeffB2H:coeffB2L, inH:inL, w0H:w0L
        FMPYDP  coeffA2H:coeffA2L, resultH:resultL, temp0H:temp0L
        FSUBDP  w0H:w0L, temp0H:temp0L, w0H:w0L

        ; Narrow the result to single precision for output.
        DPSP    resultH:resultL, filterOutput

        ; Write back the updated state pair and the output sample.
        STDW    w0H:w0L, *states++
        STDW    w1H:w1L, *states++
        STW     filterOutput, *outputs++

        ; Bottom-tested loop counter.
        SUB     num, 1, num
[num]   B       inner
        .endproc
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop found in file : ../test.sa
;* Loop source line : 27
;* Loop closing brace source line : 60
;* Known Minimum Trip Count : 1
;* Known Max Trip Count Factor : 1
;* Loop Carried Dependency Bound(^) : 5
;* Unpartitioned Resource Bound : 6
;* Partitioned Resource Bound(*) : 6
;* Resource Partition:
;* A-side B-side
;* .L units 0 1
;* .S units 6* 6*
;* .D units 6* 3
;* .M units 4 1
;* .X cross paths 5 2
;* .T address paths 6* 3
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 4 1 (.L or .S unit)
;* Addition ops (.LSD) 0 0 (.L or .S or .D unit)
;* Bound(.L .S .LS) 5 4
;* Bound(.L .S .D .LS .LSD) 6* 4
;*
;* Searching for software pipeline schedule at ...
;* ii = 6 Schedule found with 5 iterations in parallel
;*
;* Register Usage Table:
;* +-----------------------------------------------------------------+
;* |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;* |00000000001111111111222222222233|00000000001111111111222222222233|
;* |01234567890123456789012345678901|01234567890123456789012345678901|
;* |--------------------------------+--------------------------------|
;* 0: | ******* ******* ** *** | *** **** ** |
;* 1: | ******* ** **** *** | * ** ** ** |
;* 2: | **** ** ******** *** | ** ** ** ** |
;* 3: | ******* ******** *** | ** ** ** ** |
;* 4: | ******* *************** | * **** ** * |
;* 5: | * **** **** ** *** | * ******* |
;* +-----------------------------------------------------------------+
;*
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* Minimum safe trip count : 1
;* Min. prof. trip count (est.) : 3
;*
;* Mem bank conflicts/iter(est.) : { min 0.000, est 0.000, max 0.000 }
;* Mem bank perf. penalty (est.) : 0.0%
;*
;*
;* Total cycles (est.) : 24 + trip_cnt * 6
;*----------------------------------------------------------------------------*
;* SETUP CODE
;*
;* MV A30,A25
;* ADD 8,A25,A25
;* MV A28,A24
;* ADD 8,A24,A24
;* MV B20,A3
;* ADDK 16,A3
;* MV B20,B21
;* ADD 8,B21,B21
;*
;* SINGLE SCHEDULED ITERATION
;*
;* $C$C26:
;* 0 LDDW .D2T2 *B21++(24),B23:B22 ; |28|
;* 1 LDW .D1T1 *A29++,A22 ; |36|
;* 2 NOP 1
;* 3 LDDW .D1T1 *A3++(24),A7:A6 ; |29|
;* 4 NOP 1
;* 5 LDDW .D2T2 *B20++(24),B7:B6 ; |27|
;* || SPDP .S2 B22,B17:B16 ; |32|
;* || LDDW .D1T1 *A25++(16),A9:A8 ; |40|
;* 6 SPDP .S1 A22,A21:A20 ; |37| ^
;* 7 NOP 1
;* 8 SPDP .S1 A6,A17:A16 ; |34|
;* || LDDW .D1T1 *A30++(16),A23:A22 ; |39|
;* || FMPYDP .M1X B17:B16,A21:A20,A17:A16 ; |43| ^
;* 9 SPDP .S2 B23,B19:B18 ; |33|
;* 10 SPDP .S1X B7,A5:A4 ; |31|
;* || FMPYDP .M1 A17:A16,A21:A20,A19:A18 ; |49| ^
;* 11 NOP 1
;* 12 FADDDP .L1 A17:A16,A9:A8,A7:A6 ; |44| ^
;* || FMPYDP .M1X B19:B18,A21:A20,A5:A4 ; |45|
;* 13 SPDP .S2 B6,B9:B8 ; |30|
;* 14 NOP 1
;* 15 FMPYDP .M1 A5:A4,A7:A6,A7:A6 ; |50| ^
;* 16 FADDDP .L1 A5:A4,A23:A22,A9:A8 ; |46|
;* || FMPYDP .M2X B9:B8,A7:A6,B5:B4 ; |47|
;* 17 DADD .L2X 0,A7:A6,B5:B4 ; |44| Define a twin register
;* 18 DPSP .L2 B5:B4,B4 ; |52|
;* 19 FSUBDP .L1 A19:A18,A7:A6,A27:A26 ; |51| ^
;* 20 NOP 1
;* 21 FSUBDP .L1X A9:A8,B5:B4,A7:A6 ; |48|
;* 22 STDW .D1T1 A27:A26,*A28++(16) ; |54|
;* || STW .D2T2 B4,*B24++ ; |57|
;* 23 NOP 1
;* 24 STDW .D1T1 A7:A6,*A24++(16) ; |55|
;* || SPBR $C$C26
;* 25 NOP 5
;* 30 ; BRANCHCC OCCURS {$C$C26} ; |60|
;*----------------------------------------------------------------------------*
I'm not sure if there are ever cases where MPYSP2DP/MPYSPDP are a win on the C66, but I've seen one where it was a tie.