I have a fairly simple floating-point matrix-matrix multiply and accumulate function in C. It compiles and runs successfully without optimization. With function (-o2) and file (-o2) level optimization, it produces the incorrect output.
Here is the function:
typedef struct {
int r, c; // number of rows and columns
int capacity; // maximum number of elements that can be stored
// in the memory block pointed at by 'el'
double *el; // storage for elements (row-wise)
} CMAT;
int cmat_mm_mac(CMAT* C, const CMAT* A, const CMAT* B)
{
int _r, _c, id;
// Check inputs:
if (A->c != B->r || C->r != A->r || C->c != B->c) {
return CMAT_ERR_SIZE;
}
// The output cannot be one of the inputs.
if (A == C || B == C) {
return CMAT_ERR_ARGS;
}
for (_r=0; _r < C->r; _r++) // Outer dims
for (_c=0; _c < C->c; _c++) // form new matrix
for (id=0; id < A->c; id++) // Inner dimension
C->el[_r*C->c+_c] += A->el[_r*A->c+id] * B->el[id*B->c+_c];
return 0;
}
Here is a function to test the above:
#include <assert.h>
void testMMac(void)
{
double A[] = {
0.2785, 0.9575, 0.1576,
0.5469, 0.9649, 0.9706
}, B[] = {
0.9572, 0.1419, 0.4854,
0.4218, 0.8003, 0.9157
}, C[4] = { 0 }, Cexpected[] = {
0.8575, 0.5877,
1.7686, 1.3734
};
CMAT mA = { 2, 3, 6, NULL }, mB = { 3, 2, 6, NULL }, mC = { 2, 2, 4, NULL };
int i;
mA.el = A; mB.el = B; mC.el = C;
cmat_mm_mac(&mC, &mA, &mB);
for (i = 0; i < mC.r * mC.c; ++i) {
printf("%.4f ", mC.el[i]);
assert(fabs(mC.el[i] - Cexpected[i]) < TOL);
}
printf("\n\n");
}
The compiler options are -g -ml -mt -v28 --float_support=fpu32
Hope you can help; I hate turning off optimization for specific modules to get around bugs.