> Ex: as long as i, i+1, i+2, i+3, ... i+7 are not dependent on each other, you can vectorize to SIMD-width 8.
Do you mean like this? I can get this to run about as fast as the first "unoptimized" version in the SO post, but not faster.
/*
 * Fill data[0 .. LEN-1] with the quadratic A*i*i + B*i + C, using
 * second-order forward differences on eight independent lanes.
 *
 * Lane k produces the values for indices k, k+8, k+16, ...:
 *   Y[k]  value of the polynomial at the lane's current index;
 *   Z[k]  amount to add to Y[k] to reach the index eight positions later
 *         (initially (64 + 16*k)*A + 8*B);
 *   A128  amount every Z[k] grows by per block (2*A*8*8).
 * A lane never reads another lane's state, so each inner loop below has no
 * dependency between its iterations.
 *
 * Because values are built up by repeated addition, they may differ in the
 * low bits from evaluating the polynomial directly at each index.
 *
 * `data` and `LEN` are defined outside this function.
 * NOTE(review): assumes data has room for at least LEN doubles -- confirm
 * against its definition.
 */
void compute(void)
{
    enum { LANES = 8 };

    const double A = 1.1, B = 2.2, C = 3.3;
    const double A128 = 128*A;

    /* Polynomial values at indices 0..7. */
    double Y[LANES] = {
        C,
        A + B + C,
        4*A + 2*B + C,
        9*A + 3*B + C,
        16*A + 4*B + C,
        25*A + 5*B + C,
        36*A + 6*B + C,
        49*A + 7*B + C
    };

    /* First step of each lane: value(k + 8) - value(k). */
    double Z[LANES] = {
        64*A + 8*B,
        80*A + 8*B,
        96*A + 8*B,
        112*A + 8*B,
        128*A + 8*B,
        144*A + 8*B,
        160*A + 8*B,
        176*A + 8*B
    };

    int i;

    /* Full blocks only: stop before a block would run past LEN. */
    for (i = 0; i + LANES <= LEN; i += LANES) {
        for (int k = 0; k < LANES; k++) {
            data[i + k] = Y[k];
        }
        /* Advance every lane to the next block; Y must use the old Z. */
        for (int k = 0; k < LANES; k++) {
            Y[k] += Z[k];
        }
        for (int k = 0; k < LANES; k++) {
            Z[k] += A128;
        }
    }

    /*
     * Tail: when LEN is not a multiple of LANES, fewer than LANES elements
     * remain. Y already holds their values, so store only those that fit
     * instead of writing a whole block past the end of the range.
     */
    for (int k = 0; i + k < LEN; k++) {
        data[i + k] = Y[k];
    }
}