Listing 7.

void matmul2(int dim, float *a, float *b,
   float *c)
 {
  float dot0, dot1, dot2, dot3, a0, a1, a2,
        a3, b0, b1, b2, b3;
  float dot4, dot5, dot6, dot7, a4, a5, a6,
        a7, b4, b5, b6, b7;
  long I, J, i, j, k;

/*  ...transpose b into tb like in matmul1...; */

  for (I = 0; I < dim*dim; I += dim)
     for (j = J = 0; j < dim; ++j, J += dim)
     {
        dot0 = dot1 = dot2 = dot3 = dot4 = dot5 =
             dot6 = dot7 = 0.0;
        for (k = 0; k < dim; k += 8)
        {
           a0 =  a[I + k + 0];
           b0 = tb[J + k + 0];
           a1 =  a[I + k + 1];
           b1 = tb[J + k + 1];
           a2 =  a[I + k + 2];
           b2 = tb[J + k + 2];
           a3 =  a[I + k + 3];
           b3 = tb[J + k + 3];
           a4 =  a[I + k + 4];
           b4 = tb[J + k + 4];
           a5 =  a[I + k + 5];
           b5 = tb[J + k + 5];
           a6 =  a[I + k + 6];
           b6 = tb[J + k + 6];
           a7 =  a[I + k + 7];
           b7 = tb[J + k + 7];
           dot0 += a0 * b0;
           dot1 += a1 * b1;
           dot2 += a2 * b2;
           dot3 += a3 * b3;
           dot4 += a4 * b4;
           dot5 += a5 * b5;
           dot6 += a6 * b6;
           dot7 += a7 * b7;
}
        c[I + j] = dot0 + dot1 + dot2 + dot3 +
                   dot4 + dot5 + dot6 + dot7;
     }
}