Alternative solution (tune-1a)¶
An equivalent SIMD conversion for the i-loop without using a compiler-specific directive is as follows. We split the loop into several parts and change the order of loops so that the i-loop comes to the innermost. Although the editing of the code is extensive, similar methods can be used in other applications.
axhelm-1a.cpp¶
79 for(int k = 0; k < p_Nq; k++) for(int j = 0; j < p_Nq; ++j)
80 {
81 for(int i = 0; i < p_Nq; ++i) {
82 qr[i] = 0.f;
83 qs[i] = 0.f;
84 qt[i] = 0.f;
85 }
86 for(int m = 0; m < p_Nq; m++) {
87 for(int i = 0; i < p_Nq; ++i) {
88 qr[i] += s_D[i][m] * s_q[k][j][m];
89 qs[i] += s_D[j][m] * s_q[k][m][i];
90 qt[i] += s_D[k][m] * s_q[m][j][i];
91 }
92 }
93
94 for(int i = 0; i < p_Nq; ++i) {
95 // const dlong id = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np;
96 const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
97 const dfloat r_G00 = ggeo[gbase + p_G00ID * p_Np];
98 const dfloat r_G01 = ggeo[gbase + p_G01ID * p_Np];
99 const dfloat r_G11 = ggeo[gbase + p_G11ID * p_Np];
100 const dfloat r_G12 = ggeo[gbase + p_G12ID * p_Np];
101 const dfloat r_G02 = ggeo[gbase + p_G02ID * p_Np];
102 const dfloat r_G22 = ggeo[gbase + p_G22ID * p_Np];
103
104 s_Gqr[k][j][i] = r_G00 * qr[i] + r_G01 * qs[i] + r_G02 * qt[i];
105 s_Gqs[k][j][i] = r_G01 * qr[i] + r_G11 * qs[i] + r_G12 * qt[i];
106 s_Gqt[k][j][i] = r_G02 * qr[i] + r_G12 * qs[i] + r_G22 * qt[i];
107 }
108 }
109
110 for(int k = 0; k < p_Nq; ++k) for(int j = 0; j < p_Nq; ++j)
111 {
112 for(int i = 0; i < p_Nq; ++i) {
113 qr[i] = 0.f;
114 qs[i] = 0.f;
115 qt[i] = 0.f;
116 }
117 for(int m = 0; m < p_Nq; m++) {
118 for(int i = 0; i < p_Nq; ++i) {
119 qr[i] += s_D[m][i] * s_Gqr[k][j][m];
120 qs[i] += s_D[m][j] * s_Gqs[k][m][i];
121 qt[i] += s_D[m][k] * s_Gqt[m][j][i];
122 }
123 }
124 for(int i = 0; i < p_Nq; ++i) {
125 const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i;
126 Aq[id] = qr[i] + qs[i] + qt[i];
127 }
128 }
129 }
130}
For both i-loops, the loop body is split into three parts,
and the middle part comes to the inner of the m-loop.
Note that the datatype of intermediate variables qr, qs, qt
is changed from a scaler to an array with size 8.
Not the same as tune-1, but this version yields a performance of 321 GFLOP/s.