From README
PENNANT is an unstructured mesh physics mini-app designed for advanced architecture research. It contains mesh data structures and a few physics algorithms adapted from the LANL rad-hydro code FLAG, and gives a sample of the typical memory access patterns of FLAG.
Problem Size Discussion
From README.apex
The APEX RFP defines problem sizes to be used in benchmarking. The PENNANT test problems corresponding to this definition are:
- “Small” problem: leblancbig
- “Medium” problem: leblancx4
- “Large” problem: leblancx16
- “Grand” problem: leblancx64
In addition, the nohpoly problem will be used for verifying that the baseline code functionality has been preserved (see below). Note for reference that nohpoly would also be considered a “small” problem by the RFP definition.
Analysis
Build and Run Information
Compiler = 'clang 5.0.1'
Build_Flags = '-g -O3 -march=native -fopenmp'
Run_Flags = '../test/leblancbig/leblancbig.pnt'
leblancbig.pnt
tstop 6.0
meshtype rect
meshparams 160 1440 1.0 9.0
subregion 0.0 1.0 3.0 9.0
rinit 1.0
einit 0.1
rinitsub 1.0e-3
einitsub 1.0e-7
bcx 0.0 1.0
bcy 0.0 9.0
ssmin 0.1
q1 0.3
q2 2.0
dtinit 1.e-3
dtreport 100
chunksize 512
Scaling
Program Aggregate
Threads (Time) |
Inst/Cycle per Core |
L1 DC Miss % |
L2 DC Miss % |
L3 Miss % |
L1 Loads/Cycle per Core |
L2 B/W Used |
L3 B/W Used |
DRAM B/W Used |
1 (100.0%) |
1.4 |
5.5% |
17.5% |
26.4% |
0.76 |
16.2% |
3.8% |
2.1% |
72 (100.0%) |
0.8 |
4.9% |
25.6% |
18.1% |
0.36 |
14.0% |
5.0% |
8.5% |
QCS::setCornerDiv( )
Threads (Time) |
Inst/Cycle per Core |
L1 DC Miss % |
L2 DC Miss % |
L3 Miss % |
L1 Loads/Cycle per Core |
L2 B/W Used |
L3 B/W Used |
DRAM B/W Used |
1 (34.2%) |
1.5 |
2.3% |
14.2% |
1.2% |
0.68 |
6.0% |
1.1% |
0.0% |
72 (13.4%) |
1.5 |
2.6% |
25.1% |
2.5% |
0.62 |
13.0% |
4.5% |
1.1% |
87 // Routine number [2] in the full algorithm
88 // [2.1] Find the corner divergence
89 // [2.2] Compute the cos angle for c
90 // [2.3] Find the evolution factor c0evol(c)
91 // and the Delta u(c) = du(c)
92 void QCS::setCornerDiv(
93 double* c0area,
94 double* c0div,
95 double* c0evol,
96 double* c0du,
97 double* c0cos,
98 const int sfirst,
99 const int slast) {
100
101 const Mesh* mesh = hydro->mesh;
102 const int nums = mesh->nums;
103 const int numz = mesh->numz;
104
105 const double2* pu = hydro->pu;
106 const double2* px = mesh->pxp;
107 const double2* ex = mesh->exp;
108 const double2* zx = mesh->zxp;
109 const double* elen = mesh->elen;
110 const int* znump = mesh->znump;
111
112 int cfirst = sfirst;
113 int clast = slast;
114 int zfirst = mesh->mapsz[sfirst];
115 int zlast = (slast < nums ? mesh->mapsz[slast] : numz);
116
117 double2* z0uc = Memory::alloc<double2>(zlast - zfirst);
118 double2 up0, up1, up2, up3;
119 double2 xp0, xp1, xp2, xp3;
120
121 // [1] Compute a zone-centered velocity
122 fill(&z0uc[0], &z0uc[zlast-zfirst], double2(0., 0.));
123 for (int c = cfirst; c < clast; ++c) {
124 int p = mesh->mapsp1[c];
125 int z = mesh->mapsz[c];
126 int z0 = z - zfirst;
127 z0uc[z0] += pu[p];
128 }
129
130 for (int z = zfirst; z < zlast; ++z) {
131 int z0 = z - zfirst;
132 z0uc[z0] /= (double) znump[z];
133 }
134
135 // [2] Divergence at the corner
136 #pragma ivdep
137 for (int c = cfirst; c < clast; ++c) {
138 int s2 = c;
139 int s = mesh->mapss3[s2];
140 // Associated zone, corner, point
141 int z = mesh->mapsz[s];
142 int z0 = z - zfirst;
143 int c0 = c - cfirst;
144 int p = mesh->mapsp2[s];
145 // Points
146 int p1 = mesh->mapsp1[s];
147 int p2 = mesh->mapsp2[s2];
148 // Edges
149 int e1 = mesh->mapse[s];
150 int e2 = mesh->mapse[s2];
151
152 // Velocities and positions
153 // 0 = point p
154 up0 = pu[p];
155 xp0 = px[p];
156 // 1 = edge e2
157 up1 = 0.5 * (pu[p] + pu[p2]);
158 xp1 = ex[e2];
159 // 2 = zone center z
160 up2 = z0uc[z0];
161 xp2 = zx[z];
162 // 3 = edge e1
163 up3 = 0.5 * (pu[p1] + pu[p]);
164 xp3 = ex[e1];
165
166 // compute 2d cartesian volume of corner
167 double cvolume = 0.5 * cross(xp2 - xp0, xp3 - xp1);
168 c0area[c0] = cvolume;
169
170 // compute cosine angle
171 double2 v1 = xp3 - xp0;
172 double2 v2 = xp1 - xp0;
173 double de1 = elen[e1];
174 double de2 = elen[e2];
175 double minelen = min(de1, de2);
176 c0cos[c0] = ((minelen < 1.e-12) ?
177 0. :
178 4. * dot(v1, v2) / (de1 * de2));
179
180 // compute divergence of corner
181 c0div[c0] = (cross(up2 - up0, xp3 - xp1) -
182 cross(up3 - up1, xp2 - xp0)) /
183 (2.0 * cvolume);
184
185 // compute evolution factor
186 double2 dxx1 = 0.5 * (xp1 + xp2 - xp0 - xp3);
187 double2 dxx2 = 0.5 * (xp2 + xp3 - xp0 - xp1);
188 double dx1 = length(dxx1);
189 double dx2 = length(dxx2);
190
191 // average corner-centered velocity
192 double2 duav = 0.25 * (up0 + up1 + up2 + up3);
193
194 double test1 = abs(dot(dxx1, duav) * dx2);
195 double test2 = abs(dot(dxx2, duav) * dx1);
196 double num = (test1 > test2 ? dx1 : dx2);
197 double den = (test1 > test2 ? dx2 : dx1);
198 double r = num / den;
199 double evol = sqrt(4.0 * cvolume * r);
200 evol = min(evol, 2.0 * minelen);
201
202 // compute delta velocity
203 double dv1 = length2(up1 + up2 - up0 - up3);
204 double dv2 = length2(up2 + up3 - up0 - up1);
205 double du = sqrt(max(dv1, dv2));
206
207 c0evol[c0] = (c0div[c0] < 0.0 ? evol : 0.);
208 c0du[c0] = (c0div[c0] < 0.0 ? du : 0.);
209 } // for s
210
211 Memory::free(z0uc);
212 }
QCS::setQCnForce( )
Threads (Time) |
Inst/Cycle per Core |
L1 DC Miss % |
L2 DC Miss % |
L3 Miss % |
L1 Loads/Cycle per Core |
L2 B/W Used |
L3 B/W Used |
DRAM B/W Used |
1 (11.6%) |
1.0 |
5.7% |
13.9% |
0.0% |
0.71 |
14.3% |
2.7% |
0.0% |
72 (4.4%) |
1.1 |
6.3% |
24.8% |
0.6% |
0.66 |
31.1% |
10.7% |
0.7% |
216 void QCS::setQCnForce(
217 const double* c0div,
218 const double* c0du,
219 const double* c0evol,
220 double2* c0qe,
221 const int sfirst,
222 const int slast) {
223
224 const Mesh* mesh = hydro->mesh;
225
226 const double2* pu = hydro->pu;
227 const double* zrp = hydro->zrp;
228 const double* zss = hydro->zss;
229 const double* elen = mesh->elen;
230
231 int cfirst = sfirst;
232 int clast = slast;
233
234 double* c0rmu = Memory::alloc<double>(clast - cfirst);
235
236 const double gammap1 = qgamma + 1.0;
237
238 // [4.1] Compute the c0rmu (real Kurapatenko viscous scalar)
239 // #pragma ivdep
240 for (int c = cfirst; c < clast; ++c) {
241 int c0 = c - cfirst;
242 int z = mesh->mapsz[c];
243
244 // Kurapatenko form of the viscosity
245 double ztmp2 = q2 * 0.25 * gammap1 * c0du[c0];
246 double ztmp1 = q1 * zss[z];
247 double zkur = ztmp2 + sqrt(ztmp2 * ztmp2 + ztmp1 * ztmp1);
248 // Compute c0rmu for each corner
249 double rmu = zkur * zrp[z] * c0evol[c0];
250 c0rmu[c0] = ((c0div[c0] > 0.0) ? 0. : rmu);
251
252 } // for c
253
254 // [4.2] Compute the c0qe for each corner
255 //#pragma ivdep
256 for (int c = cfirst; c < clast; ++c) {
257 int s4 = c;
258 int s = mesh->mapss3[s4];
259 int c0 = c - cfirst;
260 int p = mesh->mapsp2[s];
261 // Associated point and edge 1
262 int p1 = mesh->mapsp1[s];
263 int e1 = mesh->mapse[s];
264 // Associated point and edge 2
265 int p2 = mesh->mapsp2[s4];
266 int e2 = mesh->mapse[s4];
267
268 // Compute: c0qe(1,2,3)=edge 1, y component (2nd), 3rd corner
269 // c0qe(2,1,3)=edge 2, x component (1st)
270 c0qe[2 * c0] = c0rmu[c0] * (pu[p] - pu[p1]) / elen[e1];
271 c0qe[2 * c0 + 1] = c0rmu[c0] * (pu[p2] - pu[p]) / elen[e2];
272
273 } // for s
274
275 Memory::free(c0rmu);
276 }
Mesh::calcCtrs( )
Threads (Time) |
Inst/Cycle per Core |
L1 DC Miss % |
L2 DC Miss % |
L3 Miss % |
L1 Loads/Cycle per Core |
L2 B/W Used |
L3 B/W Used |
DRAM B/W Used |
1 (6.1%) |
1.2 |
7.9% |
21.2% |
24.7% |
0.84 |
25.8% |
7.3% |
3.8% |
72 (7.9%) |
0.44 |
7.6% |
26.1% |
31.9% |
0.23 |
14.7% |
5.3% |
15.9% |
445 void Mesh::calcCtrs(
446 const double2* px,
447 double2* ex,
448 double2* zx,
449 const int sfirst,
450 const int slast) {
451
452 int zfirst = mapsz[sfirst];
453 int zlast = (slast < nums ? mapsz[slast] : numz);
454 fill(&zx[zfirst], &zx[zlast], double2(0., 0.));
455
456 for (int s = sfirst; s < slast; ++s) {
457 int p1 = mapsp1[s];
458 int p2 = mapsp2[s];
459 int e = mapse[s];
460 int z = mapsz[s];
461 ex[e] = 0.5 * (px[p1] + px[p2]);
462 zx[z] += px[p1];
463 }
464
465 for (int z = zfirst; z < zlast; ++z) {
466 zx[z] /= (double) znump[z];
467 }
468
469 }
Mesh::calcVols( )
Threads (Time) |
Inst/Cycle per Core |
L1 DC Miss % |
L2 DC Miss % |
L3 Miss % |
L1 Loads/Cycle per Core |
L2 B/W Used |
L3 B/W Used |
DRAM B/W Used |
1 (7.3%) |
1.7 |
2.9% |
21.1% |
15.9% |
0.82 |
10.6% |
3.0% |
1.0% |
72 (7.6%) |
0.84 |
4.9% |
25.5% |
14.4% |
0.27 |
12.4% |
4.4% |
5.9% |
472 void Mesh::calcVols(
473 const double2* px,
474 const double2* zx,
475 double* sarea,
476 double* svol,
477 double* zarea,
478 double* zvol,
479 const int sfirst,
480 const int slast) {
481
482 int zfirst = mapsz[sfirst];
483 int zlast = (slast < nums ? mapsz[slast] : numz);
484 fill(&zvol[zfirst], &zvol[zlast], 0.);
485 fill(&zarea[zfirst], &zarea[zlast], 0.);
486
487 const double third = 1. / 3.;
488 int count = 0;
489 for (int s = sfirst; s < slast; ++s) {
490 int p1 = mapsp1[s];
491 int p2 = mapsp2[s];
492 int z = mapsz[s];
493
494 // compute side volumes, sum to zone
495 double sa = 0.5 * cross(px[p2] - px[p1], zx[z] - px[p1]);
496 double sv = third * sa * (px[p1].x + px[p2].x + zx[z].x);
497 sarea[s] = sa;
498 svol[s] = sv;
499 zarea[z] += sa;
500 zvol[z] += sv;
501
502 // check for negative side volumes
503 if (sv <= 0.) count += 1;
504
505 } // for s
506
507 if (count > 0) {
508 #pragma omp atomic
509 numsbad += count;
510 }
511
512 }