CoMD – ECP Proxy Applications

From README.md

CoMD is a reference implementation of typical classical molecular dynamics algorithms and workloads. It is created and maintained by ExMatEx: Exascale Co-Design Center for Materials in Extreme Environments (exmatex.org). The code is intended to serve as a vehicle for co-design by allowing others to extend and/or reimplement it as needed to test performance of new architectures, programming models, etc.

Analysis

Parameters

Compiler = icc (ICC) 18.0.1 20171018
Build_Flags = -std=c99 -g -O3 -march=native -qopenmp -DDOUBLE
Run_Parameters = -x 128 -y 128 -z 50

Scaling

Performance Improvement

Threads	2	4	8	16	32	56	112
Speed Up	1.97X	1.92X	1.91X	1.70X	1.58X	1.40X	1.09X

Hit Locations

FLOPS

Double Precision	Scalar	128B Packed	256B Packed	512B Packed	Total FLOPS	GFLOPS/sec
PMU	1.750e+12	9.130e+06	8.810e+07	0.000e+00	1.750e+12	5.327e+01
SDE	1.843e+12	7.252e+06	7.986e+07	0.000e+00	1.843e+12	5.609e+01

Intel Software Development Emulator

Intel SDE	CoMD
Arithmetic Intensity	0.177
FLOPS per Inst	0.56
FLOPS per FP Inst	1.0
Bytes per Load Inst	7.95
Bytes per Store Inst	7.89

@NOTE: Not Vectorized

Roofline – Intel(R) Xeon(R) Platinum 8180M CPU

112 Threads – 56 Cores – 3200.0 MHz

UOPS Executed

@NOTE: Getting use out of the out-of-order (OOO) core pipeline

`Experiment Aggregate Metrics`

Threads (Time)	IPC per Core	Loads per Cycle	L1 Hits per Cycle	L1 Miss Ratio	L2 Miss Ratio	L3 Miss Ratio	L2 B/W Utilized	L3 B/W Utilized	DRAM B/W Utilized
1 (100.0%)	1.46	1.03	1.02	0.14%	49.35%	80.79%	0.50%	2.01%	2.17%
56 (100.0%)	0.85	0.59	0.58	0.12%	47.65%	82.64%	0.26%	2.48%	7.13%
112 (100.0%)	1.07	0.38	0.37	0.15%	38.68%	76.07%	0.43%	3.22%	9.09%

`ljForce`

145 int ljForce(SimFlat* s)
146 {
147   LjPotential* pot = (LjPotential *) s->pot;
148   real_t sigma = pot->sigma;
149   real_t epsilon = pot->epsilon;
150   real_t rCut = pot->cutoff;
151   real_t rCut2 = rCut*rCut;
152
153   // zero forces and energy
154   real_t ePot = 0.0;
155   s->ePotential = 0.0;
156   int fSize = s->boxes->nTotalBoxes*MAXATOMS;
157   #pragma omp parallel for
158   for (int ii=0; ii<fSize; ++ii) 
159   { 
160     zeroReal3(s->atoms->f[ii]);
161     s->atoms->U[ii] = 0.;
162   }
163
164   real_t s6 = sigma*sigma*sigma*sigma*sigma*sigma;
165
166   real_t rCut6 = s6 / (rCut2*rCut2*rCut2);
167   real_t eShift = POT_SHIFT * rCut6 * (rCut6 - 1.0);
168
169   int nNbrBoxes = 27;
170

Threads (Time)	IPC per Core	Loads per Cycle	L1 Hits per Cycle	L1 Miss Ratio	L2 Miss Ratio	L3 Miss Ratio	L2 B/W Utilized	L3 B/W Utilized	DRAM B/W Utilized
1 (95.4%)	1.47	1.04	1.04	0.08%	34.51%	47.56%	0.30%	0.82%	0.87%
56 (58.3%)	1.20	0.84	0.84	0.07%	33.20%	66.07%	0.21%	1.38%	4.06%
112 (49.0%)	1.64	0.58	0.58	0.11%	34.28%	57.56%	0.44%	2.94%	8.06%

171   // loop over local boxes
172   #pragma omp parallel for reduction(+:ePot)
173   for (int iBox=0; iBox<s->boxes->nLocalBoxes; iBox++)
174   {
175     int nIBox = s->boxes->nAtoms[iBox];
176
177     // loop over neighbors of iBox
178     for (int jTmp=0; jTmp<nNbrBoxes; jTmp++) 
179     { 
180       int jBox = s->boxes->nbrBoxes[iBox][jTmp];
181
182       assert(jBox>=0);
183
184       int nJBox = s->boxes->nAtoms[jBox];
185
186       // loop over atoms in iBox
187       for (int iOff=MAXATOMS*iBox; iOff<(iBox*MAXATOMS+nIBox); iOff++)
188       {
189
190         // loop over atoms in jBox
191         for (int jOff=jBox*MAXATOMS; jOff<(jBox*MAXATOMS+nJBox); jOff++)
192         {
193           real3 dr;
194           real_t r2 = 0.0;
195           for (int m=0; m<3; m++) 
196           { 
197             dr[m] = s->atoms->r[iOff][m]-s->atoms->r[jOff][m];
198             r2+=dr[m]*dr[m];
199           }
200
201           if ( r2 <= rCut2 && r2 > 0.0)
202           {
203
204             // Important note:
205             // from this point on r actually refers to 1.0/r
206             r2 = 1.0/r2;
207             real_t r6 = s6 * (r2*r2*r2);
208             real_t eLocal = r6 * (r6 - 1.0) - eShift;
209             s->atoms->U[iOff] += 0.5*eLocal;
210             ePot += 0.5*eLocal;
211
212             // different formulation to avoid sqrt computation
213             real_t fr = - 4.0*epsilon*r6*r2*(12.0*r6 - 6.0);
214             for (int m=0; m<3; m++) 
215             { 
216               s->atoms->f[iOff][m] -= dr[m]*fr;
217             }
218           }
219         } // loop over atoms in jBox
220       } // loop over atoms in iBox
221     } // loop over neighbor boxes
222   } // loop over local boxes in system
223
224   ePot = ePot*4.0*epsilon;
225   s->ePotential = ePot;
226
227   return 0;
228 }