miniAero

From Github:

MiniAero is a mini-application for the evaulation of programming models and hardware for next generation platforms. MiniAero is an explicit (using RK4) unstructured finite volume code that solves the compressible Navier-Stokes equations. Both inviscid and viscous terms are included. The viscous terms can be optionally included or excluded.

The three problem types that are accepted in the input file are Sod, Viscous Flat Plate and Inviscid Ramp.

Problem Size

MiniAero reads miniaero.inp from current directory for problem definition.

Analysis

Build and Run Information

Compiler = clang version 7.0.0
Build Flags = -DATOMICS_FLUX -g -march=native -O3
Run Parameters = --kokkos-threads=[#ofThreads]

miniaero.inp:

1 // (Viscous Flat Plate Problem)
2.0 0.002 1.0 0.0 // (Doman max dimensions for x,y,z then ramp angle)
256 512 2 // (Total number of cells in each direction)
400 // (Timesteps)
3e-8 // (Timestep size)
1 // (Yes - Output Results)
100 // (Output frequency)
1 // (Yes - Second order space)
1 // (Yes - Viscous)

Scaling

Intel Software Development Emulator

SDE Metrics	miniAero
Arithmetic Intensity	0.08
Bytes per Load Inst	8.35
Bytes per Store Inst	9.56

Roofline – Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

72 Threads – 36 – Cores 2300.0 Mhz

Experiment Aggregate Metrics

Threads (Time)	IPC per Core	Loads per Cycle	L1 Hits per Cycle	L1 Miss Ratio	L2 Miss Ratio	L3 Miss Ratio	L2 B/W Utilized	L3 B/W Utilized	DRAM B/W Utilized
1 (100.0%)	0.86	0.28	0.38	3.39%	30.81%	59.64%	5.26%	6.49%	1.96%
72 (100.0%)	0.95	0.17	0.23	4.49%	29.64%	41.40%	8.69%	9.39%	2.30%

`limiter_face stencilLimiter.h`

352 /* limiter_face
353 * functor that computes the limiter value for each face and scatter contribution
354 * to the connected elements. Uses gather-sum or atomics for thread safety.
355 */
356 template
357 struct limiter_face{
358   typedef Device device_type;
359   typedef typename ViewTypes::c_rnd_scalar_field_type scalar_field_type;
360   typedef typename ViewTypes::c_rnd_solution_field_type solution_field_type;
361   typedef typename ViewTypes::c_rnd_face_cell_conn_type face_cell_conn_type;
362   typedef typename ViewTypes::c_rnd_vector_field_type vector_field_type;
363   typedef typename ViewTypes::cell_storage_field_type cell_storage_field_type;
364   typedef typename ViewTypes::c_rnd_gradient_field_type gradient_field_type;
365
366   scalar_field_type cell_volumes_;
367   face_cell_conn_type face_cell_conn_;
368   face_cell_conn_type cell_flux_index_;
369   solution_field_type cell_min_, cell_max_, cell_values_;
370   vector_field_type face_coordinates_, cell_coordinates_;
371   gradient_field_type cell_gradients_;
372   cell_storage_field_type limiter_;
373   Kokkos::View permute_vector_;
374
375   limiter_face(Faces faces, solution_field_type cell_values, Cells cells,
376     gradient_field_type gradients,
377     solution_field_type cell_min, solution_field_type cell_max,
        cell_storage_field_type limiter):
378     face_cell_conn_(faces.face_cell_conn_),
379     cell_flux_index_(faces.cell_flux_index_),
380     cell_min_(cell_min),
381     cell_max_(cell_max),
382     cell_values_(cell_values),
383     face_coordinates_(faces.coordinates_),
384     cell_coordinates_(cells.coordinates_),
385     cell_gradients_(gradients),
386     limiter_(limiter),
387     permute_vector_(faces.permute_vector_)
388   {}
389

Threads (Time)	IPC per Core	Loads per Cycle	L1 Hits per Cycle	L1 Miss Ratio	L2 Miss Ratio	L3 Miss Ratio	L2 B/W Utilized	L3 B/W Utilized	DRAM B/W Utilized
1 (13.7%)	0.71	0.29	0.35	3.41%	34.77%	51.51%	4.31%	5.78%	2.53%
72 (10.6%)	0.64	0.12	0.14	8.50%	38.18%	44.97%	9.13%	10.46%	5.73%

390 KOKKOS_INLINE_FUNCTION
391 void operator()( const int& ii )const{
392   const int i = permute_vector_(ii);
393   const int left_index = face_cell_conn_(i,0);
394   const int right_index = face_cell_conn_(i,1);
395
396   double conservatives_l[5];
397   double conservatives_r[5];
398   double primitives_l[5];
399   double primitives_r[5];
400
401   for (int icomp = 0; icomp < 5; ++icomp)
402   {
403     if(interior){
404       conservatives_l[icomp] = cell_values_(left_index,icomp);
405       conservatives_r[icomp] = cell_values_(right_index,icomp);
406     }
407     else{
408       conservatives_l[icomp] = cell_values_(left_index,icomp);
409     }
410   }
411
412   if(interior){
413     ComputePrimitives(conservatives_l, primitives_l);
414     ComputePrimitives(conservatives_r, primitives_r);
415   }
416   else{
417     ComputePrimitives(conservatives_l, primitives_l);
418   }
419
420 //Compute left limiter value and compute right limiter value
421
422   double limiter_left[5], limiter_right[5];
423   //compute displacement and distance from cell center to face center.
424   double displacement_l[3];
425   double displacement_r[3];
426   double distance_l = 0;
427   double distance_r = 0;
428   for(int idir = 0; idir < 3; ++idir){
429     displacement_l[idir] = face_coordinates_(i, idir)
                               -cell_coordinates_(left_index, idir);
430     distance_l += displacement_l[idir]*displacement_l[idir];
431     if(interior){
432       displacement_r[idir] = face_coordinates_(i, idir)
                                 -cell_coordinates_(right_index, idir);
433       distance_r += displacement_r[idir]*displacement_r[idir];
434     }
435   }
436
437   double dU_l[5];
438   double dU_r[5];
439   //Extrapolation
440   for(int icomp = 0; icomp < 5; ++icomp){
441     dU_l[icomp] = 0;
442     dU_r[icomp] = 0;
443     for(int idir = 0; idir < 3; ++idir){
444       dU_l[icomp] += displacement_l[idir]*cell_gradients_(left_index, icomp, idir);
445       if(interior)
446         dU_r[icomp] += displacement_r[idir]*cell_gradients_(right_index, icomp, idir);
447     }
448   }
449
450
451   for(int icomp = 0; icomp < 5; ++icomp){
452     double dumax_l = cell_max_(left_index, icomp) - primitives_l[icomp];
453     double dumin_l = cell_min_(left_index, icomp) - primitives_l[icomp];
454
455     limiter_left[icomp] = VenkatLimiter::limit(dumax_l, dumin_l,
                                                   dU_l[icomp], distance_l);
456     if(interior){
457       double dumax_r = cell_max_(right_index, icomp) - primitives_r[icomp];
458       double dumin_r = cell_min_(right_index, icomp) - primitives_r[icomp];
459       limiter_right[icomp] = VenkatLimiter::limit(dumax_r, dumin_r,
                                                      dU_r[icomp], distance_r);
460     }
461   }
462
463 //Then write to memory
464 #ifdef ATOMICS_FLUX
465   for (int icomp = 0; icomp < 5; ++icomp)
466   {
467     double * left_cell_limiter = &limiter_(left_index,0,icomp);
468     bool success=false;
469     do{
470       double old_left_limiter = *left_cell_limiter;
471       double new_left_limiter = MathTools::min(*left_cell_limiter,         
                                                   limiter_left[icomp]);
472       double new_value = Kokkos::atomic_compare_exchange(left_cell_limiter,
                                           old_left_limiter, new_left_limiter);
473       success = new_value == new_left_limiter;
474     } while(!success);
475
476     if(interior){
477       double * right_cell_limiter = &limiter_(right_index,0,icomp);
478       success=false;
479       do{
480         double old_right_limiter = *right_cell_limiter;
481         double new_right_limiter = MathTools::min(*right_cell_limiter,
                                                      limiter_right[icomp]);
482         double new_value = Kokkos::atomic_compare_exchange(right_cell_limiter,
                                             old_right_limiter, new_right_limiter);
483         success = new_value == new_right_limiter;
484       } while(!success);
485     }
486   }
487 #endif
488
489 #ifdef CELL_FLUX
490 for (int icomp = 0; icomp < 5; ++icomp)
491 {
492   limiter_(left_index, cell_flux_index_(i,0), icomp) = limiter_left[icomp];
493
494   if(interior){
495     limiter_(right_index, cell_flux_index_(i,1), icomp) = limiter_right[icomp];
496   }
497 }
498 #endif
499 }

`compute_face_flux flux.h`

43 /* compute_face_flux
44  * functor to compute the internal face flux contributions.
45  * Uses the templated Inviscid and Inviscid flux types to
46  * compute the contribution. This functor organizes
47  * the data to pass to the functions that compute the flux
48  * and puts the flux contribution in the appropriate place
49  * using either Gather-Sum or Atomics for thread safety.
50 */
51
52 template<class Device, bool second_order, class InviscidFluxType,
53     class ViscousFluxType>
54 struct compute_face_flux {
55   typedef Device device_type;
56   typedef typename ViewTypes::c_rnd_solution_field_type solution_field_type;
57   typedef typename ViewTypes::c_rnd_face_cell_conn_type face_cell_conn_type;
58   typedef typename ViewTypes::cell_storage_field_type cell_storage_field_type;
59   typedef typename ViewTypes::c_vector_field_type vector_field_type;
60   typedef typename ViewTypes::c_rnd_gradient_field_type gradient_field_type;
61
62   face_cell_conn_type face_cell_conn_;
63   face_cell_conn_type cell_flux_index_;
64   solution_field_type cell_values_;
65   gradient_field_type cell_gradients_;
66   solution_field_type cell_limiters_;
67   vector_field_type cell_coordinates_;
68   cell_storage_field_type cell_flux_;
69   vector_field_type face_coordinates_, face_normal_, face_tangent_,
70       face_binormal_;
71   Kokkos::View permute_vector_;
72   InviscidFluxType inviscid_flux_evaluator_;
73   ViscousFluxType viscous_flux_evaluator_;
74
75
76   compute_face_flux(Faces faces, solution_field_type cell_values,
77     gradient_field_type cell_gradients, solution_field_type cell_limiters,
78     Cells cells, InviscidFluxType inviscid_flux,
79     ViscousFluxType viscous_flux) :
80     face_cell_conn_(faces.face_cell_conn_), cell_flux_index_(
81       faces.cell_flux_index_), cell_values_(cell_values), cell_gradients_(
82       cell_gradients), cell_limiters_(cell_limiters), cell_coordinates_(
83       cells.coordinates_), cell_flux_(cells.cell_flux_), face_coordinates_(
84       faces.coordinates_), face_normal_(faces.face_normal_), face_tangent_(
85       faces.face_tangent_), face_binormal_(faces.face_binormal_),
         inviscid_flux_evaluator_(
86       inviscid_flux), viscous_flux_evaluator_(viscous_flux) ,
         permute_vector_(faces.permute_vector_) {
87   }
88

Threads (Time)	IPC per Core	Loads per Cycle	L1 Hits per Cycle	L1 Miss Ratio	L2 Miss Ratio	L3 Miss Ratio	L2 B/W Utilized	L3 B/W Utilized	DRAM B/W Utilized
1 (28.5%)	1.05	0.40	0.55	2.01%	36.06%	68.69%	3.59%	4.76%	3.41%
72 (20.8%)	1.00	0.19	0.25	4.60%	34.48%	65.50%	7.70%	9.18%	5.62%

 89   KOKKOS_INLINE_FUNCTION
 90   void operator()(const int& ii) const {
 91     const int i = permute_vector_(ii);
 92     const int left_index = face_cell_conn_(i, 0);
 93     const int right_index = face_cell_conn_(i, 1);
 94
 95     double flux[5];
 96     double conservatives_l[5];
 97     double conservatives_r[5];
 98     double primitives_l[5];
 99     double primitives_r[5];
100
101     for (int icomp = 0; icomp < 5; ++icomp) {
102       conservatives_l[icomp] = cell_values_(left_index, icomp);
103       conservatives_r[icomp] = cell_values_(right_index, icomp);
104     }
105
106     ComputePrimitives(conservatives_l, primitives_l);
107     ComputePrimitives(conservatives_r, primitives_r);
108
109     if (second_order) {
110
111       //Extrapolation
112       for (int icomp = 0; icomp < 5; ++icomp) {
113         double gradient_primitive_l_tmp = 0;
114         double gradient_primitive_r_tmp = 0;
115
116         for (int idir = 0; idir < 3; ++idir) {
117           gradient_primitive_l_tmp += (face_coordinates_(i, idir)
118                                      - cell_coordinates_(left_index, idir))
119                                      * cell_gradients_(left_index, icomp, idir);
120
121           gradient_primitive_r_tmp += (face_coordinates_(i, idir)
122                                      - cell_coordinates_(right_index, idir))
123                                      * cell_gradients_(right_index, icomp, idir);
124         }
125
126         primitives_l[icomp] += gradient_primitive_l_tmp *
127         cell_limiters_(left_index, icomp);
128         primitives_r[icomp] += gradient_primitive_r_tmp *
129         cell_limiters_(right_index, icomp);
130       }
131
132     } // End of second order
133
134
135     inviscid_flux_evaluator_.compute_flux(primitives_l, primitives_r, flux,
136              &face_normal_(i,0), &face_tangent_(i,0), &face_binormal_(i,0));
137
138     if (ViscousFluxType::isViscous) {
139       double primitives_face[5];
140       double gradients_face[5][3];
141
142       for (int icomp = 0; icomp < 5; ++icomp) {
143         primitives_face[icomp] = 0.5
144             * (primitives_l[icomp] + primitives_r[icomp]);
145
146         for (int idir = 0; idir < 3; ++idir) {
147           gradients_face[icomp][idir] = 0.5
148               * (cell_gradients_(left_index, icomp, idir)
149                   + cell_gradients_(right_index, icomp, idir));
150         }
151       }
152
153       double vflux[5];
154       viscous_flux_evaluator_.compute_flux(gradients_face, primitives_face,
155                                            &face_normal_(i,0), vflux);
156
157       for (int icomp = 0; icomp < 5; ++icomp) {
158         flux[icomp] -= vflux[icomp];
159       }
160     }
161
162 #ifdef ATOMICS_FLUX
163     for (int icomp = 0; icomp < 5; ++icomp)
164     {
165       double * left_cell = &cell_flux_(left_index,0,icomp);
166       Kokkos::atomic_add(left_cell, -flux[icomp]);
167       double * right_cell = &cell_flux_(right_index,0,icomp);
168       Kokkos::atomic_add(right_cell, flux[icomp]);
169     }
170 #endif
171
172 #ifdef CELL_FLUX
173     for (int icomp = 0; icomp < 5; ++icomp)
174     {
175       cell_flux_(left_index,cell_flux_index_(i,0),icomp) = -flux[icomp];
176       cell_flux_(right_index,cell_flux_index_(i,1),icomp) = flux[icomp];
177     }
178 #endif
179
180   }
181
182 };

`green_guass_face GreenGauss.h`

 47 /*green_gauss_face
 48  * functor to compute internal face contributions for Green-Gauss gradient        
       computation.
 49  */
 50
 51 template
 52 struct green_gauss_face{
 53   typedef Device device_type;
 54   typedef typename ViewTypes::c_rnd_scalar_field_type scalar_field_type;
 55   typedef typename ViewTypes::c_rnd_solution_field_type solution_field_type;
 56   typedef typename ViewTypes::c_rnd_face_cell_conn_type face_cell_conn_type;
 57   typedef typename ViewTypes::c_rnd_vector_field_type vector_field_type;
 58   typedef typename ViewTypes::gradient_storage_field_type
                                  gradient_storage_field_type;
 59
 60   scalar_field_type cell_volumes_;
 61   face_cell_conn_type face_cell_conn_;
 62   face_cell_conn_type cell_flux_index_;
 63   solution_field_type cell_values_;
 64   gradient_storage_field_type cell_gradient_;
 65   vector_field_type face_normal_;
 66   Kokkos::View permute_vector_;
 67
 68   green_gauss_face(Faces faces, solution_field_type cell_values,
                       Cells cells):
 69     cell_volumes_(cells.volumes_),
 70     face_cell_conn_(faces.face_cell_conn_),
 71     cell_flux_index_(faces.cell_flux_index_),
 72     cell_values_(cell_values),
 73     cell_gradient_(cells.cell_gradient_),
 74     face_normal_(faces.face_normal_),
 75     permute_vector_(faces.permute_vector_)
 76   {}
 77
 78 KOKKOS_INLINE_FUNCTION
 79 void operator()( const int& ii )const{
 80   const int i = permute_vector_(ii);
 81   const int left_index = face_cell_conn_(i,0);
 82   const int right_index = face_cell_conn_(i,1);
 83
 84   const double gamma = 1.4;
 85   const double Rgas = 287.05;
 86
 87   const double left_r = cell_values_(left_index, 0);
 88   const double left_ri = 1.0 / left_r;
 89   const double left_u = cell_values_(left_index, 1) * left_ri;
 90   const double left_v = cell_values_(left_index, 2) * left_ri;
 91   const double left_w = cell_values_(left_index, 3) * left_ri;
 92   const double left_k = 0.5 * (left_u * left_u + left_v * left_v + left_w * left_w);
 93   const double left_e = cell_values_(left_index, 4) * left_ri - left_k;
 94   const double left_T = left_e * (gamma - 1.0) / Rgas;
 95
 96   const double primitives_l[5] = { left_r, left_u, left_v, left_w, left_T };
 97
 98   const double right_r = cell_values_(right_index, 0);
 99   const double right_ri = 1.0 / right_r;
100   const double right_u = cell_values_(right_index, 1) * right_ri;
101   const double right_v = cell_values_(right_index, 2) * right_ri;
102   const double right_w = cell_values_(right_index, 3) * right_ri;
103   const double right_k = 0.5 * (right_u * right_u + right_v * right_v
                                    + right_w * right_w);
104   const double right_e = cell_values_(right_index, 4) * right_ri - right_k;
105   const double right_T = right_e * (gamma - 1.0) / Rgas;
106
107   const double primitives_r[5] = { right_r, right_u, right_v, right_w, right_T };
108
109   const double cell_vol_left = cell_volumes_(left_index);
110   const double cell_vol_right = cell_volumes_(right_index);
111   const int cell_ind_0 = cell_flux_index_(i,0);
112   const int cell_ind_1 = cell_flux_index_(i,1);
113

Threads (Time)	IPC per Core	Loads per Cycle	L1 Hits per Cycle	L1 Miss Ratio	L2 Miss Ratio	L3 Miss Ratio	L2 B/W Utilized	L3 B/W Utilized	DRAM B/W Utilized
1 (18.3%)	0.78	0.20	0.24	2.48%	37.27%	26.61%	2.30%	3.04%	0.46%
72 (12.3%)	0.81	0.11	0.13	4.55%	31.87%	19.42%	5.28%	5.32%	0.43%

114   for(int idir = 0; idir < 3; ++idir)
115   {
116     const double face_norm = face_normal_(i,idir);
117
118     for(int icomp = 0; icomp < 5; ++icomp) {
119       const double gradient = 0.5*(primitives_l[icomp]+primitives_r[icomp])
                                  *face_norm;
120
121 #ifdef ATOMICS_FLUX
122       double * left_cell = &cell_gradient_(left_index,0,icomp,idir);
123       Kokkos::atomic_add(left_cell, gradient/cell_vol_left);
124
125       double * right_cell = &cell_gradient_(right_index,0,icomp,idir);
126       Kokkos::atomic_add(right_cell, -gradient/cell_vol_right);
127 #endif
128
129 #ifdef CELL_FLUX
130       cell_gradient_(left_index,cell_ind_0,icomp, idir) = gradient
                                                             /cell_vol_left;
131       cell_gradient_(right_index,cell_ind_1,icomp, idir) = -gradient
                                                              /cell_vol_right;
132 #endif
133     }
134   }
135 }
136
137 };

`min_max_face StencilLimiter.h (52)`

 52 /* min_max_face
 53  * functor to compute the minimum and maximum value at each face
 54  * and scatters to the 2 connected elements.
 55 */
 56 template <class Device, bool interior>
 57 struct min_max_face{
 58   typedef Device device_type;
 59   typedef typename ViewTypes::c_rnd_scalar_field_type scalar_field_type;
 60   typedef typename ViewTypes::c_rnd_solution_field_type solution_field_type;
 61   typedef typename ViewTypes::c_rnd_face_cell_conn_type face_cell_conn_type;
 62   typedef typename ViewTypes::c_rnd_vector_field_type vector_field_type;
 63   typedef typename ViewTypes::cell_storage_field_type cell_storage_field_type;
 64
 65   scalar_field_type cell_volumes_;
 66   face_cell_conn_type face_cell_conn_;
 67   face_cell_conn_type cell_flux_index_;
 68   solution_field_type cell_values_;
 69   vector_field_type face_normal_;
 70   cell_storage_field_type stencil_min_, stencil_max_;
 71   Kokkos::View permute_vector_;
 72
 73   min_max_face(Faces faces, solution_field_type cell_values,
                   Cells cells,
 74     cell_storage_field_type stencil_min, cell_storage_field_type stencil_max):
 75     face_cell_conn_(faces.face_cell_conn_),
 76     cell_flux_index_(faces.cell_flux_index_),
 77     cell_values_(cell_values),
 78     stencil_min_(stencil_min),
 79     stencil_max_(stencil_max),
 80     permute_vector_(faces.permute_vector_)
 81   {}
 82
 83 KOKKOS_INLINE_FUNCTION
 84 void operator()( const int& ii )const{
 85   const int i = permute_vector_(ii);
 86
 87   const int left_index = face_cell_conn_(i,0);
 88   const int right_index = face_cell_conn_(i,1);
 89
 90   double primitives_l[5];
 91   double primitives_r[5];
 92
 93   const double gamma = 1.4;
 94   const double Rgas = 287.05;
 95
 96   if(interior) {
 97     double r = cell_values_(left_index, 0);
 98     double ri = 1.0 / r;
 99     double u = cell_values_(left_index, 1) * ri;
100     double v = cell_values_(left_index, 2) * ri;
101     double w = cell_values_(left_index, 3) * ri;
102     double k = 0.5 * (u * u + v * v + w * w);
103     double e = cell_values_(left_index, 4) * ri - k;
104     double T = e * (gamma - 1.0) / Rgas;
105
106     primitives_l[0] = r;
107     primitives_l[1] = u;
108     primitives_l[2] = v;
109     primitives_l[3] = w;
110     primitives_l[4] = T;
111
112     r = cell_values_(right_index, 0);
113     ri = 1.0 / r;
114     u = cell_values_(right_index, 1) * ri;
115     v = cell_values_(right_index, 2) * ri;
116     w = cell_values_(right_index, 3) * ri;
117     k = 0.5 * (u * u + v * v + w * w);
118     e = cell_values_(right_index, 4) * ri - k;
119     T = e * (gamma - 1.0) / Rgas;
120
121     primitives_r[0] = r;
122     primitives_r[1] = u;
123     primitives_r[2] = v;
124     primitives_r[3] = w;
125     primitives_r[4] = T;
126   } else {
127     const double r = cell_values_(left_index, 0);
128     const double ri = 1.0 / r;
129     const double u = cell_values_(left_index, 1) * ri;
130     const double v = cell_values_(left_index, 2) * ri;
131     const double w = cell_values_(left_index, 3) * ri;
132     const double k = 0.5 * (u * u + v * v + w * w);
133     const double e = cell_values_(left_index, 4) * ri - k;
134     const double T = e * (gamma - 1.0) / Rgas;
135
136     primitives_l[0] = r;
137     primitives_l[1] = u;
138     primitives_l[2] = v;
139     primitives_l[3] = w;
140     primitives_l[4] = T;
141   }
142
143   const int cell_ind_0 = cell_flux_index_(i,0);
144   const int cell_ind_1 = cell_flux_index_(i,1);
145

Threads (Time)	IPC per Core	Loads per Cycle	L1 Hits per Cycle	L1 Miss Ratio	L2 Miss Ratio	L3 Miss Ratio	L2 B/W Utilized	L3 B/W Utilized	DRAM B/W Utilized
1 (16.2%)	0.53	0.20	0.18	4.69%	28.41%	5.09%	3.60%	4.63%	0.05%
72 (7.9%)	0.59	0.08	0.09	9.48%	34.69%	10.68%	8.94%	9.23%	0.30%

146   for (int icomp = 0; icomp < 5; ++icomp)
147   {
148     const double face_min = interior ? STENCIL_MIN(
           primitives_r[icomp], primitives_l[icomp]) : primitives_l[icomp];
149     const double face_max = interior ? STENCIL_MAX(
           primitives_r[icomp], primitives_l[icomp]) : primitives_l[icomp];
150
151 #ifdef ATOMICS_FLUX
152     //Need compare and exhange here instead of atomic add
153
154     double * left_cell_min = &stencil_min_(left_index,0,icomp);
155     bool success=false;
156     do{
157       double old_left_min = *left_cell_min;
158       double new_left_min = MathTools::min(*left_cell_min, face_min);
159       double new_value = Kokkos::atomic_compare_exchange(
                                left_cell_min, old_left_min, new_left_min);
160       success = new_value == new_left_min;
161     } while(!success);
162     double * left_cell_max = &stencil_max_(left_index,0,icomp);
163     success=false;
164     do{
165       double old_left_max = *left_cell_max;
166       double new_left_max = MathTools::max(*left_cell_max, face_max);
167       double new_value = Kokkos::atomic_compare_exchange(
                                left_cell_max, old_left_max, new_left_max);
168       success = new_value == new_left_max;
169     } while(!success);
170
171     if(interior){
172       double * right_cell_min = &stencil_min_(right_index,0,icomp);
173       success=false;
174       do{
175         double old_right_min = *right_cell_min;
176         double new_right_min = MathTools::min(*right_cell_min, face_min);
177         double new_value = Kokkos::atomic_compare_exchange(
                                 right_cell_min, old_right_min, new_right_min);
178         success = new_value == new_right_min;
179       } while(!success);
180       double * right_cell_max = &stencil_max_(right_index,0,icomp);
181       success=false;
182       do{
183         double old_right_max = *right_cell_max;
184         double new_right_max = MathTools::max(*right_cell_max, face_max);
185         double new_value = Kokkos::atomic_compare_exchange(
                                 right_cell_max, old_right_max, new_right_max);
186         success = new_value == new_right_max;
187       } while(!success);
188     }
189 #endif
190
191 #ifdef CELL_FLUX
192     stencil_min_(left_index, cell_ind_0, icomp) = face_min;
193     stencil_max_(left_index, cell_ind_0, icomp) = face_max;
194
195     if(interior){
196       stencil_min_(right_index, cell_ind_1, icomp) = face_min;
197       stencil_max_(right_index, cell_ind_1, icomp) = face_max;
198     }
199 #endif
200   }
201 }

`compute_tangentBC_flux Tangent_BC.h`

 43 /* compute_tangentBC_flux
 44  * functor to compute the contribution of an tangent boundary condition
 45  * state is set such that the normal velocity at the boundary is zero
 46  */
 47 template <class Device, class FluxType>
 48 struct compute_tangentBC_flux {
 49   typedef Device device_type;
 50   typedef typename ViewTypes::c_rnd_solution_field_type solution_field_type;
 51   typedef typename ViewTypes::c_rnd_face_cell_conn_type face_cell_conn_type;
 52   typedef typename ViewTypes::c_vector_field_type vector_field_type;
 53   typedef typename ViewTypes::cell_storage_field_type cell_storage_field_type;
 54
 55   face_cell_conn_type face_cell_conn_;
 56   face_cell_conn_type cell_flux_index_;
 57   solution_field_type cell_values_;
 58   cell_storage_field_type cell_flux_;
 59   vector_field_type face_normal_, face_tangent_, face_binormal_;
 60   FluxType flux_evaluator_;
 61
 62   compute_tangentBC_flux(Faces faces, solution_field_type cell_values,
 63     Cells cells, FluxType flux) :
 64     face_cell_conn_(faces.face_cell_conn_), cell_flux_index_(
 65       faces.cell_flux_index_), cell_values_(cell_values), cell_flux_(
 66       cells.cell_flux_), face_normal_(faces.face_normal_), face_tangent_(
 67       faces.face_tangent_), face_binormal_(faces.face_binormal_), flux_evaluator_(
 68       flux) {
 69   }
 70

Threads (Time)	IPC per Core	Loads per Cycle	L1 Hits per Cycle	L1 Miss Ratio	L2 Miss Ratio	L3 Miss Ratio	L2 B/W Utilized	L3 B/W Utilized	DRAM B/W Utilized
1 (4.6%)	1.61	0.52	0.76	1.02%	23.37%	33.88%	3.15%	3.35%	0.01%
72 (2.5%)	2.13	0.34	0.50	2.04%	27.46%	11.88%	7.66%	8.67%	0.09%

 71   KOKKOS_INLINE_FUNCTION
 72   void operator()(int i) const {
 73     int index = face_cell_conn_(i, 0);
 74
 75     double flux[5];
 76     double conservatives[5];
 77     double primitives_r[5];
 78     double primitives_l[5];
 79
 80     for (int icomp = 0; icomp < 5; ++icomp) {
 81       conservatives[icomp] = cell_values_(index, icomp);
 82     }
 83
 84     ComputePrimitives(conservatives, primitives_l);
 85
 86     //scale normal since it includes area.
 87     double area_norm = 0;
 88     for (int icomp = 0; icomp < 3; ++icomp) {
 89       area_norm += face_normal_(i, icomp) * face_normal_(i, icomp);
 90     }
 91     area_norm = std::sqrt(area_norm);
 92
 93     double uboundary = 0.0;
 94     uboundary += primitives_l[1] * face_normal_(i, 0) / area_norm;
 95     uboundary += primitives_l[2] * face_normal_(i, 1) / area_norm;
 96     uboundary += primitives_l[3] * face_normal_(i, 2) / area_norm;
 97
 98     primitives_r[0] = primitives_l[0]; 
 99     primitives_r[1] = primitives_l[1] - 2 * uboundary * face_normal_(i, 0)
                          / area_norm;
100     primitives_r[2] = primitives_l[2] - 2 * uboundary * face_normal_(i, 1)
                          / area_norm;
101     primitives_r[3] = primitives_l[3] - 2 * uboundary * face_normal_(i, 2)
                          / area_norm;
102     primitives_r[4] = primitives_l[4];
103
104     flux_evaluator_.compute_flux(primitives_l, primitives_r, flux,             
                                     &face_normal_(i,0),
105                                  &face_tangent_(i,0), &face_binormal_(i,0));
106
107 #ifdef ATOMICS_FLUX
108     for (int icomp = 0; icomp < 5; ++icomp)
109     {
110       double * cell = &cell_flux_(index,0,icomp);
111       Kokkos::atomic_add(cell, -flux[icomp]);
112     }
113 #endif
114
115 #ifdef CELL_FLUX
116     for (int icomp = 0; icomp < 5; ++icomp)
117     {
118       cell_flux_(index,cell_flux_index_(i,0),icomp) = -flux[icomp];
119     }
120 #endif
121
122   }
123 };