miniGMG

From Website:

miniGMG is a compact benchmark for understanding the performance challenges associated with geometric multigrid solvers found in applications built from AMR MG frameworks like CHOMBO or BoxLib when running on modern multi- and manycore-based supercomputers. It includes both productive reference examples and highly optimized implementations for CPUs and GPUs. It is sufficiently general that it has been used to evaluate a broad range of research topics, including PGAS programming models and algorithmic tradeoffs inherent in multigrid.
Note: the miniGMG code has been superseded by HPGMG.


Problem Size and Run Configuration

./run.miniGMG log2BoxSize                                             \
              [BoxesPerProcess_i BoxesPerProcess_j BoxesPerProcess_k] \
              [Processes_i Processes_j Processes_k]

log2BoxSize = 6 (i.e., 64^3 cells per box) is a good proxy for real applications


Analysis


Build and Run Information

Compiler = icpc (ICC) 18.0.1 20171018
Build Flags = -g -O3 -march=native
Run Parameters = 8  2 2 2  1 1 1
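
These parameters correspond to a single process owning a 2 x 2 x 2 arrangement of boxes with log2BoxSize = 8 (256^3 cells per box), i.e. a launch of the form:

./run.miniGMG 8  2 2 2  1 1 1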

Scaling


Intel Software Development Emulator

SDE Metrics             miniGMG
Arithmetic Intensity    0.13
Bytes per Load Inst     17.94
Bytes per Store Inst    9.31
FLOPS per Inst          1.19
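
For reference, arithmetic intensity here is the usual ratio of floating-point work to data movement (treating SDE's counts as total FLOPs over total bytes is an assumption about its reporting):

$$\mathrm{AI} = \frac{\text{total FLOPs}}{\text{total bytes moved}} \approx 0.13\ \text{FLOPs/byte}$$

An intensity this low sits far below the machine balance of a Xeon-class node, so the relevant roofline ceiling is memory bandwidth rather than peak FLOP/s.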

Roofline – Intel(R) Xeon(R) Platinum 8180M CPU

112 Threads – 56 Cores – 3200.0 MHz


Experiment Aggregate Metrics

Threads (Time)        1 (100.0%)    112 (100.0%)
IPC per Core          0.78          0.71
Loads per Cycle       0.39          0.14
L1 Hits per Cycle     0.33          0.08
L1 Miss Ratio         20.64%        17.80%
L2 Miss Ratio         51.89%        51.04%
L3 Miss Ratio         79.55%        89.96%
L2 B/W Utilized       3.91%         3.36%
L3 B/W Utilized       9.03%         0.00%
DRAM B/W Utilized     0.65%         1.56%

Jacobi

Threads (Time)        1 (68.7%)     112 (50.2%)
IPC per Core          0.75          0.33
Loads per Cycle       0.42          0.08
L1 Hits per Cycle     0.35          0.02
L1 Miss Ratio         21.50%        48.36%
L2 Miss Ratio         50.89%        50.61%
L3 Miss Ratio         71.19%        90.37%
L2 B/W Utilized       4.30%         4.30%
L3 B/W Utilized       9.87%         0.00%
DRAM B/W Utilized     0.31%         1.68%

void smooth(domain_type * domain, int level, int phi_id, int rhs_id, double a, double b){
  if(numSmooths&1){
    printf("error - numSmooths must be even...\n");
    exit(0);
  }

  int CollaborativeThreadingBoxSize = 100000; // i.e. never
  #ifdef __COLLABORATIVE_THREADING
    CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING;
  #endif
  int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i <  CollaborativeThreadingBoxSize);
  int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize);

  int box,s;
  int ghosts = domain->ghosts;
  double TwoThirds = 2.0/3.0;

  // if communication-avoiding, need RHS for stencils in ghost zones
  if(ghosts>1)exchange_boundary(domain,level,rhs_id,1,1,1);

  for(s=0;s<numSmooths;s+=ghosts){
    // Jacobi ping pongs between phi and __temp
    if((s&1)==0)exchange_boundary(domain,level,phi_id,1,ghosts>1,ghosts>1); // corners/edges if doing communication-avoiding...
    else        exchange_boundary(domain,level,__temp,1,ghosts>1,ghosts>1); // corners/edges if doing communication-avoiding...

    // now do ghosts communication-avoiding smooths on each box...
    uint64_t _timeStart = CycleTime();

    #pragma omp parallel for private(box) if(omp_across_boxes)
    for(box=0;box<domain->subdomains_per_rank;box++){
      int i,j,k,ss;
      int pencil = domain->subdomains[box].levels[level].pencil;
      int  plane = domain->subdomains[box].levels[level].plane;
      int ghosts = domain->subdomains[box].levels[level].ghosts;
      int  dim_k = domain->subdomains[box].levels[level].dim.k;
      int  dim_j = domain->subdomains[box].levels[level].dim.j;
      int  dim_i = domain->subdomains[box].levels[level].dim.i;
      double h2inv = 1.0/(domain->h[level]*domain->h[level]);
      double * __restrict__ rhs    = domain->subdomains[box].levels[level].grids[  rhs_id] + ghosts*(1+pencil+plane);
      double * __restrict__ alpha  = domain->subdomains[box].levels[level].grids[__alpha ] + ghosts*(1+pencil+plane);
      double * __restrict__ beta_i = domain->subdomains[box].levels[level].grids[__beta_i] + ghosts*(1+pencil+plane);
      double * __restrict__ beta_j = domain->subdomains[box].levels[level].grids[__beta_j] + ghosts*(1+pencil+plane);
      double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[__beta_k] + ghosts*(1+pencil+plane);
      double * __restrict__ lambda = domain->subdomains[box].levels[level].grids[__lambda] + ghosts*(1+pencil+plane);

      int ghostsToOperateOn=ghosts-1;
      for(ss=s;ss<s+ghosts;ss++,ghostsToOperateOn--){
        double * __restrict__ phi;
        double * __restrict__ phi_new;
        if((ss&1)==0){phi     = domain->subdomains[box].levels[level].grids[phi_id] + ghosts*(1+pencil+plane);
                      phi_new = domain->subdomains[box].levels[level].grids[__temp] + ghosts*(1+pencil+plane);}
                 else{phi     = domain->subdomains[box].levels[level].grids[__temp] + ghosts*(1+pencil+plane);
                      phi_new = domain->subdomains[box].levels[level].grids[phi_id] + ghosts*(1+pencil+plane);}
        #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
        for(k=0-ghostsToOperateOn;k<dim_k+ghostsToOperateOn;k++){
        for(j=0-ghostsToOperateOn;j<dim_j+ghostsToOperateOn;j++){
        for(i=0-ghostsToOperateOn;i<dim_i+ghostsToOperateOn;i++){
          int ijk = i + j*pencil + k*plane;
          double helmholtz =  a*alpha[ijk]*phi[ijk]
                             -b*h2inv*(
                                beta_i[ijk+1     ]*( phi[ijk+1     ]-phi[ijk       ] )
                               -beta_i[ijk       ]*( phi[ijk       ]-phi[ijk-1     ] )
                               +beta_j[ijk+pencil]*( phi[ijk+pencil]-phi[ijk       ] )
                               -beta_j[ijk       ]*( phi[ijk       ]-phi[ijk-pencil] )
                               +beta_k[ijk+plane ]*( phi[ijk+plane ]-phi[ijk       ] )
                               -beta_k[ijk       ]*( phi[ijk       ]-phi[ijk-plane ] )
                              );
          phi_new[ijk] = phi[ijk] - TwoThirds*lambda[ijk]*(helmholtz-rhs[ijk]);
        }}}
      } // ss-loop
    } // box-loop
    domain->cycles.smooth[level] += (uint64_t)(CycleTime()-_timeStart);
  } // s-loop
}
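
In stencil form, the innermost loop above applies one weighted Jacobi sweep with damping factor 2/3; h is the grid spacing at this level, and lambda (read from the __lambda grid) is presumably the inverse diagonal of the operator, which is an interpretation rather than something stated in the listing:

$$
\begin{aligned}
(A\phi)_{i,j,k} ={}& a\,\alpha_{i,j,k}\,\phi_{i,j,k} \\
&- \tfrac{b}{h^2}\big[\;\beta^i_{i+1,j,k}(\phi_{i+1,j,k}-\phi_{i,j,k}) - \beta^i_{i,j,k}(\phi_{i,j,k}-\phi_{i-1,j,k}) \\
&\qquad\; + \beta^j_{i,j+1,k}(\phi_{i,j+1,k}-\phi_{i,j,k}) - \beta^j_{i,j,k}(\phi_{i,j,k}-\phi_{i,j-1,k}) \\
&\qquad\; + \beta^k_{i,j,k+1}(\phi_{i,j,k+1}-\phi_{i,j,k}) - \beta^k_{i,j,k}(\phi_{i,j,k}-\phi_{i,j,k-1})\;\big] \\
\phi^{\mathrm{new}}_{i,j,k} ={}& \phi_{i,j,k} - \tfrac{2}{3}\,\lambda_{i,j,k}\,\big((A\phi)_{i,j,k} - \mathrm{rhs}_{i,j,k}\big)
\end{aligned}
$$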


Residual

Threads (Time)        1 (14.3%)     112 (10.7%)
IPC per Core          0.85          0.35
Loads per Cycle       0.42          0.08
L1 Hits per Cycle     0.34          0.02
L1 Miss Ratio         21.93%        49.80%
L2 Miss Ratio         51.41%        50.00%
L3 Miss Ratio         68.72%        90.58%
L2 B/W Utilized       4.39%         4.24%
L3 B/W Utilized       10.03%        0.00%
DRAM B/W Utilized     0.34%         1.71%

void residual(domain_type * domain, int level, int res_id, int phi_id, int rhs_id, double a, double b){
  // exchange the boundary for x in prep for Ax...
  // for 7-point stencil, only needs to be a 1-deep ghost zone & faces only
  exchange_boundary(domain,level,phi_id,1,0,0);

  // now do residual/restriction proper...
  uint64_t _timeStart = CycleTime();
  int CollaborativeThreadingBoxSize = 100000; // i.e. never
  #ifdef __COLLABORATIVE_THREADING
    CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING;
  #endif
  int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i <  CollaborativeThreadingBoxSize);
  int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize);
  int box;

  #pragma omp parallel for private(box) if(omp_across_boxes)
  for(box=0;box<domain->subdomains_per_rank;box++){
    int i,j,k;
    int pencil = domain->subdomains[box].levels[level].pencil;
    int  plane = domain->subdomains[box].levels[level].plane;
    int ghosts = domain->subdomains[box].levels[level].ghosts;
    int  dim_k = domain->subdomains[box].levels[level].dim.k;
    int  dim_j = domain->subdomains[box].levels[level].dim.j;
    int  dim_i = domain->subdomains[box].levels[level].dim.i;
    double h2inv = 1.0/(domain->h[level]*domain->h[level]);
    // i.e. [0] = first non ghost zone point
    double * __restrict__ phi    = domain->subdomains[box].levels[level].grids[  phi_id] + ghosts*(1+pencil+plane);
    double * __restrict__ rhs    = domain->subdomains[box].levels[level].grids[  rhs_id] + ghosts*(1+pencil+plane);
    double * __restrict__ alpha  = domain->subdomains[box].levels[level].grids[__alpha ] + ghosts*(1+pencil+plane);
    double * __restrict__ beta_i = domain->subdomains[box].levels[level].grids[__beta_i] + ghosts*(1+pencil+plane);
    double * __restrict__ beta_j = domain->subdomains[box].levels[level].grids[__beta_j] + ghosts*(1+pencil+plane);
    double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[__beta_k] + ghosts*(1+pencil+plane);
    double * __restrict__ res    = domain->subdomains[box].levels[level].grids[  res_id] + ghosts*(1+pencil+plane);

    #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
    for(k=0;k<dim_k;k++){
    for(j=0;j<dim_j;j++){
    for(i=0;i<dim_i;i++){
      int ijk = i + j*pencil + k*plane;
      double helmholtz =  a*alpha[ijk]*phi[ijk]
                         -b*h2inv*(
                            beta_i[ijk+1     ]*( phi[ijk+1     ]-phi[ijk       ] )
                           -beta_i[ijk       ]*( phi[ijk       ]-phi[ijk-1     ] )
                           +beta_j[ijk+pencil]*( phi[ijk+pencil]-phi[ijk       ] )
                           -beta_j[ijk       ]*( phi[ijk       ]-phi[ijk-pencil] )
                           +beta_k[ijk+plane ]*( phi[ijk+plane ]-phi[ijk       ] )
                           -beta_k[ijk       ]*( phi[ijk       ]-phi[ijk-plane ] )
                          );
      res[ijk] = rhs[ijk]-helmholtz;
    }}}
  }
  domain->cycles.residual[level] += (uint64_t)(CycleTime()-_timeStart);
}
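
Both kernels use the same flattened box indexing, ijk = i + j*pencil + k*plane, with each grid pointer pre-offset by ghosts*(1+pencil+plane) so that index 0 lands on the first interior point. The sketch below illustrates that layout on a toy box; the sizes and the flat array stand in for miniGMG's actual domain_type structures and are purely hypothetical:

#include <stdio.h>
#include <stdlib.h>

/* Minimal sketch of miniGMG-style box indexing (hypothetical sizes, not the
   real domain_type data structures): a dim^3 interior surrounded by a
   'ghosts'-deep ghost zone, addressed with the pencil/plane strides used in
   smooth() and residual(). */
int main(void){
  int dim = 4, ghosts = 1;
  int pencil = dim + 2*ghosts;          /* stride from (i,j,k) to (i,j+1,k) */
  int plane  = pencil*pencil;           /* stride from (i,j,k) to (i,j,k+1) */
  int volume = pencil*pencil*pencil;    /* total cells including ghost zones */

  double *grid = calloc(volume, sizeof(double));
  if(!grid)return 1;
  /* Offset so index 0 is the first interior cell, mirroring the kernels'
     "+ ghosts*(1+pencil+plane)". */
  double *phi = grid + ghosts*(1 + pencil + plane);

  for(int k=0;k<dim;k++)
  for(int j=0;j<dim;j++)
  for(int i=0;i<dim;i++){
    int ijk = i + j*pencil + k*plane;   /* flattened interior index; the six  */
    phi[ijk] = 100*k + 10*j + i;        /* stencil neighbors are ijk+/-1,     */
  }                                     /* ijk+/-pencil, and ijk+/-plane      */

  printf("pencil=%d plane=%d phi(1,2,3)=%g\n",
         pencil, plane, phi[1 + 2*pencil + 3*plane]);
  free(grid);
  return 0;
}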