From README.txt:
This program emulates the inversion of A in A*x=S on a single-node architecture. This operation was identified as the bulk of the floating-point work and the largest memory bottleneck in deterministic neutronics.
This mini-app has three solution modes:
- The AVE-1 scheme is the standard matrix-free matrix-vector application, in which the Gaussian integration is performed on the fly.
- The AVE-2 scheme is a variant of AVE-1 that stages intermediate vectors to avoid redundant work.
- The AVE-3 scheme uses a conventional assembled matrix in compressed sparse row (CSR) storage.
To date, AVE-3 consistently outperforms AVE-1 and AVE-2: fully assembling the matrix and applying it is more efficient than performing the Gaussian-integration solve on the fly.
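For intuition, the matrix-free schemes amount to an element-loop apply: each application of A revisits every element, rebuilds its 4x4 vertex coupling on the fly from quadrature data, and scatters into the global vector. The toy below is a minimal scalar sketch of that pattern (fabricated element values, no angle dimension; it is not SNbone's kernel, which appears in the listings later in this document):

```c
#include <stdio.h>

// Element-wise (matrix-free) apply: y += A*x without ever storing A.
// Toy scalar version; the real kernels rebuild each entry from shape
// functions, derivatives, and quadrature weights at this point.
static void apply_element_wise(int ne, int conn[][4],
                               const double *x, double *y) {
  for (int e = 0; e < ne; e++)
    for (int v = 0; v < 4; v++)
      for (int vv = 0; vv < 4; vv++) {
        double a = (v == vv) ? 2.0 : -0.25;  // stand-in for the integrated entry
        y[conn[e][v]] += a * x[conn[e][vv]];
      }
}

int main(void) {
  int conn[2][4] = {{0,1,2,3}, {1,2,3,4}};  // two tets sharing a face
  double x[5] = {1,1,1,1,1}, y[5] = {0};
  apply_element_wise(2, conn, x, y);
  for (int i = 0; i < 5; i++) printf("y[%d] = %g\n", i, y[i]);
  return 0;
}
```

AVE-3 instead assembles these element contributions once into CSR arrays, so each subsequent apply is a plain sparse matrix-vector product, trading the integration flops for streaming memory accesses.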
Build and Problem Configuration
From README.txt:
The code is broken into three steps:
- Create the unstructured mesh. It has a regular stencil, so the matrix bandwidth is not as severe as that of a conventional finite-element mesh. A linear tetrahedral finite-element mesh from a conventional mesh generator can, in principle, be supplied instead. Ask for details.
- Process the mesh with respect to the maximum number of threads you want to investigate. The mesh is reordered by element and vertex so that the operator can be applied in n*thread independent steps (see the coloring sketch below).
- Run the mini-app (Fortran or C version) to measure the overall performance on a particular architecture.
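The point of the reordering is a coloring: within one color, the element ranges handed to different threads touch disjoint vertices, so each thread can scatter into the result vector without atomics, and a barrier separates colors. A sketch of that execution pattern (the start_end array and do_elements callback are hypothetical stand-ins for ThreadWiseWork and the element kernel in the listings below):

```c
#include <omp.h>

// start_end[c*num_threads + t] holds thread t's inclusive element range
// for color c. Within a color, no two threads' elements share a vertex.
static void apply_colored(int num_colors, int num_threads,
                          const int (*start_end)[2],
                          void (*do_elements)(int e0, int e1)) {
  #pragma omp parallel num_threads(num_threads)
  {
    int t = omp_get_thread_num();
    for (int c = 0; c < num_colors; c++) {
      const int *r = start_end[c*num_threads + t];
      do_elements(r[0], r[1]);   // race-free scatter within this color
      #pragma omp barrier        // finish the color before starting the next
    }
  }
}
```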
Analysis
Build and Run Information
Compiler = icpc (ICC) 18.0.1 20171018
Build_Flags = -g -O3 -march=native -DUSEMETIS
Run_Parameters = 0 100 30 32 1
Setup Parameters
makemesh.x 10 10 10 0
processmesh.x 1 1
Run with 1 Thread on 1 Node
Intel Software Development Emulator
| SDE Metrics | SNbone |
|---|---|
| Arithmetic Intensity | 0.13 |
| Bytes per Load Inst | 7.98 |
| Bytes per Store Inst | 6.95 |
Roofline – Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
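A quick machine-balance check puts that intensity in context, assuming nominal figures for this 18-core part (16 double-precision flops/cycle/core with AVX2 FMA and roughly 68 GB/s of peak DRAM bandwidth; both are datasheet-style assumptions, not measured here):

$$\frac{F_{\text{peak}}}{B_{\text{peak}}} \approx \frac{18 \times 2.3\,\text{GHz} \times 16\,\text{flop/cycle}}{68\,\text{GB/s}} \approx 9.7\ \text{flop/byte}.$$

An arithmetic intensity of 0.13 flop/byte sits far below that ridge point, so the code is nominally bandwidth-limited; the per-kernel tables below show, however, that almost none of the traffic reaches DRAM, so the working set is largely served from the cache hierarchy.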
Experiment Aggregate Metrics
| IPC per Core | Loads per Cycle | L1 Hits per Cycle | L1 Miss Ratio | L2 Miss Ratio | L3 Miss Ratio | L2 B/W Utilized | L3 B/W Utilized | DRAM B/W Utilized |
|---|---|---|---|---|---|---|---|---|
| 2.10 | 1.13 | 1.20 | 0.27% | 30.60% | 24.30% | 1.26% | 1.33% | 0.30% |
ApplyA_AVE1_Tet_SUPG
Metrics attributed to ApplyA_AVE1_Tet_SUPG:

| Time | IPC per Core | Loads per Cycle | L1 Hits per Cycle | L1 Miss Ratio | L2 Miss Ratio | L3 Miss Ratio | L2 B/W Utilized | L3 B/W Utilized | DRAM B/W Utilized |
|---|---|---|---|---|---|---|---|---|---|
| 51.4% | 2.09 | 1.48 | 1.52 | 0.05% | 22.54% | 38.01% | 0.27% | 0.18% | 0.01% |

```c
// Main subroutine header
void ApplyA_AVE1_Tet_SUPG(int *NE, int *NA, int *NV, int *C,
                          int *AS_NumColors, int *AS_NumThreads,
                          int *TasksPerThread, int *MyThreadID,
                          int *ThreadWiseWork,
                          double *CF, double *CU, double *CP,
                          double *FES, double *FED, double *FEW,
                          double *OM, double *OO,
                          double *LHS, double *RHS) {
// Problem size arguments and mesh data
//INTEGER, INTENT(IN) :: NE            // Number of elements
//INTEGER, INTENT(IN) :: NA            // Number of angles
//INTEGER, INTENT(IN) :: NV            // Number of vertices in the mesh
//INTEGER, INTENT(IN) :: C(4*NE)       // The connectivity matrix
// Thread size arguments and work separation
//INTEGER, INTENT(IN) :: AS_NumColors,AS_NumThreads,TasksPerThread,MyThreadID
//INTEGER, INTENT(IN) :: ThreadWiseWork(2,AS_NumColors,AS_NumThreads)
//                                     // The element start/stop for each thread in each color
// Cross section vectors
//REAL*8, INTENT(IN) :: CF(NE)
//REAL*8, INTENT(IN) :: CU(NE)
//REAL*8, INTENT(IN) :: CP(NE)
// FE shape functions
//REAL*8, INTENT(IN) :: FES(4*4)       // Vertices,GP     // Shape
//REAL*8, INTENT(IN) :: FED(4*3*4*NE)  // Vertices,Dim,GP // Derivatives in real space
//REAL*8, INTENT(IN) :: FEW(4*NE)      // GP              // Weight * |Jac|
// Angular cubature
//REAL*8, INTENT(IN) :: OM(NA*3)       // Angular cubature evaluation
//REAL*8, INTENT(IN) :: OO(NA*6)       // Basically it is OM(A,*) * OM(A,*)
// Vectors
//REAL*8, INTENT(INOUT) :: LHS(NA*NV)  // LHS (to be modified)
//REAL*8, INTENT(IN)    :: RHS(NA*NV)  // RHS (to be multiplied)

// Local
int E,A,V,VV,GP,iNA;
int iCV_E,iCVV_E,iGP,iEGP,iEGP1,iEGP2,iEGP3;
int iColor,iStart,iEnd;
int AS_Thread,iTask,ifun;

iNA = *NA;
//printf("NA=%d NE=%d NV=%d \n",*NA,*NE,*NV);

for (iColor = 1; iColor < *AS_NumColors+1; iColor++) {
#ifdef WITHOMP
  /* ... per-task loop "for (iTask ...) {" and iStart/iEnd setup from
     ThreadWiseWork elided; see the AVE2 listing for the same pattern ... */
#else
  iStart = 1;
  iEnd   = *NE;
#endif
  for (E = iStart; E < iEnd+1; E++) {
    for (V = 1; V < 5; V++) {
      //printf("E=%d V=%d ii=%d \n",E,V,(E-1)*4+V-1);
      iCV_E = (C[(E-1)*4+V-1]-1)*iNA-1;
      for (GP = 1; GP < 5; GP++) {
        iGP   = (GP-1)*4-1;
        iEGP  = (E-1)*4+GP-1;
        iEGP1 = (E-1)*4*3*4 + (GP-1)*3*4 + (1-1)*4-1;
        iEGP2 = (E-1)*4*3*4 + (GP-1)*3*4 + (2-1)*4-1;
        iEGP3 = (E-1)*4*3*4 + (GP-1)*3*4 + (3-1)*4-1;
        for (VV = 1; VV < 5; VV++) {
          iCVV_E = (C[(E-1)*4+VV-1]-1)*iNA-1;
          for (A = 1; A < iNA+1; A++) {
            /* ... commented-out code elided ... */
            // There are (4+15+45) = 64 mults per (GP,V,VV) iteration, i.e.
            // 64*4*4*4 = 4096 mults per element-angle, and 13*4*4*4 = 832
            // adds, so 4928 flops
            LHS[iCV_E+A] = LHS[iCV_E+A]
              + FEW[iEGP]*CF[E-1]              *FES[iGP+V]  *FES[iGP+VV]  *RHS[iCVV_E+A]  // F
              + FEW[iEGP]*CU[E-1]*OM[A-1]      *FED[iEGP1+V]*FES[iGP+VV]  *RHS[iCVV_E+A]  // U^T[1]
              + FEW[iEGP]*CU[E-1]*OM[iNA+A-1]  *FED[iEGP2+V]*FES[iGP+VV]  *RHS[iCVV_E+A]  // U^T[2]
              + FEW[iEGP]*CU[E-1]*OM[2*iNA+A-1]*FED[iEGP3+V]*FES[iGP+VV]  *RHS[iCVV_E+A]  // U^T[3]
              + FEW[iEGP]*CP[E-1]*OO[A-1]      *FED[iEGP1+V]*FED[iEGP1+VV]*RHS[iCVV_E+A]  // P[1,1]
              + FEW[iEGP]*CP[E-1]*OO[iNA+A-1]  *FED[iEGP2+V]*FED[iEGP2+VV]*RHS[iCVV_E+A]  // P[2,2]
              + FEW[iEGP]*CP[E-1]*OO[2*iNA+A-1]*FED[iEGP3+V]*FED[iEGP3+VV]*RHS[iCVV_E+A]  // P[3,3]
              + FEW[iEGP]*CP[E-1]*OO[3*iNA+A-1]*FED[iEGP1+V]*FED[iEGP2+VV]*RHS[iCVV_E+A]  // P[1,2]
              + FEW[iEGP]*CP[E-1]*OO[4*iNA+A-1]*FED[iEGP1+V]*FED[iEGP3+VV]*RHS[iCVV_E+A]  // P[1,3]
              + FEW[iEGP]*CP[E-1]*OO[5*iNA+A-1]*FED[iEGP2+V]*FED[iEGP3+VV]*RHS[iCVV_E+A]  // P[2,3]
              + FEW[iEGP]*CP[E-1]*OO[3*iNA+A-1]*FED[iEGP2+V]*FED[iEGP1+VV]*RHS[iCVV_E+A]  // P[2,1]
              + FEW[iEGP]*CP[E-1]*OO[4*iNA+A-1]*FED[iEGP3+V]*FED[iEGP1+VV]*RHS[iCVV_E+A]  // P[3,1]
              + FEW[iEGP]*CP[E-1]*OO[5*iNA+A-1]*FED[iEGP3+V]*FED[iEGP2+VV]*RHS[iCVV_E+A]; // P[3,2]
            /* ... commented-out code elided ... */
          } // A
        } // VV
      } // GP
    } // V
  } // E
#ifdef WITHOMP
  } // iTask
#pragma omp barrier
#else
  break;
#endif
} // iColor

} // END SUBROUTINE ApplyA_AVE1_Tet_SUPG
```
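A note on indexing: the C port keeps the Fortran 1-based loop bounds and hand-flattens the column-major arrays, folding a trailing -1 into each precomputed offset so that adding a 1-based V lands on the right 0-based element. A small self-check of that convention for FED(4,3,4,NE) (the fed_index helper is hypothetical, for illustration only):

```c
#include <assert.h>

// 0-based offset of the 1-based Fortran element FED(V,D,GP,E),
// with FED declared as FED(4,3,4,NE) in column-major order.
static int fed_index(int V, int D, int GP, int E) {
  return (V-1) + (D-1)*4 + (GP-1)*4*3 + (E-1)*4*3*4;
}

int main(void) {
  // The kernel sets iEGP1 = (E-1)*4*3*4 + (GP-1)*3*4 + (1-1)*4 - 1 and then
  // indexes FED[iEGP1+V]; that must agree with FED(V,1,GP,E).
  for (int E = 1; E <= 3; E++)
    for (int GP = 1; GP <= 4; GP++)
      for (int V = 1; V <= 4; V++) {
        int iEGP1 = (E-1)*4*3*4 + (GP-1)*3*4 + (1-1)*4 - 1;
        assert(iEGP1 + V == fed_index(V, 1, GP, E));
      }
  return 0;
}
```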
ApplyA_AVE2_Tet_SUPG
Metrics attributed to ApplyA_AVE2_Tet_SUPG:

| Time | IPC per Core | Loads per Cycle | L1 Hits per Cycle | L1 Miss Ratio | L2 Miss Ratio | L3 Miss Ratio | L2 B/W Utilized | L3 B/W Utilized | DRAM B/W Utilized |
|---|---|---|---|---|---|---|---|---|---|
| 22.1% | 1.95 | 1.09 | 1.21 | 0.07% | 23.10% | 31.02% | 0.30% | 0.29% | 0.01% |

```c
// Main subroutine header
void ApplyA_AVE2_Tet_SUPG(int *NE, int *NA, int *NV, int *C,
                          int *AS_NumColors, int *AS_NumThreads, int *TasksPerThread,
                          int *MyThreadID, int *ThreadWiseWork,
                          double *CF, double *CU, double *CP, double *FES,
                          double *FED, double *FEW, double *OM, double *OO,
                          double *LHS, double *RHS, double *SRHS, double *SLHS) {
//IMPLICIT NONE
//#include "PROTEUS_Preprocess.h"
//#define Local_TryLHStoo
// Problem size arguments and mesh data
//PROTEUS_Int, INTENT(IN) :: NE           // Number of elements
//PROTEUS_Int, INTENT(IN) :: NA           // Number of angles
//PROTEUS_Int, INTENT(IN) :: NV           // Number of vertices in the mesh
//PROTEUS_Int, INTENT(IN) :: C(4,NE)      // The connectivity matrix
// Thread size arguments and work separation
//PROTEUS_Int, INTENT(IN) :: AS_NumColors,AS_NumThreads,TasksPerThread,MyThreadID
//PROTEUS_Int, INTENT(IN) :: ThreadWiseWork(2,AS_NumColors,AS_NumThreads)
//                                        // The element start/stop for each thread in each color
// Cross section vectors
//PROTEUS_Real, INTENT(IN) :: CF(NE)
//PROTEUS_Real, INTENT(IN) :: CU(NE)
//PROTEUS_Real, INTENT(IN) :: CP(NE)
// FE shape functions
//PROTEUS_FE_Real, INTENT(IN) :: FES(4,4)      // Vertices,GP     // Shape
//PROTEUS_FE_Real, INTENT(IN) :: FED(4,3,4,NE) // Vertices,Dim,GP // Derivatives in real space
//PROTEUS_FE_Real, INTENT(IN) :: FEW(4,NE)     // GP              // Weight * |Jac|
// Angular cubature
//PROTEUS_FE_Real, INTENT(IN) :: OM(NA,3)      // Angular cubature evaluation
//PROTEUS_FE_Real, INTENT(IN) :: OO(NA,6)      // Basically it is OM(A,*) * OM(A,*)
// Vectors
//PROTEUS_Real, INTENT(INOUT) :: LHS(NA,NV)    // LHS (to be modified)
//PROTEUS_Real, INTENT(IN)    :: RHS(NA,NV)    // RHS (to be multiplied)
// Scratch vectors
//PROTEUS_Real, INTENT(INOUT) :: SRHS(NA,4),SLHS(NA,4)

// Local
int E,A,V,VV,GP,iNA;
int iCV_E,iCVV_E,iGP,iEGP,iEGP1,iEGP2,iEGP3;
int iColor,iStart,iEnd;
int AS_Thread,iTask,ifun;
double LocalConst1,LocalConst2,LocalConst3,Products[11];  // Products[0] unused (1-based style)

iNA = *NA;

//printf("NA=%d NE=%d NV=%d \n",*NA,*NE,*NV);

for (iColor = 1; iColor < *AS_NumColors+1; iColor++) {
#ifdef WITHOMP
  for (iTask = 1; iTask < *TasksPerThread+1; iTask++) {
    AS_Thread = *TasksPerThread * (*MyThreadID-1) + iTask;
    ifun   = *AS_NumColors * (AS_Thread-1)*2 + 2*(iColor-1);
    iStart = ThreadWiseWork[ifun+1-1];
    iEnd   = ThreadWiseWork[ifun+2-1];
    // iStart = ThreadWiseWork[(iColor-1)*2+1-1];
    // iEnd   = ThreadWiseWork[(iColor-1)*2+2-1];
#else
  iStart = 1;
  iEnd   = *NE;
#endif
  for (E = iStart; E < iEnd+1; E++) {
    // Pull a copy of the vectors
    for (V = 1; V < 5; V++) {
      iCV_E = (C[(E-1)*4+V-1]-1)*iNA-1;
      for (A = 1; A < iNA+1; A++) {
        SRHS[(V-1)*iNA+A-1] = RHS[iCV_E + A];
#ifdef Local_TryLHStoo
        SLHS[(V-1)*iNA+A-1] = LHS[iCV_E + A];
#endif
      }
    }
    for (V = 1; V < 5; V++) {
#ifndef Local_TryLHStoo
      iCV_E = (C[(E-1)*4+V-1]-1)*iNA-1;  // II = C(V,E)
#endif
      for (GP = 1; GP < 5; GP++) {
        iGP   = (GP-1)*4-1;
        iEGP  = (E-1)*4+GP-1;
        iEGP1 = (E-1)*4*3*4 + (GP-1)*3*4 + (1-1)*4-1;
        iEGP2 = (E-1)*4*3*4 + (GP-1)*3*4 + (2-1)*4-1;
        iEGP3 = (E-1)*4*3*4 + (GP-1)*3*4 + (3-1)*4-1;

        LocalConst1 = FEW[iEGP]*CF[E-1];
        LocalConst2 = FEW[iEGP]*CU[E-1];
        LocalConst3 = FEW[iEGP]*CP[E-1];
        for (VV = 1; VV < 5; VV++) {
          iCVV_E = (VV-1)*iNA-1;  //(C[(E-1)*4+VV-1]-1)*iNA-1;
          Products[ 1] = LocalConst1*FES[iGP+V]  *FES[iGP+VV];
          Products[ 2] = LocalConst2*FED[iEGP1+V]*FES[iGP+VV];
          Products[ 3] = LocalConst2*FED[iEGP2+V]*FES[iGP+VV];
          Products[ 4] = LocalConst2*FED[iEGP3+V]*FES[iGP+VV];
          Products[ 5] = LocalConst3*FED[iEGP1+V]*FED[iEGP1+VV];
          Products[ 6] = LocalConst3*FED[iEGP2+V]*FED[iEGP2+VV];
          Products[ 7] = LocalConst3*FED[iEGP3+V]*FED[iEGP3+VV];
          Products[ 8] = LocalConst3*(FED[iEGP1+V]*FED[iEGP2+VV] + FED[iEGP2+V]*FED[iEGP1+VV]);
          Products[ 9] = LocalConst3*(FED[iEGP1+V]*FED[iEGP3+VV] + FED[iEGP3+V]*FED[iEGP1+VV]);
          Products[10] = LocalConst3*(FED[iEGP2+V]*FED[iEGP3+VV] + FED[iEGP3+V]*FED[iEGP2+VV]);
          for (A = 1; A < iNA+1; A++) {
#ifdef Local_TryLHStoo
            SLHS[iCVV_E+A] = SLHS[iCVV_E+A]
              + Products[ 1]              *SRHS[iCVV_E+A]  // F
              + Products[ 2]*OM[A-1]      *SRHS[iCVV_E+A]  // U^T(1)
              + Products[ 3]*OM[iNA+A-1]  *SRHS[iCVV_E+A]  // U^T(2)
              + Products[ 4]*OM[2*iNA+A-1]*SRHS[iCVV_E+A]  // U^T(3)
              + Products[ 5]*OO[A-1]      *SRHS[iCVV_E+A]  // P(1,1)
              + Products[ 6]*OO[iNA+A-1]  *SRHS[iCVV_E+A]  // P(2,2)
              + Products[ 7]*OO[2*iNA+A-1]*SRHS[iCVV_E+A]  // P(3,3)
              + Products[ 8]*OO[3*iNA+A-1]*SRHS[iCVV_E+A]  // P(1,2)
              + Products[ 9]*OO[4*iNA+A-1]*SRHS[iCVV_E+A]  // P(1,3)
              + Products[10]*OO[5*iNA+A-1]*SRHS[iCVV_E+A]; // P(2,3)
#else
            LHS[iCV_E+A] = LHS[iCV_E+A]
              + Products[ 1]              *SRHS[iCVV_E+A]  // F
              + Products[ 2]*OM[A-1]      *SRHS[iCVV_E+A]  // U^T(1)
              + Products[ 3]*OM[iNA+A-1]  *SRHS[iCVV_E+A]  // U^T(2)
              + Products[ 4]*OM[2*iNA+A-1]*SRHS[iCVV_E+A]  // U^T(3)
              + Products[ 5]*OO[A-1]      *SRHS[iCVV_E+A]  // P(1,1)
              + Products[ 6]*OO[iNA+A-1]  *SRHS[iCVV_E+A]  // P(2,2)
              + Products[ 7]*OO[2*iNA+A-1]*SRHS[iCVV_E+A]  // P(3,3)
              + Products[ 8]*OO[3*iNA+A-1]*SRHS[iCVV_E+A]  // P(1,2)
              + Products[ 9]*OO[4*iNA+A-1]*SRHS[iCVV_E+A]  // P(1,3)
              + Products[10]*OO[5*iNA+A-1]*SRHS[iCVV_E+A]; // P(2,3)
#endif
            /* ... commented-out code elided ... */
          } // A
        } // VV
      } // GP
    } // V
    // Store the LHS result
#ifdef Local_TryLHStoo
    for (V = 1; V < 5; V++) {
      iCV_E = C[(E-1)*4+V-1]-1;  // VV = C(V,E)
      for (A = 1; A < iNA+1; A++) {
        LHS[iCV_E*iNA + A-1] = SLHS[(V-1)*iNA+A-1];
      }
    }
#endif
  } // E
#ifdef WITHOMP
  } // iTask
#pragma omp barrier
#else
  break;
#endif
} // iColor

} // END SUBROUTINE ApplyA_AVE2_Tet_SUPG
```
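Counting operations in the two listings gives a rough account of AVE-2's improvement. In AVE-1, each innermost (GP,V,VV,A) update costs $4+15+45 = 64$ multiplies. In AVE-2, the ten angle-independent Products values (about 23 multiplies, plus the three LocalConst products per Gauss point) are hoisted out of the angle loop and reused for all $N_A$ angles, leaving roughly 19 multiplies per angle:

$$\frac{64}{19 + 23/N_A} \approx 3.2 \quad \text{for } N_A = 32.$$

The measured times (51.4% versus 22.1% of the run) correspond to about a 2.3x speedup, broadly consistent with this rough count once loads and stores are taken into account.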
SolveWGS_PassThrough_AVE3
Metrics attributed to SolveWGS_PassThrough_AVE3:

| Time | IPC per Core | Loads per Cycle | L1 Hits per Cycle | L1 Miss Ratio | L2 Miss Ratio | L3 Miss Ratio | L2 B/W Utilized | L3 B/W Utilized | DRAM B/W Utilized |
|---|---|---|---|---|---|---|---|---|---|
| 0.02% | 2.56 | 0.82 | 0.95 | 8.89% | 34.93% | 23.70% | 29.50% | 39.51% | 3.39% |

```c
// Main subroutine header
void SolveWGS_PassThrough_AVE3(double *RHS_C, double *LHS_C) {
// double LHS_C(NumAngles*NumVertices),RHS_C(NumAngles*NumVertices)
// NumAngles, NumVertices, NumThreads and the assembled-matrix arrays
// NZS_RowLoc, NZS_ColNum, NZS_Data are file-scope variables.
int MyThreadID;
int I,J,K,iRowStart,iRowEnd,iV_A,iVV_A,iOff;

#ifdef WITHOMP
MyThreadID = omp_get_thread_num() + 1;
I = NumVertices/NumThreads;
iRowStart = (MyThreadID-1)*I + 1;
iRowEnd   = MyThreadID*I;
if (MyThreadID == NumThreads) {iRowEnd = NumVertices;}
#else
MyThreadID = 1;
iRowStart = 1;
iRowEnd   = NumVertices;
#endif

#ifdef WITHBGQHPM
// Starts the hardware counters (moved below the thread-ID setup so that
// MyThreadID is defined before it is tested)
if (MyThreadID == 1) {hpm_start("AVE3_ApplyA");}
#endif

// This barrier ensures that the incoming vector is fully defined by all threads
#ifdef WITHOMP
#pragma omp barrier
#endif

for (I = iRowStart; I < iRowEnd+1; I++) {
  iV_A = (I-1)*NumAngles-1;  // invariant in J; hoisted out of the inner loop
  for (J = NZS_RowLoc[I-1]; J < NZS_RowLoc[I]; J++) {
    iVV_A = (NZS_ColNum[J-1]-1)*NumAngles-1;
    iOff  = (J-1)*NumAngles-1;
    for (K = 1; K < NumAngles+1; K++) {
      LHS_C[iV_A+K] = LHS_C[iV_A+K] + NZS_Data[iOff+K]*RHS_C[iVV_A+K];
    }
  }
}
// This second barrier is needed because the FGMRES threadwise splitting is
// currently different from the above. That could likely be changed.
#ifdef WITHOMP
#pragma omp barrier
#endif
#ifdef WITHBGQHPM
if (MyThreadID == 1) {hpm_stop("AVE3_ApplyA");}  // Stops the hardware counters
#endif
}
```
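The loop above is effectively a block-CSR matrix-vector product: NZS_RowLoc and NZS_ColNum form an ordinary 1-based CSR pattern over vertices, and each stored nonzero carries NumAngles coefficients in NZS_Data, one per angle, applied to the matching angle of the source vertex. A tiny self-contained layout with hypothetical values (2 vertices, 2 angles, nonzeros at (1,1), (1,2), (2,2)):

```c
#include <stdio.h>

enum { NumVertices = 2, NumAngles = 2 };

// 1-based CSR over vertices: row I spans J = RowLoc[I-1] .. RowLoc[I]-1.
static int    NZS_RowLoc[] = {1, 3, 4};
static int    NZS_ColNum[] = {1, 2, 2};   // column (vertex) of each nonzero
static double NZS_Data[]   = {            // NumAngles values per nonzero
  2.0, 2.5,    // (1,1) for angles 1,2
  1.0, 1.5,    // (1,2)
  3.0, 3.5};   // (2,2)

int main(void) {
  double RHS_C[NumAngles*NumVertices] = {1, 1, 1, 1};
  double LHS_C[NumAngles*NumVertices] = {0};
  // Same loop structure as SolveWGS_PassThrough_AVE3, serial, all rows
  for (int I = 1; I <= NumVertices; I++)
    for (int J = NZS_RowLoc[I-1]; J < NZS_RowLoc[I]; J++) {
      int iV_A  = (I-1)*NumAngles - 1;
      int iVV_A = (NZS_ColNum[J-1]-1)*NumAngles - 1;
      int iOff  = (J-1)*NumAngles - 1;
      for (int K = 1; K <= NumAngles; K++)
        LHS_C[iV_A+K] += NZS_Data[iOff+K]*RHS_C[iVV_A+K];
    }
  for (int i = 0; i < NumAngles*NumVertices; i++)
    printf("LHS_C[%d] = %g\n", i, LHS_C[i]);  // prints 3, 4, 3, 3.5
  return 0;
}
```

Grouping the NumAngles values of each nonzero contiguously is what makes the K loop a unit-stride stream, which is why this kernel sustains far higher bandwidth utilization than the matrix-free variants.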