SNBone

From README.txt:

This program emulates the inversion of A in A*x=S on a single-node architecture. This operation was identified as the bulk of the floating-point work and the largest memory bottleneck for deterministic neutronics.

This mini-app has three solution modes:

  • The AVE-1 scheme refers to the standard matrix-free matrix-vector application, in which Gaussian integration is used.
  • The AVE-2 scheme refers to a slightly different approach from AVE-1, in which intermediate vectors are used.
  • The AVE-3 scheme refers to a conventional assembled matrix in compressed sparse row (CSR) storage.

To date, AVE-3 consistently outperforms AVE-1 and AVE-2: fully assembling the matrix and performing the solve is more efficient than performing the Gaussian-integration (matrix-free) solve.
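
To illustrate the distinction, the sketch below contrasts a matrix-free apply (entries recomputed on the fly, as in AVE-1/AVE-2) with an assembled CSR apply (entries precomputed and streamed, as in AVE-3). It uses a made-up 1D operator, not the mini-app's SUPG operator:

   // Matrix-free apply (AVE-1/AVE-2 flavor): local contributions are
   // recomputed from element data at every y = A*x application.
   void apply_matrix_free(int n, const double *x, double *y) {
      for (int e = 0; e < n-1; e++) {      // loop over 1D "elements"
         double k = 1.0;                   // local integration would go here
         y[e]   += k*(x[e]   - x[e+1]);
         y[e+1] += k*(x[e+1] - x[e]);
      }
   }

   // Assembled apply (AVE-3 flavor): the CSR arrays rp/ci/v are built once,
   // after which each apply is a plain sparse matrix-vector product.
   void apply_csr(int n, const int *rp, const int *ci, const double *v,
                  const double *x, double *y) {
      for (int i = 0; i < n; i++)
         for (int j = rp[i]; j < rp[i+1]; j++)
            y[i] += v[j]*x[ci[j]];
   }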


Build and Problem Configuration

From README.txt:

The code is broken into three steps.

  1. You create an unstructured mesh that nonetheless has a regular stencil, so the bandwidth is not as severe as in a conventional FE mesh. Alternatively, you can provide a linear tetrahedral finite element mesh from a conventional mesh generator. Ask for details.
  2. You process this mesh with respect to the maximum number of threads you want to investigate. The mesh is re-ordered by element and vertex so that it can be applied in n*thread independent steps.
  3. You run the mini-app (Fortran or C version) to measure the overall performance on a particular architecture; a command sketch follows this list.
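
Using the setup commands and run parameters reported below, the three steps look roughly as follows. The solver binary name is a placeholder here, since only its arguments appear in this report:

   ./makemesh.x 10 10 10 0           # Step 1: generate the regular-stencil tet mesh
   ./processmesh.x 1 1               # Step 2: reorder/color the mesh for 1 thread
   ./<snbone-binary> 0 100 30 32 1   # Step 3: run the mini-app (Run_Parameters below)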

Analysis


Build and Run Information

Compiler = icpc (ICC) 18.0.1 20171018
Build_Flags = -g -O3 -march=native -DUSEMETIS
Run_Parameters = 0 100 30 32 1

Setup Parameters

makemesh.x 10 10 10 0
processmesh.x 1 1

Run with 1 Thread on 1 Node


Intel Software Development Emulator

SDE Metrics              SNbone
Arithmetic Intensity     0.13
Bytes per Load Inst      7.98
Bytes per Store Inst     6.95

Roofline – Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
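
The roofline model caps attainable performance at the smaller of the compute peak and the product of arithmetic intensity and memory bandwidth:

   P_attainable = min(P_peak, I x B_mem),  with I = 0.13 flops/byte (measured above).

For example, under an assumed sustained DRAM bandwidth of 50 GB/s (a round illustrative figure, not a measurement on this machine), the memory roof would sit near 0.13 x 50 = 6.5 GFLOP/s, far below compute peak, so the kernel lies in the bandwidth-bound region of the roofline.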


Experiment Aggregate Metrics

IPC per Core         2.10
Loads per Cycle      1.13
L1 Hits per Cycle    1.20
L1 Miss Ratio        0.27%
L2 Miss Ratio        30.60%
L3 Miss Ratio        24.30%
L2 B/W Utilized      1.26%
L3 B/W Utilized      1.33%
DRAM B/W Utilized    0.30%

ApplyA_AVE1_Tet_SUPG

// Main subroutine header
void ApplyA_AVE1_Tet_SUPG(int *NE, int *NA, int *NV, int *C,
                          int *AS_NumColors, int *AS_NumThreads,
                          int *TasksPerThread, int *MyThreadID,
                          int *ThreadWiseWork,
                          double *CF, double *CU, double *CP,
                          double *FES, double *FED, double *FEW,
                          double *OM, double *OO,
                          double *LHS, double *RHS) {
// Problem size arguments and mesh data
//INTEGER, INTENT(IN) :: NE           // Number of elements
//INTEGER, INTENT(IN) :: NA           // Number of angles
//INTEGER, INTENT(IN) :: NV           // Number of vertices in the mesh
//INTEGER, INTENT(IN) :: C(4*NE)      // The connectivity matrix
// Thread size arguments and work separation
//INTEGER, INTENT(IN) :: AS_NumColors,AS_NumThreads,TasksPerThread,MyThreadID
//INTEGER, INTENT(IN) :: ThreadWiseWork(2,AS_NumColors,AS_NumThreads)
//                       // The element start/stop for each thread in each color
// Cross section vectors
//REAL*8, INTENT(IN) :: CF(NE)
//REAL*8, INTENT(IN) :: CU(NE)
//REAL*8, INTENT(IN) :: CP(NE)
// FE shape functions
//REAL*8, INTENT(IN) :: FES(4*4)      // (Vertex,GP)        // Shape
//REAL*8, INTENT(IN) :: FED(4*3*4*NE) // (Vertex,Dim,GP,E)  // Derivatives in real space
//REAL*8, INTENT(IN) :: FEW(4*NE)     // (GP,E)             // Weight * |Jac|
// Angular cubature
//REAL*8, INTENT(IN) :: OM(NA*3)      // Angular cubature evaluation
//REAL*8, INTENT(IN) :: OO(NA*6)      // Essentially OM(A,*) * OM(A,*)
// Vectors
//REAL*8, INTENT(INOUT) :: LHS(NA*NV) // LHS (to be modified)
//REAL*8, INTENT(IN)    :: RHS(NA*NV) // RHS (to be multiplied)

// Local
int E,A,V,VV,GP,iNA;
int iCV_E,iCVV_E,iGP,iEGP,iEGP1,iEGP2,iEGP3;
int iColor,iStart,iEnd;
int AS_Thread,iTask,ifun;

iNA = *NA;
//printf("NA=%d NE=%d NV=%d \n",*NA,*NE,*NV);

for (iColor = 1; iColor < *AS_NumColors+1; iColor++) {
#ifdef WITHOMP
   {...} // threaded start/stop lookup; identical to the AVE-2 listing below
#else
   iStart = 1;
   iEnd   = *NE;
#endif
   for (E = iStart; E < iEnd+1; E++) {
      for (V = 1; V < 5; V++) {
         //printf("E=%d V=%d ii=%d \n",E,V,(E-1)*4+V-1);
         iCV_E = (C[(E-1)*4+V-1]-1)*iNA-1;
         for (GP = 1; GP < 5; GP++) {
            iGP   = (GP-1)*4-1;
            iEGP  = (E-1)*4+GP-1;
            iEGP1 = (E-1)*4*3*4 + (GP-1)*3*4 + (1-1)*4-1;
            iEGP2 = (E-1)*4*3*4 + (GP-1)*3*4 + (2-1)*4-1;
            iEGP3 = (E-1)*4*3*4 + (GP-1)*3*4 + (3-1)*4-1;
            for (VV = 1; VV < 5; VV++) {
               iCVV_E = (C[(E-1)*4+VV-1]-1)*iNA-1;
Metrics for the angle (A) loop below:

Time                 51.4%
IPC per Core         2.09
Loads per Cycle      1.48
L1 Hits per Cycle    1.52
L1 Miss Ratio        0.05%
L2 Miss Ratio        22.54%
L3 Miss Ratio        38.01%
L2 B/W Utilized      0.27%
L3 B/W Utilized      0.18%
DRAM B/W Utilized    0.01%
               for (A = 1; A < iNA+1; A++) {
                  /*...Commented Out Code...*/
                  // There are (4+15+45)*GP*V*VV = 64*4*4*4 = 4096 mults per
                  // element-angle plus 13*GP*V*VV = 832 adds, so 4928 flops
                  LHS[iCV_E+A] = LHS[iCV_E+A]
                     + FEW[iEGP]*CF[E-1]              *FES[iGP+V]  *FES[iGP+VV]  *RHS[iCVV_E+A]   // F
                     + FEW[iEGP]*CU[E-1]*OM[A-1]      *FED[iEGP1+V]*FES[iGP+VV]  *RHS[iCVV_E+A]   // U^T[1]
                     + FEW[iEGP]*CU[E-1]*OM[iNA+A-1]  *FED[iEGP2+V]*FES[iGP+VV]  *RHS[iCVV_E+A]   // U^T[2]
                     + FEW[iEGP]*CU[E-1]*OM[2*iNA+A-1]*FED[iEGP3+V]*FES[iGP+VV]  *RHS[iCVV_E+A]   // U^T[3]
                     + FEW[iEGP]*CP[E-1]*OO[A-1]      *FED[iEGP1+V]*FED[iEGP1+VV]*RHS[iCVV_E+A]   // P[1,1]
                     + FEW[iEGP]*CP[E-1]*OO[iNA+A-1]  *FED[iEGP2+V]*FED[iEGP2+VV]*RHS[iCVV_E+A]   // P[2,2]
                     + FEW[iEGP]*CP[E-1]*OO[2*iNA+A-1]*FED[iEGP3+V]*FED[iEGP3+VV]*RHS[iCVV_E+A]   // P[3,3]
                     + FEW[iEGP]*CP[E-1]*OO[3*iNA+A-1]*FED[iEGP1+V]*FED[iEGP2+VV]*RHS[iCVV_E+A]   // P[1,2]
                     + FEW[iEGP]*CP[E-1]*OO[4*iNA+A-1]*FED[iEGP1+V]*FED[iEGP3+VV]*RHS[iCVV_E+A]   // P[1,3]
                     + FEW[iEGP]*CP[E-1]*OO[5*iNA+A-1]*FED[iEGP2+V]*FED[iEGP3+VV]*RHS[iCVV_E+A]   // P[2,3]
                     + FEW[iEGP]*CP[E-1]*OO[3*iNA+A-1]*FED[iEGP2+V]*FED[iEGP1+VV]*RHS[iCVV_E+A]   // P[2,1]
                     + FEW[iEGP]*CP[E-1]*OO[4*iNA+A-1]*FED[iEGP3+V]*FED[iEGP1+VV]*RHS[iCVV_E+A]   // P[3,1]
                     + FEW[iEGP]*CP[E-1]*OO[5*iNA+A-1]*FED[iEGP3+V]*FED[iEGP2+VV]*RHS[iCVV_E+A];  // P[3,2]
                  /*...Commented Out Code...*/
               } // A
            } // VV
         } // GP
      } // V
   } // E
#ifdef WITHOMP
 } // iTask
#pragma omp barrier
#else
   break;
#endif
} // iColor

} // END SUBROUTINE ApplyA_AVE1_Tet_SUPG
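
The call interface is pointer-heavy Fortran style. A minimal hypothetical driver for the serial (#else) path, with one tetrahedron, two angles, and made-up array values, shows the expected array extents:

   #include <stdio.h>

   // Hypothetical driver for ApplyA_AVE1_Tet_SUPG (serial build, WITHOMP unset).
   // Array extents follow the declarations in the listing above; values are fake.
   void ApplyA_AVE1_Tet_SUPG(int*,int*,int*,int*,int*,int*,int*,int*,int*,
                             double*,double*,double*,double*,double*,double*,
                             double*,double*,double*,double*);

   int main(void) {
      int NE = 1, NA = 2, NV = 4;
      int C[4] = {1, 2, 3, 4};                // 1-based connectivity of the tet
      int one = 1, dummy[2] = {1, 1};         // thread arguments unused serially
      double CF[1] = {1.0}, CU[1] = {1.0}, CP[1] = {1.0};
      double FES[16], FED[48], FEW[4];        // FED: 4 verts x 3 dims x 4 GPs x NE
      double OM[6], OO[12];                   // NA*3 and NA*6
      double LHS[8] = {0}, RHS[8];            // NA*NV each
      for (int i = 0; i < 16; i++) FES[i] = 0.25;   // placeholder values
      for (int i = 0; i < 48; i++) FED[i] = 0.1;
      for (int i = 0; i <  4; i++) FEW[i] = 0.01;
      for (int i = 0; i <  6; i++) OM[i]  = 0.5;
      for (int i = 0; i < 12; i++) OO[i]  = 0.25;
      for (int i = 0; i <  8; i++) RHS[i] = 1.0;
      ApplyA_AVE1_Tet_SUPG(&NE, &NA, &NV, C, &one, &one, &one, &one, dummy,
                           CF, CU, CP, FES, FED, FEW, OM, OO, LHS, RHS);
      for (int i = 0; i < 8; i++) printf("LHS[%d] = %g\n", i, LHS[i]);
      return 0;
   }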

ApplyA_AVE2_Tet_SUPG

// Main subroutine header
void ApplyA_AVE2_Tet_SUPG(int *NE, int *NA, int *NV, int *C,
                          int *AS_NumColors, int *AS_NumThreads,
                          int *TasksPerThread, int *MyThreadID,
                          int *ThreadWiseWork,
                          double *CF, double *CU, double *CP,
                          double *FES, double *FED, double *FEW,
                          double *OM, double *OO,
                          double *LHS, double *RHS, double *SRHS, double *SLHS) {
//IMPLICIT NONE
//#include "PROTEUS_Preprocess.h"
//#define Local_TryLHStoo
// Problem size arguments and mesh data
//PROTEUS_Int,     INTENT(IN) :: NE            // Number of elements
//PROTEUS_Int,     INTENT(IN) :: NA            // Number of angles
//PROTEUS_Int,     INTENT(IN) :: NV            // Number of vertices in the mesh
//PROTEUS_Int,     INTENT(IN) :: C(4,NE)       // The connectivity matrix
//// Thread size arguments and work separation
//PROTEUS_Int,     INTENT(IN) :: AS_NumColors,AS_NumThreads,TasksPerThread,MyThreadID
//PROTEUS_Int,     INTENT(IN) :: ThreadWiseWork(2,AS_NumColors,AS_NumThreads)
//                               // The element start/stop for each thread in each color
//// Cross section vectors
//PROTEUS_Real,    INTENT(IN) :: CF(NE)
//PROTEUS_Real,    INTENT(IN) :: CU(NE)
//PROTEUS_Real,    INTENT(IN) :: CP(NE)
//// FE shape functions
//PROTEUS_FE_Real, INTENT(IN) :: FES(4,4)      // (Vertex,GP)        // Shape
//PROTEUS_FE_Real, INTENT(IN) :: FED(4,3,4,NE) // (Vertex,Dim,GP,E)  // Derivatives in real space
//PROTEUS_FE_Real, INTENT(IN) :: FEW(4,NE)     // (GP,E)             // Weight * |Jac|
//// Angular cubature
//PROTEUS_FE_Real, INTENT(IN) :: OM(NA,3)      // Angular cubature evaluation
//PROTEUS_FE_Real, INTENT(IN) :: OO(NA,6)      // Essentially OM(A,*) * OM(A,*)
//// Vectors
//PROTEUS_Real, INTENT(INOUT) :: LHS(NA,NV)    // LHS (to be modified)
//PROTEUS_Real,    INTENT(IN) :: RHS(NA,NV)    // RHS (to be multiplied)
//// Scratch vectors
//PROTEUS_Real, INTENT(INOUT) :: SRHS(NA,4),SLHS(NA,4)

// Local
int E,A,V,VV,GP,iNA;
int iCV_E,iCVV_E,iGP,iEGP,iEGP1,iEGP2,iEGP3;
int iColor,iStart,iEnd;
int AS_Thread,iTask,ifun;
double LocalConst1,LocalConst2,LocalConst3,Products[11];

iNA = *NA;

//printf("NA=%d NE=%d NV=%d \n",*NA,*NE,*NV);

for (iColor = 1; iColor < *AS_NumColors+1; iColor++) {
#ifdef WITHOMP
 for (iTask = 1; iTask < *TasksPerThread+1; iTask++) {
   AS_Thread = *TasksPerThread * (*MyThreadID-1) + iTask;
   ifun = *AS_NumColors * (AS_Thread-1)*2 + 2*(iColor-1);
   iStart   = ThreadWiseWork[ifun+1-1];
   iEnd     = ThreadWiseWork[ifun+2-1];
//   iStart   = ThreadWiseWork[(iColor-1)*2+1-1];
//   iEnd     = ThreadWiseWork[(iColor-1)*2+2-1];
#else
   iStart = 1;
   iEnd   = *NE;
#endif
   for (E = iStart; E < iEnd+1; E++) {
      for (V = 1; V < 5; V++) {
         // Pull a copy of the vectors
         iCV_E = (C[(E-1)*4+V-1]-1)*iNA-1;
         for (A = 1; A < iNA+1; A++) {
            SRHS[(V-1)*iNA+A-1] = RHS[iCV_E + A];
#ifdef Local_TryLHStoo
            SLHS[(V-1)*iNA+A-1] = LHS[iCV_E + A];
#endif
         }
      }
      for (V = 1; V < 5; V++) {
#ifndef Local_TryLHStoo
         iCV_E = (C[(E-1)*4+V-1]-1)*iNA-1; // II = C(V,E)
#endif
         for (GP = 1; GP < 5; GP++) {
            iGP   = (GP-1)*4-1;
            iEGP  = (E-1)*4+GP-1;
            iEGP1 = (E-1)*4*3*4 + (GP-1)*3*4 + (1-1)*4-1;
            iEGP2 = (E-1)*4*3*4 + (GP-1)*3*4 + (2-1)*4-1;
            iEGP3 = (E-1)*4*3*4 + (GP-1)*3*4 + (3-1)*4-1;

            LocalConst1 = FEW[iEGP]*CF[E-1];
            LocalConst2 = FEW[iEGP]*CU[E-1];
            LocalConst3 = FEW[iEGP]*CP[E-1];
            for (VV = 1; VV < 5; VV++) {
               iCVV_E = (VV-1)*iNA-1; //(C[(E-1)*4+VV-1]-1)*iNA-1;
               // Angle-independent products, hoisted out of the A loop
               Products[ 1] = LocalConst1*FES[iGP+V]  *FES[iGP+VV];
               Products[ 2] = LocalConst2*FED[iEGP1+V]*FES[iGP+VV];
               Products[ 3] = LocalConst2*FED[iEGP2+V]*FES[iGP+VV];
               Products[ 4] = LocalConst2*FED[iEGP3+V]*FES[iGP+VV];
               Products[ 5] = LocalConst3*FED[iEGP1+V]*FED[iEGP1+VV];
               Products[ 6] = LocalConst3*FED[iEGP2+V]*FED[iEGP2+VV];
               Products[ 7] = LocalConst3*FED[iEGP3+V]*FED[iEGP3+VV];
               Products[ 8] = LocalConst3*(FED[iEGP1+V]*FED[iEGP2+VV]+FED[iEGP2+V]*FED[iEGP1+VV]);
               Products[ 9] = LocalConst3*(FED[iEGP1+V]*FED[iEGP3+VV]+FED[iEGP3+V]*FED[iEGP1+VV]);
               Products[10] = LocalConst3*(FED[iEGP2+V]*FED[iEGP3+VV]+FED[iEGP3+V]*FED[iEGP2+VV]);
Metrics for the angle (A) loop below:

Time                 22.1%
IPC per Core         1.95
Loads per Cycle      1.09
L1 Hits per Cycle    1.21
L1 Miss Ratio        0.07%
L2 Miss Ratio        23.10%
L3 Miss Ratio        31.02%
L2 B/W Utilized      0.30%
L3 B/W Utilized      0.29%
DRAM B/W Utilized    0.01%
               for (A = 1; A < iNA+1; A++) {
#ifdef Local_TryLHStoo
                  SLHS[iCVV_E+A] = SLHS[iCVV_E+A]
                     + Products[ 1]              *SRHS[iCVV_E+A]    // F
                     + Products[ 2]*OM[A-1]      *SRHS[iCVV_E+A]    // U^T(1)
                     + Products[ 3]*OM[iNA+A-1]  *SRHS[iCVV_E+A]    // U^T(2)
                     + Products[ 4]*OM[2*iNA+A-1]*SRHS[iCVV_E+A]    // U^T(3)
                     + Products[ 5]*OO[A-1]      *SRHS[iCVV_E+A]    // P(1,1)
                     + Products[ 6]*OO[iNA+A-1]  *SRHS[iCVV_E+A]    // P(2,2)
                     + Products[ 7]*OO[2*iNA+A-1]*SRHS[iCVV_E+A]    // P(3,3)
                     + Products[ 8]*OO[3*iNA+A-1]*SRHS[iCVV_E+A]    // P(1,2)
                     + Products[ 9]*OO[4*iNA+A-1]*SRHS[iCVV_E+A]    // P(1,3)
                     + Products[10]*OO[5*iNA+A-1]*SRHS[iCVV_E+A];   // P(2,3)
#else
                  LHS[iCV_E+A] = LHS[iCV_E+A]
                     + Products[ 1]              *SRHS[iCVV_E+A]    // F
                     + Products[ 2]*OM[A-1]      *SRHS[iCVV_E+A]    // U^T(1)
                     + Products[ 3]*OM[iNA+A-1]  *SRHS[iCVV_E+A]    // U^T(2)
                     + Products[ 4]*OM[2*iNA+A-1]*SRHS[iCVV_E+A]    // U^T(3)
                     + Products[ 5]*OO[A-1]      *SRHS[iCVV_E+A]    // P(1,1)
                     + Products[ 6]*OO[iNA+A-1]  *SRHS[iCVV_E+A]    // P(2,2)
                     + Products[ 7]*OO[2*iNA+A-1]*SRHS[iCVV_E+A]    // P(3,3)
                     + Products[ 8]*OO[3*iNA+A-1]*SRHS[iCVV_E+A]    // P(1,2)
                     + Products[ 9]*OO[4*iNA+A-1]*SRHS[iCVV_E+A]    // P(1,3)
                     + Products[10]*OO[5*iNA+A-1]*SRHS[iCVV_E+A];   // P(2,3)
#endif
                  /*...Commented Out Code...*/
               } // A
            } // VV
         } // GP
      } // V
      // Store the LHS result
#ifdef Local_TryLHStoo
      for (V = 1; V < 5; V++) {
         iCV_E = C[(E-1)*4+V-1]-1; // II = C(V,E)
         for (A = 1; A < iNA+1; A++) {
            LHS[iCV_E*iNA + A-1] = SLHS[(V-1)*iNA+A-1];
         }
      }
#endif
   } // E
#ifdef WITHOMP
 } // iTask
#pragma omp barrier
#else
   break;
#endif
} // iColor

} // END SUBROUTINE ApplyA_AVE2_Tet_SUPG
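
The essential change from AVE-1 is visible in Products[]: everything that does not depend on the angle index A is computed once per (V,GP,VV) combination and reused across all NA angles. A stripped-down illustration of that hoisting pattern (hypothetical names, not the mini-app's code):

   // AVE-1 flavor: the angle-invariant product is recomputed for every angle.
   void apply_unhoisted(int na, double w, double c, double s1, double s2,
                        double *lhs, const double *rhs) {
      for (int a = 0; a < na; a++)
         lhs[a] += w*c*s1*s2 * rhs[a];     // 4 mults per angle
   }

   // AVE-2 flavor: hoist the angle-invariant product out of the angle loop.
   void apply_hoisted(int na, double w, double c, double s1, double s2,
                      double *lhs, const double *rhs) {
      double p = w*c*s1*s2;                // 3 mults, done once
      for (int a = 0; a < na; a++)
         lhs[a] += p * rhs[a];             // 1 mult per angle
   }

The gather of RHS into the contiguous SRHS scratch array serves a similar purpose: the innermost loop streams unit-stride data instead of chasing the connectivity indirection for every angle.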

SolveWGS_PassThrough_AVE3

// Main subroutine header
void SolveWGS_PassThrough_AVE3(double *RHS_C, double *LHS_C) {
// double LHS_C(NumAngles*NumVertices),RHS_C(NumAngles*NumVertices)
int MyThreadID;
int I,J,K,iRowStart,iRowEnd,iV_A,iVV_A,iOff;

#ifdef WITHOMP
   MyThreadID = omp_get_thread_num() + 1;
   I = NumVertices/NumThreads;
   iRowStart = (MyThreadID-1)*I + 1;
   iRowEnd   = MyThreadID*I;
   if (MyThreadID == NumThreads) {iRowEnd = NumVertices;}
#else
   MyThreadID = 1;
   iRowStart = 1;
   iRowEnd   = NumVertices;
#endif
#ifdef WITHBGQHPM
   if (MyThreadID == 1) {hpm_start("AVE3_ApplyA");}  // Starts the hardware counters
#endif
// This barrier ensures that the incoming vector is fully defined by all threads
#ifdef WITHOMP
#pragma omp barrier
#endif
Metrics for the loop nest below:

Time                 0.02%
IPC per Core         2.56
Loads per Cycle      0.82
L1 Hits per Cycle    0.95
L1 Miss Ratio        8.89%
L2 Miss Ratio        34.93%
L3 Miss Ratio        23.70%
L2 B/W Utilized      29.50%
L3 B/W Utilized      39.51%
DRAM B/W Utilized    3.39%
for (I = iRowStart; I < iRowEnd+1; I++) {
   for (J = NZS_RowLoc[I-1]; J < NZS_RowLoc[I]; J++) {
      iV_A  = (I-1)*NumAngles-1;
      iVV_A = (NZS_ColNum[J-1]-1)*NumAngles-1;
      iOff  = (J-1)*NumAngles-1;
      for (K = 1; K < NumAngles+1; K++) {
         LHS_C[iV_A+K] = LHS_C[iV_A+K] + NZS_Data[iOff+K]*RHS_C[iVV_A+K];
      }
   }
}
// This second barrier is needed because the FGMRES threadwise splitting is
// currently different from the splitting above. That could likely be changed.
#ifdef WITHOMP
#pragma omp barrier
#endif
#ifdef WITHBGQHPM
   if (MyThreadID == 1) {hpm_stop("AVE3_ApplyA");}  // Stops the hardware counters
#endif
}
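
For reference, the loop nest above is a block-variant CSR matrix-vector product: NZS_RowLoc/NZS_ColNum give the vertex-level sparsity, and each stored nonzero carries NumAngles values applied angle-by-angle. A self-contained 0-based sketch with made-up data (the array names mirror the listing; the numbers do not):

   #include <stdio.h>

   // Block-CSR y += A*x sketch: each stored nonzero carries NA values,
   // one per angle, applied to the matching angle slot of the vectors.
   // Made-up 2-vertex, 2-angle system.
   enum { NV = 2, NA = 2, NNZ = 3 };

   int    NZS_RowLoc[NV+1] = {0, 2, 3};        // row pointers (vertices)
   int    NZS_ColNum[NNZ]  = {0, 1, 1};        // column (vertex) indices
   double NZS_Data[NNZ*NA] = {1,1, 2,2, 3,3};  // NA entries per nonzero

   int main(void) {
      double RHS_C[NV*NA] = {1, 1, 1, 1};
      double LHS_C[NV*NA] = {0};
      for (int i = 0; i < NV; i++)
         for (int j = NZS_RowLoc[i]; j < NZS_RowLoc[i+1]; j++)
            for (int a = 0; a < NA; a++)
               LHS_C[i*NA + a] += NZS_Data[j*NA + a] * RHS_C[NZS_ColNum[j]*NA + a];
      for (int k = 0; k < NV*NA; k++) printf("%g\n", LHS_C[k]);
      return 0;
   }

For the toy data above this prints 3 for all four (vertex, angle) entries.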