//============================================================================== // 7-point stencil benchmark designed to highlight key optimizations // compile with: // cc -Ofast -qopenmp stencil.c -no-ipo -fno-inline-functions -g // run (1P of a 2P HSW Cori node): // # n.b. SLURM on Cori/HSW defines a 'cpu' as a HW thread. Thus 32 CPUs == 16 cores. // export OMP_NUM_THREADS=16 // export KMP_AFFINITY=verbose,granularity=thread,compact,1 // srun -n1 --cpus-per-task=32 ./a.out // //============================================================================== #include #include #include #include //============================================================================== #define DIM 512 #define TIME 10 //============================================================================== void bench_stencil_ver0(double * __restrict__ X, double * __restrict__ Y, int dim, int iStride, int jStride, int kStride){ double StartTime, ElapsedTime=0; int nIterations=0; int i,j,k; double * __restrict__ old = X; double * __restrict__ new = Y; double * __restrict__ temp; StartTime = omp_get_wtime(); while(ElapsedTime < TIME){ #pragma omp parallel for for(k=1;k>3);i++){ X[i] = 0.0; Y[i] = 0.0; } //---------------------------------------------------------------------------- bench_stencil_ver0(X,Y,DIM,1,(DIM+2),(DIM+2)*(DIM+2)); bench_stencil_ver1(X,Y,DIM,1,(DIM+2),(DIM+2)*(DIM+2)); bench_stencil_ver2(X,Y,DIM,1,(DIM+2),(DIM+2)*(DIM+2)); X=X+7; // make first non-ghost zone 64B aligned Y=Y+7; bench_stencil_ver3(X,Y,DIM,1,(DIM+8),(DIM+8)*(DIM+2)); // pad unit-stride == lower AI but better SIMD bench_stencil_ver4(X,Y,DIM,1,(DIM+8),(DIM+8)*(DIM+2)); }