MPI users MUST now request mpiprocs in their resource specification
To get 32 cores running across 4 medium nodes
#PBS -l select=4:ncpus=8:NodeType=medium:mpiprocs=8
To get 24 cores running across 2 extra large nodes, with only 6 MPI processes per node and without sharing the nodes with other jobs, you could use:
#PBS -l select=2:ncpus=12:NodeType=xl:mpiprocs=6
Intel MPI Usage
compiler to be used
Binaries need to be compiled using the MPI compilers.
We have intel mpi and sgi mpi.
intel mpi
####For intel mpi, use the following syntax####
#module load intel/2019up5/mpi
module load intel-fc-11/
module load intel-mpi/
module load mpt/2.02
mpicc mpiexec mpif90 mpigcc mpiicc mpiifort mpitune mpicxx mpiexec.hydra mpif77 mpifc mpigxx mpiicpc mpirun.actual mpivars.csh
sample code
cat p1.c
/*
 * p1.c - Sum processor numbers by passing them around in a ring.
 *
 * Each rank starts with its own rank number, then repeatedly sends its
 * current value to the rank on its right and receives a value from the
 * rank on its left, adding what it receives to a running sum. After
 * numprocs steps every rank has seen every rank number once.
 */
#include "mpi.h"
#include <stdio.h>

/* Set up communication tags (these can be anything) */
#define to_right 201
#define to_left 102 /* defined for symmetry; not used by this example */

int main(int argc, char *argv[])
{
    int value, new_value, procnum, numprocs;
    int right, left, sum, i;
    MPI_Status recv_status;

    /* Initialize MPI */
    MPI_Init(&argc, &argv);

    /* Find out this processor number */
    MPI_Comm_rank(MPI_COMM_WORLD, &procnum);

    /* Find out the number of processors */
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

    /* Compute rank (number) of processors to the left and right,
     * wrapping around at both ends of the ring */
    right = procnum + 1;
    if (right == numprocs)
        right = 0;
    left = procnum - 1;
    if (left == -1)
        left = numprocs - 1;

    sum = 0;
    value = procnum;
    for (i = 0; i < numprocs; i++) {
        /*
         * NOTE(review): every rank sends before it receives, so this
         * relies on the MPI library buffering the one-int message.
         * MPI_Sendrecv would remove that assumption.
         */

        /* Send to the right */
        MPI_Send(&value, 1, MPI_INT, right, to_right, MPI_COMM_WORLD);

        /* Receive from the left */
        MPI_Recv(&new_value, 1, MPI_INT, left, to_right, MPI_COMM_WORLD,
                 &recv_status);

        /* Sum the new value */
        sum = sum + new_value;

        /* Update the value to be passed */
        value = new_value;

        /* Print out the partial sums at each step */
        printf("PE %d:\t Partial sum = %d\n", procnum, sum);
    }

    /* Print out the final result */
    if (procnum == 0) {
        printf("Sum of all processor numbers = %d\n", sum);
    }

    /* Shut down MPI */
    MPI_Finalize();

    /* main must return int; 0 reports success to the launcher */
    return 0;
}
Use mpiicc, mpiifort, or mpiicpc for compilation
module load intel-fc-11/
module load intel-mpi/
module load mpi/intel-4.0
mpicc -o p1.out p1.c -lmpi
ls /export/home/snumber/pbs/mpi
mpd.hosts p1.c p1.out
running the job as an mpi job
Manual test run
Please make sure the mpd.hosts file exists. There is a sample mpd.hosts file further down this page. It is only needed for a manual run. It is not needed when the script is run using PBS, either interactively or in batch.
The following syntax will run it with 2 processes (set by the option "-n 2"):
mpirun -r ssh -f /export/home/snumber/pbs/mpi/mpd.hosts -n 2 /export/home/snumber/pbs/mpi/p1.out PE 0: Partial sum = 1 PE 0: Partial sum = 1 Sum of all processor numbers = 1 PE 1: Partial sum = 0 PE 1: Partial sum = 1
pbs run
cat pbs.run1
#!/bin/bash -l
# pbs.run1 - PBS batch script that runs the p1.out example under Intel MPI.
#PBS -m abe
#PBS -V
### Mail to user (replace the placeholder with your address)
#PBS -M <yourEmail>
### Job name
#PBS -N mpi_test
#PBS -l walltime=01:00:00
### Number of nodes:Number of CPUs:Number of MPI processes per node
#PBS -l select=2:ncpus=4:mpiprocs=4

# Total MPI processes = select * mpiprocs from the line above (2 * 4).
NPROCS=8

source "$HOME/.bashrc"

##module load intel-fc-11/
module load intel-mpi/
##module load intel-cc-11/11.1.072
##module load intel-mpi/

##export I_MPI_DEBUG=100
export I_MPI_PLATFORM=auto
PBS_EXEC=/opt/pbs/default
export I_MPI_MPD_RSH=ssh

echo "This jobs runs on the following processors:"
# The substitution is deliberately unquoted so the node list is printed
# on a single line, as in the original script.
echo $(cat "$PBS_NODEFILE")

mpirun -n "$NPROCS" /export/home/s2594054/pbs/mpi/intelMPI/p1.out 2>&1

echo "Done with job"
qsub pbs.run1 828.pbsserver [snumber@n027 mpi]$ qstat Job id Name User Time Use S Queue ---------------- ---------------- ---------------- -------- - ----- 818.pbsserver 1ivf_apo s2795116 00:00:01 R workq 819.pbsserver 1nn2 s2795116 00:00:01 R workq 821.pbsserver 1ivg s2795116 00:00:00 R workq 825.pbsserver 3nss s2795116 00:00:00 R workq 826.pbsserver 1ivf_naen s2795116 00:00:00 R workq 828.pbsserver mpi s123456 00:00:00 R workq more mpi.o828 Starting job PE 8: Partial sum = 7 PE 9: Partial sum = 8 PE 9: Partial sum = 15 PE 14: Partial sum = 13 PE 3: Partial sum = 2 PE 12: Partial sum = 11 PE 0: Partial sum = 17 PE 16: Partial sum = 15 PE 5: Partial sum = 4 PE 10: Partial sum = 9 PE 10: Partial sum = 17 PE 10: Partial sum = 24 PE 11: Partial sum = 10 <snip>
Important Note
It looks like the wrapper script, mpirun, might not be working properly. mpirun wrapper script actually executes 3 other commands:
mpdboot (to start the MPD daemons), then mpiexec (to run your app), and finally mpdallexit (to close out the MPD ring). You may use the following pbs script as a workaround.
Update: This issue has been resolved by editing the Intel MPI launcher script and the pbs_remsh file (see the tight-integration notes at the end of this page), so the workaround below is no longer needed. The documentation is kept in case the problem returns after a future upgrade.
#PBS -m abe #PBS -M #PBS -N romsrun-intel #PBS -l select=3:ncpus=4:mem=4g:mpiprocs=4 #source $HOME/.bashrc module load intel-fc-11/ module load intel-mpi/ module load NetCDF/3.6.3 ROMS_DIR=/export/home/s123456/modelling/roms/forecast/work HOME_DIR=/export/home/123456 ## The number of nodes is given by the select =<NUM > above NODES=3 #$PBS_NODEFILE is a node-list file created with select and mpiprocs options by PBS # The number of MPI processes available is mpiprocs * nodes (=NPROCS) NPROCS=12 export I_MPI_MPD_RSH=ssh export I_MPI_DEBUG=5 echo "Starting job" date ${ROMS_DIR}/ mpdboot -n $NODES -f $PBS_NODEFILE -r ssh -v mpdtrace mpiexec -np $NPROCS ${ROMS_DIR}/oceanG ${ROMS_DIR}/ mpdallexit date echo "Done with job"
Another intel mpi example
This program calls MPI_Allgather to gather one int from every host:
/*
 *
 * Unit test for Intel MPI failure.
 *
 * mpicc -mt_mpi -o lgcGab lgcGab.c
 *
 * Initialises MPI with MPI_THREAD_MULTIPLE, duplicates MPI_COMM_WORLD,
 * then calls MPI_Allgather on MPI_COMM_WORLD so that every rank
 * contributes one int (its own rank number).
 *
 */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char** argv)
{
    int provided;
    MPI_Comm dup_comm;  /* duplicate of MPI_COMM_WORLD; freed before exit */
    MPI_Comm comm;      /* communicator actually used for the collective */
    int rank;
    int size;
    int* sendbuf;
    int* recvbuf;

    MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided);
    if (provided<MPI_THREAD_MULTIPLE)
    {
        printf("Need MPI_THREAD_MULTIPLE=%d but got %d\n",
               MPI_THREAD_MULTIPLE,provided);
        MPI_Abort(MPI_COMM_WORLD,1);
    }

    /*
     * The original code duplicated the communicator and then overwrote
     * the handle with MPI_COMM_WORLD, leaking the duplicate. The
     * duplicate is still created, but kept in its own variable so it
     * can be released; the collective still runs on MPI_COMM_WORLD.
     */
    MPI_Comm_dup(MPI_COMM_WORLD,&dup_comm);
    comm = MPI_COMM_WORLD;

    MPI_Comm_rank(comm,&rank);
    MPI_Comm_size(comm,&size);

    /* One int to send, one slot per rank to receive. */
    sendbuf = (int*)malloc(sizeof(int));
    recvbuf = (int*)malloc(size*sizeof(int));
    if (sendbuf == NULL || recvbuf == NULL)
    {
        printf("Out of memory allocating buffers for %d ranks\n", size);
        MPI_Abort(MPI_COMM_WORLD,1);
    }

    sendbuf[0] = rank;

    //MPI_Gather(sendbuf,1,MPI_INT, recvbuf,1,MPI_INT, 0,comm);

    MPI_Allgather(sendbuf,1,MPI_INT, recvbuf,1,MPI_INT, comm);

    //MPI_Barrier(comm);

    free(recvbuf);
    free(sendbuf);
    MPI_Comm_free(&dup_comm);

    MPI_Finalize();

    return 0;
}
mpicc -mt_mpi test.c -o test.o
#!/bin/bash -l
# PBS batch script that runs the MPI_Allgather test program (test.o)
# under Intel MPI with verbose MPI debugging enabled.
#PBS -m abe
#PBS -V
### Mail to user
####PBS -M
### Job name
#PBS -N mpi_test
#PBS -l walltime=100:00:00
### Number of nodes:Number of CPUs:Number of MPI processes per node
#PBS -l select=3:ncpus=2:mpiprocs=2

# This job's working directory
echo "Working directory is $PBS_O_WORKDIR"
##cd $PBS_O_WORKDIR

source "$HOME/.bashrc"
module load intel-mpi/
module load intel-cc-11/11.1.072

echo "Starting job"
echo "Running on host $(hostname)"
echo "Time is $(date)"
echo "Directory is $(pwd)"

export I_MPI_MPD_RSH=ssh
export I_MPI_DEBUG=100
export I_MPI_PLATFORM=auto
PBS_EXEC=/opt/pbs/default
export PATH="${PATH}:${PBS_EXEC}/bin"
export PBS_RSHCOMMAND=/usr/bin/ssh

# Total MPI processes = select * mpiprocs from the resource line (3 * 2).
NPROCS=6

#sh /sw/sdev/intel/mpi/
#echo This jobs runs on the following processors:
# The substitution is deliberately unquoted so the node list is printed
# on a single line, as in the original script.
echo $(cat "$PBS_NODEFILE")

# A single env invocation sets all four variables for the launched program;
# the original chained four separate env commands to the same effect.
mpirun -n "$NPROCS" \
  env I_MPI_MPD_RSH=ssh I_MPI_DEBUG=100 \
    PATH="$PATH" LD_LIBRARY_PATH="$LD_LIBRARY_PATH" \
  /export/home/s123456/pbs/mpi/intelMPI/test.o 2>&1

echo "Done with job"
mpirun -n 6 env I_MPI_MPD_RSH=ssh env I_MPI_DEBUG=100 env PATH=$PATH env LD_LIBRARY_PATH=$LD_LIBRARY_PATH /export/home/s123456/pbs/mpi/intelMPI/test.o [0] MPI startup(): Intel(R) MPI Library, Version 4.0 Build 20100224 [0] MPI startup(): Copyright (C) 2003-2010 Intel Corporation. All rights reserved. [0] MPI startup(): Found 1 IB devices [1] MPI startup(): Found 1 IB devices [4] MPI startup(): Found 1 IB devices [5] MPI startup(): Found 1 IB devices [2] MPI startup(): Found 1 IB devices [3] MPI startup(): Found 1 IB devices [1] MPI startup(): Open 0 IB device: mlx4_0 [0] MPI startup(): Open 0 IB device: mlx4_0 [4] MPI startup(): Open 0 IB device: mlx4_0 [5] MPI startup(): Open 0 IB device: mlx4_0 [1] MPI startup(): Start 1 ports per adapter [2] MPI startup(): Open 0 IB device: mlx4_0 [3] MPI startup(): Open 0 IB device: mlx4_0 [0] MPI startup(): Start 1 ports per adapter [4] MPI startup(): Start 1 ports per adapter [5] MPI startup(): Start 1 ports per adapter [3] MPI startup(): Start 1 ports per adapter [2] MPI startup(): Start 1 ports per adapter [0] MPI startup(): shm and ofa data transfer modes [1] MPI startup(): shm and ofa data transfer modes [2] MPI startup(): shm and ofa data transfer modes [3] MPI startup(): shm and ofa data transfer modes [4] MPI startup(): shm and ofa data transfer modes [5] MPI startup(): shm and ofa data transfer modes [0] MPI startup(): I_MPI_DEBUG=100 [0] MPI startup(): I_MPI_FABRICS=shm:ofa [0] MPI startup(): set domain {0,1,2,3,4,5} fails on node n006 [1] MPI startup(): set domain {0,1,2,3,4,5} fails on node n006 [3] MPI startup(): set domain to {6,7,8,9,10,11,18,19,20,21,22,23} on node n007 [2] MPI startup(): set domain to {0,1,2,3,4,5,12,13,14,15,16,17} on node n007 [5] MPI startup(): set domain to {6,7,8,9,10,11,18,19,20,21,22,23} on node n009 [4] MPI startup(): set domain to {0,1,2,3,4,5,12,13,14,15,16,17} on node n009 [0] MPI startup(): Recognition level=2. Platform code=1. Device=8 [1] MPI startup(): Recognition level=2. 
Platform code=1. Device=8 [1] MPI startup(): Parent configuration:(intra=1 inter=2 flags=0), (code=1 ppn=1) [3] MPI startup(): Heterogeneous cluster. Generic defauts are selected [3] MPI startup(): Recognition level=2. Platform code=1. Device=8 [3] MPI startup(): Parent configuration:(intra=1 inter=2 flags=0), (code=1 ppn=1) [2] MPI startup(): Heterogeneous cluster. Generic defauts are selected [2] MPI startup(): Recognition level=2. Platform code=1. Device=8 [2] MPI startup(): Parent configuration:(intra=1 inter=2 flags=0), (code=1 ppn=1) [0] MPI startup(): Parent configuration:(intra=1 inter=2 flags=0), (code=1 ppn=1) [0] Allgather: 1: 0-512 & 3-16 [4] MPI startup(): Heterogeneous cluster. Generic defauts are selected [4] MPI startup(): Recognition level=2. Platform code=1. Device=8 [4] MPI startup(): Parent configuration:(intra=1 inter=2 flags=0), (code=1 ppn=1) [0] Allgather: 1: 0-2048 & 9-16 [5] MPI startup(): Heterogeneous cluster. Generic defauts are selected [5] MPI startup(): Recognition level=2. Platform code=1. 
Device=8 [5] MPI startup(): Parent configuration:(intra=1 inter=2 flags=0), (code=1 ppn=1) [0] Allgather: 1: 0-512 & 33-2147483647 [0] Allgather: 2: 0-1024 & 17-32 [0] Allgather: 3: 0-2147483647 & 0-2147483647 [0] Allgatherv: 0: 0-2147483647 & 0-2147483647 [0] Allreduce: 1: 0-1024 & 0-2147483647 [0] Allreduce: 1: 0-4096 & 0-4 [0] Allreduce: 1: 0-2048 & 0-8 [0] Allreduce: 1: 0-16384 & 33-2147483647 [0] Allreduce: 2: 1025-2097152 & 3-32 [0] Allreduce: 4: 16385-524288 & 33-2147483647 [0] Allreduce: 3: 0-2147483647 & 0-2147483647 [0] Alltoall: 1: 0-32 & 17-2147483647 [0] Alltoall: 1: 0-128 & 33-2147483647 [0] Alltoall: 2: 0-32768 & 3-32 [0] Alltoall: 2: 32768-2147483647 & 3-4 [0] Alltoall: 3: 0-262144 & 0-2 [0] Alltoall: 3: 513-2147483647 & 33-2147483647 [0] Alltoall: 4: 0-2147483647 & 0-2147483647 [0] Alltoallv: 1: 0-2147483647 & 0-32 [0] Alltoallv: 2: 0-2147483647 & 0-2147483647 [0] Alltoallw: 0: 0-2147483647 & 0-2147483647 [0] Barrier: 2: 0-2147483647 & 0-32 [0] Barrier: 4: 0-2147483647 & 0-2147483647 [0] Bcast: 1: 0-1024 & 0-8 [0] Bcast: 1: 0-8192 & 3-4 [0] Bcast: 7: 0-2147483647 & 0-2147483647 [0] Exscan: 0: 0-2147483647 & 0-2147483647 [0] Gather: 1: 0-64 & 0-2 [0] Gather: 3: 0-2147483647 & 0-2147483647 [0] Gatherv: 1: 0-2147483647 & 0-2147483647 [0] Reduce_scatter: 5: 17-32 & 9-16 [0] Reduce_scatter: 5: 17-64 & 17-32 [0] Reduce_scatter: 5: 0-128 & 33-2147483647 [0] Reduce_scatter: 1: 0-32768 & 3-2147483647 [0] Reduce_scatter: 2: 0-2147483647 & 0-2147483647 [0] Reduce: 3: 524289-2097152 & 0-2 [0] Reduce: 3: 129-8192 & 3-4 [0] Reduce: 1: 0-2147483647 & 0-2147483647 [0] Scan: 0: 0-2147483647 & 0-2147483647 [0] Scatter: 1: 0-2048 & 0-2 [0] Scatter: 3: 0-2147483647 & 0-2147483647 [0] Scatterv: 1: 0-2147483647 & 0-2 [0] Scatterv: 2: 0-2147483647 & 0-2147483647 [0] Rank Pid Node name Pin cpu [0] 0 12100 n006 n/a [0] 1 12099 n006 n/a [0] 2 32239 n007 {0,1,2,3,4,5,12,13,14,15,16,17} [0] 3 32240 n007 {6,7,8,9,10,11,18,19,20,21,22,23} [0] 4 3951 n009 
{0,1,2,3,4,5,12,13,14,15,16,17} [0] 5 3952 n009 {6,7,8,9,10,11,18,19,20,21,22,23}
benchmarking intel mpi
module load intel-mpi/ module load intel-cc-11/11.1.072 NPROCS=4 mpirun -n $NPROCS env I_MPI_MPD_RSH=ssh env I_MPI_DEBUG=5 env PATH=$PATH env LD_LIBRARY_PATH=$LD_LIBRARY_PATH /sw/sdev/intel/mpi/
openMPI usage
Please note that the syntax for openmpi's mpirun is a little different from intel-mpi's mpirun. Please see a sample below.
module load mpi/openMPI/1.4.3-gnu ###For openmpi, use the following syntax#### #module load mpi/openmpi/4.0.2 #mpiexec $PROGRAM NAME < $INPUT FILE >& $OUTPUT FILE
Sample run
openMPI has a different mechanism to pass the local environmental variables from master node to slave nodes. You have three options:
1. source ~/.bashrc
2. Use mpirun's --prefix command line option (described below).
3. Modify the wrapper compilers to include directives to include run-time search locations for the Open MPI libraries (see this FAQ entry)
mpirun's --prefix command line option takes as an argument the top-level directory where Open MPI was installed. While relative directory names are possible, they can become ambiguous depending on the job launcher used; using absolute directory names is strongly recommended.
For example, say that Open MPI was installed into /sw/openMPI/1.4.3-gnu/. You would use the --prefix option like this:
shell$ mpirun --prefix /sw/openMPI/1.4.3-gnu/ -np 4 a.out
This will prefix the PATH and LD_LIBRARY_PATH on both the local and remote hosts with /sw/openMPI/1.4.3-gnu/bin and /sw/openMPI/1.4.3-gnu/lib, respectively.
Beginning with the 1.2 series, it is possible to make this the default behavior by passing to configure the flag --enable-mpirun-prefix-by-default. This will make mpirun behave exactly the same as "mpirun --prefix $prefix ...", where $prefix is the value given to --prefix in configure.
Finally, note that specifying the absolute pathname to mpirun is equivalent to using the --prefix argument. For example, the following is equivalent to the above command line that uses --prefix:
/sw/openMPI/1.4.3-gnu/bin/mpirun -np 4 a.out
#!/bin/bash -l
# PBS batch script that runs swanrun under openMPI 1.4.3, passing PATH and
# LD_LIBRARY_PATH explicitly to the launched processes.
# The shebang matches the other bash job scripts on this page; this script
# sources ~/.bashrc, so it is expected to run under bash.
#PBS -m e
#PBS -M <YourEmail>
#PBS -N Test
#PBS -l select=6:ncpus=4:mem=12g:mpiprocs=4

source "$HOME/.bashrc"
module load swan/mpi/4085-gnu

## The number of nodes is given by the select =<NUM > above
NODES=6
###$PBS_NODEFILE is a node-list file created with select and mpiprocs options by PBS
###### The number of MPI processes available is mpiprocs * nodes (=NPROCS)
NPROCS=24

echo "Starting job"

# mpirun is called by absolute path, which this page describes as equivalent
# to passing --prefix. SWAN_FORECAST_WORK is not set in this script; it is
# presumably provided by ~/.bashrc or the swan module - confirm.
/sw/openMPI/1.4.3-gnu/bin/mpirun -machinefile "$PBS_NODEFILE" -np "$NPROCS" \
  env PATH="$PATH" LD_LIBRARY_PATH="$LD_LIBRARY_PATH" \
  swanrun -input "${SWAN_FORECAST_WORK}/swan_eau_forecast.swn"

# echo "Done with job"
# NOTE(review): a stray word "file" followed the line above on this page; it
# looks like residue from the next heading rather than part of the script.
All compute Nodes
cat mpd.hosts
GPU nodes
cat mpd.hosts
SGI mpi - SGI-mpt
Sample code
cat pingpong.c
#include <stdio.h> #include "mpi.h" #define ONE_MIL (1000000) #define MAX_SIZE_LOG (21) #define MAX_SIZE (1<<MAX_SIZE_LOG) char sdata[MAX_SIZE]; char rdata[MAX_SIZE]; main( int argc, char **argv ) { int rank, i, size, j, next, prev; long rounds, msglen; MPI_Status status; double startt,endt; char buf[128]; MPI_Init( &argc, &argv ); MPI_Comm_rank( MPI_COMM_WORLD, &rank ); MPI_Comm_size( MPI_COMM_WORLD, &size ); if (rank == 0) { printf("STARTING LATENCY AND BANDWIDTH BENCHMARK\n"); fflush(stdout); } next = (rank + 1 ) % size; prev = (rank + size - 1 ) % size; msglen = 0; while (msglen <= MAX_SIZE) { if (msglen <= (1<<9)) rounds = 1 << 15; else if (msglen <= (1<<12)) rounds = 1 << 12; else if (msglen <= (1<<18)) rounds = 1 << 10; else rounds = 1 << 8; MPI_Barrier ( MPI_COMM_WORLD ); startt = MPI_Wtime(); for (j=rounds; j--; ) { if (rank == 0) { MPI_Send(sdata,msglen,MPI_BYTE,next,0,MPI_COMM_WORLD); MPI_Recv(rdata,msglen,MPI_BYTE,prev,0,MPI_COMM_WORLD,&status); } else { MPI_Recv(rdata,msglen,MPI_BYTE,prev,0,MPI_COMM_WORLD,&status); MPI_Send(sdata,msglen,MPI_BYTE,next,0,MPI_COMM_WORLD); } } if (rank == 0) { double latency,bandwidth, elapse; endt = MPI_Wtime(); elapse = endt - startt; latency = elapse / (double)(size*rounds) * (double)ONE_MIL; bandwidth = ((double)(size*rounds*msglen))/elapse/(double)ONE_MIL; printf("%3d %8.3f %8.3f\n",(int)msglen,latency,bandwidth); fflush(stdout); } if (msglen == 0) msglen = 1; else msglen = 2 * msglen; } MPI_Finalize(); }
module avail module load mpt module list Currently Loaded Modulefiles: 1) mpt/2.04
Compile the program
gcc -o pingpong_mpt pingpong.c -lmpi
sample run - masternode
mpirun 2 ./pingpong_mpt cat mpd.hosts n005 n006 mpirun -v n005,n006 2 ./pingpong_mpt mpirun -v n003,n004 2 ./pingpong_mpt MPI: 'SGI MPT 2.04 03/15/11 05:36:27' MPI: 'SGI MPT 2.04 03/15/11 05:34:18' MPI: 'SGI MPT 2.04 03/15/11 05:34:18' MPI: Cluster collective optimizations enabled. MPI Environmental Settings MPI: MPI_VERBOSE (default: 0) : 1 MPI: Using the InfiniBand interconnect. STARTING LATENCY AND BANDWIDTH BENCHMARK 0 3.770 0.000 1 3.147 0.318 2 3.120 0.641 4 3.126 1.279 8 3.141 2.547 16 3.185 5.024 32 3.213 9.959 64 3.603 17.765 128 3.891 32.896 256 4.243 60.342 512 4.937 103.709
More Examples
cat hello_mpi.c
/*The Parallel Hello World Program*/ #include <stdio.h> #include <mpi.h> main(int argc, char **argv) { int node; MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &node); printf("Hello World from Node %d\n",node); MPI_Finalize(); }
To compile this:
module load intel-fc-11/ module load intel-mpi/ module load mpi/intel-4.0 mpicc -o hello_mpi hello_mpi.c -lmpi
Sample PBS script
cat pbs.run7
#!/bin/bash -l
# pbs.run7 - PBS batch script that runs the hello_mpi example under Intel MPI.
#PBS -m abe
### Mail to user
#PBS -M <yourEmail>
### Job name
#PBS -N mpi
#PBS -l walltime=60:00:00
## Please note the walltime above . This value * must * be set so that if the
## MPI program runs in an infinite loop , or something similar , it will be
## killed after the given wall time .
### Number of nodes:Number of CPUs:Number of MPI processes per node
#PBS -l select=2:ncpus=12:mpiprocs=7

## The number of nodes is given by the select =<NUM > above
NODES=2
##$PBS_NODEFILE is a node-list file created with select and mpiprocs options by PBS
### The number of MPI processes available is mpiprocs * nodes
NPROCS=14

# This job's working directory
echo "Working directory is $PBS_O_WORKDIR"
# Stop here if the submission directory is unavailable on the execution host,
# rather than carrying on from an unexpected directory.
cd "$PBS_O_WORKDIR" || exit 1

source "$HOME/.bashrc"
module load intel-fc-11/
module load intel-mpi/
module load mpi/intel-4.0

echo "Starting job"
echo "Running on host $(hostname)"
echo "Time is $(date)"
echo "Directory is $(pwd)"

#echo This jobs runs on the following processors:
# The substitution is deliberately unquoted so the node list is printed
# on a single line, as in the sample output on this page.
echo $(cat "$PBS_NODEFILE")

# The first -n is given the node count and the second -n the process count,
# matching the mpirun usage shown elsewhere on this page.
mpirun -f "$PBS_NODEFILE" -n "$NODES" -r ssh -n "$NPROCS" /export/home/snumber/pbs/mpi/2/hello_mpi

echo "Done with job"
qsub pbs.run7 2659.pbsserver qstat 2659.pbsserver mpi s123456 00:00:00 R workq cat mpi.o2659 Working directory is /export/home/SNUMBER/pbs/mpi/2 Starting job Running on host n010 Time is Wed Jul 27 08:37:14 EST 2011 Directory is /export/home/SNUMBER/pbs/mpi/2 n010 n010 n010 n010 n010 n010 n010 n020 n020 n020 n020 n020 n020 n020 Hello World from Node 0 Hello World from Node 2 Hello World from Node 4 Hello World from Node 6 Hello World from Node 3 Hello World from Node 1 Hello World from Node 5 Hello World from Node 9 Hello World from Node 11 Hello World from Node 7 Hello World from Node 13 Hello World from Node 8 Hello World from Node 10 Hello World from Node 12 Done with job
qsub -I pbs.run7 cd $PBS_O_WORKDIR module load intel-fc-11/ module load intel-mpi/ module load mpi/intel-4.0 mpirun -f $PBS_NODEFILE -n 2 -r ssh -n 14 /export/home/SNUMBER/pbs/mpi/2/hello_mpi Hello World from Node 0 Hello World from Node 2 Hello World from Node 1 Hello World from Node 4 Hello World from Node 6 Hello World from Node 3 Hello World from Node 8 Hello World from Node 5 Hello World from Node 13 Hello World from Node 7 Hello World from Node 12 Hello World from Node 10 Hello World from Node 11 Hello World from Node 9 OR mpirun -r ssh -f $PBS_NODEFILE --totalnum=$NPROCS --verbose -l -machinefile $PBS_NODEFILE -np $(wc -l $PBS_NODEFILE | gawk '{print $1}') /export/home/snumber/pbs/mpi/2/hello_mpi running mpdallexit on n016 LAUNCHED mpd on n016 via RUNNING: mpd on n016 LAUNCHED mpd on n020 via n016 RUNNING: mpd on n020 3: Hello World from Node 3 2: Hello World from Node 2 0: Hello World from Node 0 6: Hello World from Node 6 4: Hello World from Node 4 1: Hello World from Node 1 5: Hello World from Node 5 7: Hello World from Node 7 11: Hello World from Node 11 12: Hello World from Node 12 10: Hello World from Node 10 13: Hello World from Node 13 8: Hello World from Node 8 9: Hello World from Node 9
The following could work as well
mpirun -r ssh -f $PBS_NODEFILE --totalnum=$NPROCS --verbose -l -machinefile $PBS_NODEFILE -np 8 /export/home/SNUMBER/pbs/mpi/p1.out mpirun -r ssh -f $PBS_NODEFILE --totalnum=$NPROCS --verbose -l -machinefile $PBS_NODEFILE -np $(wc -l $PBS_NODEFILE | gawk '{print $1}') /export/home/SNUMBER/pbs/mpi/p1.out --totalnum specifies the total number of mpds to start -np number - number of processes -n <n> or -np <n> # number of processes to start mpirun -f $PBS_NODEFILE -n $(cat $PBS_NODEFILE | gawk '{print $1}'|sort|uniq|wc -l) -r ssh -n $(wc -l $PBS_NODEFILE | gawk '{print $1}') /export/home/SNUMBER/pbs/mpi/p1.out To check if mpd is working well: mpdcheck -f $PBS_NODEFILE -v
PBS select line
#PBS -l select=2:ncpus=8:mpiprocs=8
The line "-l select=2:ncpus=8:mpiprocs=8" is the number of processors required for the mpi job. "select" specifies the number of nodes required; "ncpus" indicates the number of CPUs per node required; and "mpiprocs" represents the number of mpi processes to run per node (normally ncpus=mpiprocs).
As this is not the most intuitive command, the following table is provided as guidance as to how this command works:
select | ncpus | mpiprocs | description |
4 | 8 | 8 | 32 Processor job, using 4 nodes and 8 processors per node |
4 | 4 | 4 | 16 Processor job, using 4 nodes and 4 processors per node |
16 | 1 | 1 | 16 Processor job, using 16 nodes running 1 mpi process per processor and utilising 1 processor per node |
16 | 8 | 8 | 128 Processor job, using 16 nodes and 8 processors per node (each running an mpi process) |
PS: tight integration of intelMPI and PBS
For tight integration of intel MPI and PBS, the following needed to be done:
#1 ==> On all execution nodes, add the following: --> vi /opt/pbs/default/bin/pbs_remsh while [ $# -gt 1 ]; do if [ "XX$1" = "XX-j" ]; then shift; jobid=$1 shift; elif [ "XX$1" = "XX-r" ]; then shift; rshcmd=$1 shift; #Add the following 2 lines here. elif [ "XX$1" = "XX-n" ]; then shift; else break; fi done #2 ==> vi /sw/sdev/intel/mpi/ (search for ssh and find an entry similar to this) else: if rshCmd == 'ssh': #rshArgs = '-x -n -q' #Added the following entry rshArgs = '-x -n -q' elif rshCmd == 'pbs_tmrsh': rshArgs = '' elif rshCmd == 'pbs_remsh': rshArgs = '' else: rshArgs = '' mpdHost = hostsAndInfo[idxToStart]['host'] #3 Sample PBS script >>>>>>>>>>>>>>>>>>>>>>> #!/bin/bash -l #PBS -m abe #PBS -V #PBS -M Job name #PBS -N Intel_mpi_test #PBS -l walltime=100:00:00 ### Number of nodes:Number of CPUs:Number of threads per node #PBS -l select=2:ncpus=4:mpiprocs=4 NPROCS=8 #cd $PBS_O_WORKDIR source $HOME/.bashrc module load intel-mpi/ module load intel-cc-11/11.1.072 ##export I_MPI_MPD_RSH=pbs_remsh export I_MPI_MPD_RSH=ssh export I_MPI_DEBUG=100 export I_MPI_PLATFORM=auto PBS_EXEC=/opt/pbs/default export PATH=${PATH}:${PBS_EXEC}/bin mpirun -n $NPROCS /bin/hostname 2>&1 >>>>>>>>>>>>>>>>>>>>>>> # 4 : Further torubleshooting needed? 
If further troubleshooting is needed, add the following entry to /opt/pbs/default/bin/pbs_remsh -----At the beginning -------- #!/bin/sh exec >/tmp/remsh.debug 2>&1 set -x --------------------- At the very bottom, add the following logger "pbs_remsh cooked options: $remsh $host pbs_attach -j $PBS_JOBID $*" $remsh "$host" ${PBS_EXEC}/pbs_attach -j "$PBS_JOBID" $* ---------------------- Look at the logs /tmp/remsh.debug and tail -f /var/log/messages to identify issues Add this entry into : /sw/sdev/intel/mpi/ #!/bin/sh exec >/tmp/remsh.txt 2>&1 set -x ------------------- tail /tmp/remsh.txt n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_6956 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/ -h n022 -p 52589 --ifhn= --ncpus=1 --myhost=n023 --myip= -e -d -s 2 Run it like this to check: pbs_remsh n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_6956 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/ -h n022 -p 52589 --ifhn= --ncpus=1 --myhost=n023 --myip= -e -d -s 2 tail /tmp/remsh.debug + PBS_EXEC=/opt/pbs/default/bin + logger 'pbs_remsh cooked options: /usr/bin/ssh -n -n pbs_attach -j 214742.pbsserver n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_9340 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/ -h n022 -p 35698 --ifhn= --ncpus=1 --myhost=n023 --myip= -e -d -s 2' + /usr/bin/ssh -n -n /opt/pbs/default/bin/pbs_attach -j 214742.pbsserver n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_9340 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/ -h n022 -p 35698 --ifhn= --ncpus=1 --myhost=n023 --myip= -e -d -s 2 ssh: illegal option -- j usage: ssh [-1246AaCfgKkMNnqsTtVvXxYy] [-b bind_address] [-c cipher_spec] [-D [bind_address:]port] [-e escape_char] [-F configfile] [-i identity_file] [-L [bind_address:]port:host:hostport] [-l login_name] [-m mac_spec] [-O ctl_cmd] [-o option] [-p port] [-R 
[bind_address:]port:host:hostport] [-S ctl_path] [-w local_tun[:remote_tun]] [user@]hostname [command] pbs_remsh n023 /bin/hostname mpirun -r pbs_remsh hostname mpirun -r pbs_remsh hostnam mpirun -r pbs_remsh hostname sh -x /sw/sdev/intel/mpi/ -r pbs_remsh hostname mpdboot -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh mpdboot -v -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh mpdboot -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh sh -x /sw/sdev/intel/mpi/ -r pbs_remsh hostname sh -x /sw/sdev/intel/mpi/ -r pbs_remsh hostnameexport MPD_CON_EXT=214742.pbsserver_8221 export MPD_CON_EXT=214742.pbsserver_8221 #5: Try submitting an interactive job qsub -I -l select=3:ncpus=4:mpiprocs=4 -l walltime=100:00:00 On the execution node: export I_MPI_MPD_RSH=ssh export I_MPI_DEBUG=100 NPROCS=12 module load intel-cc-11/11.1.072 module load intel-mpi/ mpirun -n $NPROCS /bin/hostname
An example from openmpi 4
hello_mpi.c /*The Parallel Hello World Program*/ #include <stdio.h> #include <mpi.h> main(int argc, char **argv) { int node; MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &node); printf("Hello World from Node %d\n",node); MPI_Finalize(); } To compile this: module load mpi/openmpi/4.0.2 mpicc -o hello_mpi hello_mpi.c -lmpi ##mpirun -machinefile $PBS_NODEFILE -np $NPROCS hello_mpi
#!/bin/bash
# PBS batch script that runs the hello_mpi example under openmpi 4.0.2.
###PBS -m abe
### Mail to user
##PBS -M <yourEmail>
### Job name
#PBS -N mpi
### Number of nodes:Number of CPUs:Number of MPI processes per node
#PBS -l select=2:ncpus=2:mpiprocs=2:mem=1g,walltime=10:00:00

## The number of chunks is given by the select =<NUM > above
CHUNKS=2
##$PBS_NODEFILE is a node-list file created with select and mpiprocs options by PBS
### The number of MPI processes available is mpiprocs * CHUNKS
NPROCS=4

# This job's working directory
echo "Working directory is $PBS_O_WORKDIR"
# Stop here if the submission directory is unavailable on the execution host,
# rather than carrying on from an unexpected directory.
cd "$PBS_O_WORKDIR" || exit 1

source "$HOME/.bashrc"
module load mpi/openmpi/4.0.2

echo "Starting job"
echo "Running on host $(hostname)"
echo "Time is $(date)"
echo "Directory is $(pwd)"

#echo This jobs runs on the following processors:
# The substitution is deliberately unquoted so the node list is printed
# on a single line, as in the original script.
echo $(cat "$PBS_NODEFILE")

# The original line read "env $NPROCS hello_mpi", which makes env try to
# execute "4" as a program. The process count has to be passed with -np,
# as in the reference command shown with the compile instructions.
mpirun -machinefile "$PBS_NODEFILE" -np "$NPROCS" hello_mpi

echo "Done with job"