...
binaries need to be compiled using MPI compilers (wrapper scripts such as mpicc or mpif90 that invoke the underlying compiler with the correct MPI include and library paths).
We have Intel MPI and SGI MPI.
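As a minimal sketch (file and program names are placeholders; the exact wrapper name depends on which MPI module is loaded), compiling against MPI means using the MPI wrapper compilers rather than calling the compiler directly:
# after loading an MPI module (see the sections below for module names)
mpicc -o myprog myprog.c       # C: the wrapper adds the MPI include and library flags
mpif90 -o myprog_f myprog.f90  # Fortran equivalent
# Intel MPI additionally provides mpiicc/mpiifort, which wrap the Intel compilers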
Intel MPI
=========
####For Intel MPI, use the following syntax####
#module load intel/2019up5/mpi
#mpiexec -n $PROCS $PROGRAM_NAME < $INPUT_FILE >& $OUTPUT_FILE
module load intel-fc-11/12.0.2.137
module load intel-mpi/4.0.1.007
...
module load mpi/openMPI/1.4.3-gnu
####For OpenMPI, use the following syntax####
#module load mpi/openmpi/4.0.2
#mpiexec $PROGRAM_NAME < $INPUT_FILE >& $OUTPUT_FILE
Sample run
OpenMPI uses a different mechanism to pass local environment variables from the master node to the slave nodes. You have three options (see the sketch after the options):
...
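One common mechanism is OpenMPI's -x flag, which exports named environment variables from the launching shell to every rank. A sketch (the variable and ./myprog are placeholders chosen for illustration):
module load mpi/openmpi/4.0.2
export OMP_NUM_THREADS=1            # example variable to forward to the slave nodes
mpirun -x OMP_NUM_THREADS -x LD_LIBRARY_PATH -np 4 ./myprog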
As this is not the most intuitive syntax, the following table is provided as guidance on how the select statement works:
| select | ncpus | mpiprocs | description |
| --- | --- | --- | --- |
| 4 | 8 | 8 | 32 processor job, using 4 nodes and 8 processors per node |
| 4 | 4 | 4 | 16 processor job, using 4 nodes and 4 processors per node |
| 16 | 1 | 1 | 16 processor job, using 16 nodes running 1 MPI process per processor and utilising 1 processor per node |
| 16 | 8 | 8 | 128 processor job, using 16 nodes and 8 processors per node (each running an MPI process) |
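For example, the first row of the table corresponds to a resource request like the following (./myprog is a placeholder binary):
### 32 processor job: 4 chunks, 8 cpus per chunk, 8 MPI ranks per chunk
#PBS -l select=4:ncpus=8:mpiprocs=8
### mpirun would then normally be launched with 32 (= 4 x 8) processes, e.g. mpirun -np 32 ./myprog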
Ref
1. http://www.hpcu.uq.edu.au/hpc/content/view/225/34/
2. http://www.oerc.ox.ac.uk/computing-resources/osc/support/documentation-help/job-schedulers/pbs/pbs-job-submission-scripts
3. http://www.cardiff.ac.uk/arcca/services/equipment/User-Guide/user-guide.html
...
#1 ==> On all execution nodes, add the following:
--> vi /opt/pbs/default/bin/pbs_remsh
while [ $# -gt 1 ]; do
    if [ "XX$1" = "XX-j" ]; then
        shift;
        jobid=$1
        shift;
    elif [ "XX$1" = "XX-r" ]; then
        shift;
        rshcmd=$1
        shift;
    #Add the following 2 lines here.
    elif [ "XX$1" = "XX-n" ]; then
        shift;
    else
        break;
    fi
done
#2 ==> vi /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpdboot.py
(search for ssh and find an entry similar to this)
else:
    if rshCmd == 'ssh':
        #rshArgs = '-x -n -q'
        #Added the following entry
        rshArgs = '-x -n -q'
    elif rshCmd == 'pbs_tmrsh':
        rshArgs = ''
    elif rshCmd == 'pbs_remsh':
        rshArgs = ''
    else:
        rshArgs = ''
    mpdHost = hostsAndInfo[idxToStart]['host']
#3 Sample PBS script
>>>>>>>>>>>>>>>>>>>>>>>
#!/bin/bash -l
#PBS -m abe
#PBS -V
#PBS -M YOUREMAIL@griffith.edu.au
### Job name
#PBS -N Intel_mpi_test
#PBS -l walltime=100:00:00
### Number of nodes:Number of CPUs:Number of threads per node
#PBS -l select=2:ncpus=4:mpiprocs=4
NPROCS=8
#cd $PBS_O_WORKDIR
source $HOME/.bashrc
module load intel-mpi/4.0.0.027
module load intel-cc-11/11.1.072
##export I_MPI_MPD_RSH=pbs_remsh
export I_MPI_MPD_RSH=ssh
export I_MPI_DEBUG=100
export I_MPI_PLATFORM=auto
PBS_EXEC=/opt/pbs/default
export PATH=${PATH}:${PBS_EXEC}/bin
mpirun -n $NPROCS /bin/hostname 2>&1
>>>>>>>>>>>>>>>>>>>>>>>
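To use the sample script, save it to a file and submit it with qsub (the filename below is only a placeholder):
qsub intel_mpi_test.pbs    # submit the job; qsub prints the job id
qstat -u $USER             # check the job state while it is queued or running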
#4: Further troubleshooting needed?
If further troubleshooting is needed, add the following entry to /opt/pbs/default/bin/pbs_remsh
-----At the beginning --------
#!/bin/sh
exec >/tmp/remsh.debug 2>&1
set -x
---------------------
At the very bottom, add the following
logger "pbs_remsh cooked options: $remsh $host pbs_attach -j $PBS_JOBID $*"
$remsh "$host" ${PBS_EXEC}/pbs_attach -j "$PBS_JOBID" $*
----------------------
Look at the log /tmp/remsh.debug and run tail -f /var/log/messages to identify issues.
Add this entry into: /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpdboot.py
#!/bin/sh
exec >/tmp/remsh.txt 2>&1
set -x
-------------------
tail /tmp/remsh.txt
n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_6956 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 52589 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2
Run it like this to check:
pbs_remsh n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_6956 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 52589 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2
tail /tmp/remsh.debug
+ PBS_EXEC=/opt/pbs/default/bin
+ logger 'pbs_remsh cooked options: /usr/bin/ssh -n -n pbs_attach -j 214742.pbsserver n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_9340 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 35698 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2'
+ /usr/bin/ssh -n -n /opt/pbs/default/bin/pbs_attach -j 214742.pbsserver n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_9340 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 35698 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2
ssh: illegal option -- j
usage: ssh [-1246AaCfgKkMNnqsTtVvXxYy] [-b bind_address] [-c cipher_spec]
[-D [bind_address:]port] [-e escape_char] [-F configfile]
[-i identity_file] [-L [bind_address:]port:host:hostport]
[-l login_name] [-m mac_spec] [-O ctl_cmd] [-o option] [-p port]
[-R [bind_address:]port:host:hostport] [-S ctl_path]
[-w local_tun[:remote_tun]] [user@]hostname [command]
Other commands that can be run manually on the execution node while troubleshooting:
pbs_remsh n023 /bin/hostname
mpirun -r pbs_remsh hostname
sh -x /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpirun -r pbs_remsh hostname
mpdboot -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh
mpdboot -v -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh
export MPD_CON_EXT=214742.pbsserver_8221
#5: Try submitting an interactive job
qsub -I -l select=3:ncpus=4:mpiprocs=4 -l walltime=100:00:00
On the execution node:
export I_MPI_MPD_RSH=ssh
export I_MPI_DEBUG=100
NPROCS=12
module load intel-cc-11/11.1.072
module load intel-mpi/4.0.0.027
mpirun -n $NPROCS /bin/hostname
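As a quick sanity check in the same interactive session (a sketch; actual hostnames will differ), counting the printed hostnames should show each of the 3 allocated nodes appearing 4 times (12 ranks in total):
mpirun -n $NPROCS /bin/hostname | sort | uniq -c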
An example using OpenMPI 4
hello_mpi.c
/* The Parallel Hello World Program */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    int node;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &node);
    printf("Hello World from Node %d\n", node);
    MPI_Finalize();
    return 0;
}
To compile this:
module load mpi/openmpi/4.0.2
mpicc -o hello_mpi hello_mpi.c -lmpi
##mpirun -machinefile $PBS_NODEFILE -np $NPROCS hello_mpi
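Where interactive use is permitted, the binary can also be tested outside of PBS (a sketch; rank output order is not deterministic):
mpirun -np 4 ./hello_mpi
# Typical output (order may vary):
#   Hello World from Node 0
#   Hello World from Node 2
#   Hello World from Node 1
#   Hello World from Node 3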
Sample OpenMPI PBS script:
#!/bin/bash
###PBS -m abe
### Mail to user
##PBS -M <yourEmail>@griffith.edu.au
### Job name
#PBS -N mpi
### Number of nodes:Number of CPUs:Number of threads per node
#PBS -l select=2:ncpus=2:mpiprocs=2:mem=1g,walltime=10:00:00
## The number of chunks is given by the select =<NUM > above
CHUNKS=2
##$PBS_NODEFILE is a node-list file created with select and mpiprocs options by PBS
### The number of MPI processes available is mpiprocs * CHUNKS
NPROCS=4
# This job's working directory
echo "Working directory is $PBS_O_WORKDIR"
cd $PBS_O_WORKDIR
source $HOME/.bashrc
module load mpi/openmpi/4.0.2
echo "Starting job"
echo Running on host `hostname`
echo Time is `date`
echo Directory is `pwd`
#echo This job runs on the following processors:
echo `cat $PBS_NODEFILE`
mpirun -machinefile $PBS_NODEFILE -np $NPROCS ./hello_mpi
echo "Done with job"
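To run the job, save the script under a name of your choice (the filename and job id below are placeholders) and submit it; PBS writes the job's stdout to <jobname>.o<jobid> in the submission directory:
qsub openmpi_hello.pbs     # submit the script
cat mpi.o12345             # job name is "mpi" (#PBS -N mpi); 12345 stands in for the real job id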