...
No Format |
---|
#1 ==> On all execution nodes, add the following:
--> vi /opt/pbs/default/bin/pbs_remsh

while [ $# -gt 1 ]; do
    if [ "XX$1" = "XX-j" ]; then
        shift;
        jobid=$1
        shift;
    elif [ "XX$1" = "XX-r" ]; then
        shift;
        rshcmd=$1
        shift;
    # Add the following 2 lines here.
    elif [ "XX$1" = "XX-n" ]; then
        shift;
    else
        break;
    fi
done

#2 ==> vi /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpdboot.py
(search for ssh and find an entry similar to this)

    else:
        if rshCmd == 'ssh':
            #rshArgs = '-x -n -q'
            # Added the following entry
            rshArgs = '-x -n -q'
        elif rshCmd == 'pbs_tmrsh':
            rshArgs = ''
        elif rshCmd == 'pbs_remsh':
            rshArgs = ''
        else:
            rshArgs = ''
        mpdHost = hostsAndInfo[idxToStart]['host']

#3 ==> Sample PBS script
>>>>>>>>>>>>>>>>>>>>>>>
#!/bin/bash -l
#PBS -m abe
#PBS -V
#PBS -M YOUREMAIL@griffith.edu.au
### Job name
#PBS -N Intel_mpi_test
#PBS -l walltime=100:00:00
### Number of nodes:Number of CPUs:Number of threads per node
#PBS -l select=2:ncpus=4:mpiprocs=4
NPROCS=8
#cd $PBS_O_WORKDIR
source $HOME/.bashrc
module load intel-mpi/4.0.0.027
module load intel-cc-11/11.1.072
##export I_MPI_MPD_RSH=pbs_remsh
export I_MPI_MPD_RSH=ssh
export I_MPI_DEBUG=100
export I_MPI_PLATFORM=auto
PBS_EXEC=/opt/pbs/default
export PATH=${PATH}:${PBS_EXEC}/bin
mpirun -n $NPROCS /bin/hostname 2>&1
>>>>>>>>>>>>>>>>>>>>>>>

#4 ==> Further troubleshooting needed?
If further troubleshooting is needed, add the following to /opt/pbs/default/bin/pbs_remsh.

At the beginning:
---------------------
#!/bin/sh
exec >/tmp/remsh.debug 2>&1
set -x
---------------------

At the very bottom, add the following:
---------------------
logger "pbs_remsh cooked options: $remsh $host pbs_attach -j $PBS_JOBID $*"
$remsh "$host" ${PBS_EXEC}/pbs_attach -j "$PBS_JOBID" $*
---------------------

Look at the log /tmp/remsh.debug and run tail -f /var/log/messages to identify issues.

Add this entry into /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpdboot.py:
---------------------
#!/bin/sh
exec >/tmp/remsh.txt 2>&1
set -x
---------------------

tail /tmp/remsh.txt
n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_6956 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 52589 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2

Run it like this to check:
pbs_remsh n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_6956 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 52589 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2

tail /tmp/remsh.debug
+ PBS_EXEC=/opt/pbs/default/bin
+ logger 'pbs_remsh cooked options: /usr/bin/ssh -n -n pbs_attach -j 214742.pbsserver n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_9340 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 35698 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2'
+ /usr/bin/ssh -n -n /opt/pbs/default/bin/pbs_attach -j 214742.pbsserver n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_9340 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 35698 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2
ssh: illegal option -- j
usage: ssh [-1246AaCfgKkMNnqsTtVvXxYy] [-b bind_address] [-c cipher_spec]
           [-D [bind_address:]port] [-e escape_char] [-F configfile]
           [-i identity_file] [-L [bind_address:]port:host:hostport]
           [-l login_name] [-m mac_spec] [-O ctl_cmd] [-o option] [-p port]
           [-R [bind_address:]port:host:hostport] [-S ctl_path]
           [-w local_tun[:remote_tun]] [user@]hostname [command]

Other commands that are useful while debugging interactively:
pbs_remsh n023 /bin/hostname
mpirun -r pbs_remsh hostname
sh -x /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpirun -r pbs_remsh hostname
mpdboot -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh
mpdboot -v -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh
export MPD_CON_EXT=214742.pbsserver_8221

#5 ==> Try submitting an interactive job:
qsub -I -l select=3:ncpus=4:mpiprocs=4 -l walltime=100:00:00

On the execution node:
export I_MPI_MPD_RSH=ssh
export I_MPI_DEBUG=100
NPROCS=12
module load intel-cc-11/11.1.072
module load intel-mpi/4.0.0.027
mpirun -n $NPROCS /bin/hostname |
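A quick way to confirm that the MPD ring itself starts cleanly is to bring it up by hand from inside an interactive job. This is only a sketch: it assumes the same intel-mpi module is loaded and that the MPD helper tools (mpdtrace, mpdallexit) shipped with Intel MPI 4.0 are on the PATH.
No Format |
---|
# Inside an interactive job (qsub -I ...), after loading intel-mpi/4.0.0.027:
mpdboot -v -n 2 -f $PBS_NODEFILE --rsh=pbs_remsh   # start the MPD ring on the allocated chunks
mpdtrace -l                                        # list ring members; every chunk should appear
mpirun -n $NPROCS /bin/hostname                    # trivial run across the ring
mpdallexit                                         # tear the ring down when finished |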
An example using OpenMPI 4
No Format |
---|
hello_mpi.c
/*The Parallel Hello World Program*/
#include <stdio.h>
#include <mpi.h>
int main(int argc, char **argv)
{
int node;
MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD, &node);
printf("Hello World from Node %d\n",node);
MPI_Finalize();
return 0;
}
To compile this:
module load mpi/openmpi/4.0.2
mpicc -o hello_mpi hello_mpi.c -lmpi
##mpirun -machinefile $PBS_NODEFILE -np $NPROCS hello_mpi |
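Before wrapping the binary in a PBS script, it can be sanity-checked with a single-node run. This is only a sketch: it assumes you are on a node where running mpirun directly is allowed and that the same openmpi module is loaded in the current shell.
No Format |
---|
module load mpi/openmpi/4.0.2
mpicc -o hello_mpi hello_mpi.c -lmpi
# Run 4 ranks on the local node; expect four "Hello World from Node N" lines (order may vary)
mpirun -np 4 ./hello_mpi |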
No Format |
---|
#!/bin/bash
###PBS -m abe
### Mail to user
##PBS -M <yourEmail>@griffith.edu.au
### Job name
#PBS -N mpi
### Number of nodes:Number of CPUs:Number of threads per node
#PBS -l select=2:ncpus=2:mpiprocs=2:mem=1g,walltime=10:00:00
## The number of chunks is given by the select =<NUM > above
CHUNKS=2
##$PBS_NODEFILE is a node-list file created with select and mpiprocs options by PBS
### The number of MPI processes available is mpiprocs * CHUNKS
NPROCS=4
# This job's working directory
echo "Working directory is $PBS_O_WORKDIR"
cd $PBS_O_WORKDIR
source $HOME/.bashrc
module load mpi/openmpi/4.0.2
echo "Starting job"
echo Running on host `hostname`
echo Time is `date`
echo Directory is `pwd`
#echo This job runs on the following processors:
echo `cat $PBS_NODEFILE`
mpirun -machinefile $PBS_NODEFILE -np $NPROCS ./hello_mpi
echo "Done with job"
|
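Assuming the script above is saved as pbs.openmpi (the file name is only an example), it can be submitted and its output checked as shown below; the job id 12345 is a placeholder for whatever qsub returns.
No Format |
---|
qsub pbs.openmpi        # prints the job id, e.g. 12345.pbsserver
qstat -u $USER          # Q = queued, R = running; the entry disappears once the job finishes
# Because of "#PBS -N mpi", output lands in mpi.o<jobid> and mpi.e<jobid> in the submit directory
cat mpi.o12345          # one "Hello World from Node N" line per MPI rank
cat mpi.e12345          # stderr; should be empty on a clean run |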