...
No Format |
---|
#1 ==> On all execution nodes, add the following: --> vi /opt/pbs/default/bin/pbs_remsh while [ $# -gt 1 ]; do if [ "XX$1" = "XX-j" ]; then shift; jobid=$1 shift; elif [ "XX$1" = "XX-r" ]; then shift; rshcmd=$1 shift; #Add the following 2 lines here. elif [ "XX$1" = "XX-n" ]; then shift; else break; fi done #2 ==> vi /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpdboot.py (search for ssh and find an entry similar to this) else: if rshCmd == 'ssh': #rshArgs = '-x -n -q' #Added the following entry rshArgs = '-x -n -q' elif rshCmd == 'pbs_tmrsh': rshArgs = '' elif rshCmd == 'pbs_remsh': rshArgs = '' else: rshArgs = '' mpdHost = hostsAndInfo[idxToStart]['host'] #3 Sample PBS script >>>>>>>>>>>>>>>>>>>>>>> #!/bin/bash -l #PBS -m abe #PBS -V #PBS -M YOUREMAIL@griffith.edu.au ### Job name #PBS -N Intel_mpi_test #PBS -l walltime=100:00:00 ### Number of nodes:Number of CPUs:Number of MPI processes per node #PBS -l select=2:ncpus=4:mpiprocs=4 NPROCS=8 #cd $PBS_O_WORKDIR source $HOME/.bashrc module load intel-mpi/4.0.0.027 module load intel-cc-11/11.1.072 ##export I_MPI_MPD_RSH=pbs_remsh export I_MPI_MPD_RSH=ssh export I_MPI_DEBUG=100 export I_MPI_PLATFORM=auto PBS_EXEC=/opt/pbs/default export PATH=${PATH}:${PBS_EXEC}/bin mpirun -n $NPROCS /bin/hostname 2>&1 >>>>>>>>>>>>>>>>>>>>>>> # 4 : Further troubleshooting needed? 
If further troubleshooting is needed, add the following entry to /opt/pbs/default/bin/pbs_remsh -----At the beginning -------- #!/bin/sh exec >/tmp/remsh.debug 2>&1 set -x --------------------- At the very bottom, add the following logger "pbs_remsh cooked options: $remsh $host pbs_attach -j $PBS_JOBID $*" $remsh "$host" ${PBS_EXEC}/pbs_attach -j "$PBS_JOBID" $* ---------------------- Look at the logs /tmp/remsh.debug and tail -f /var/log/messages to identify issues Add this entry into : /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpdboot.py #!/bin/sh exec >/tmp/remsh.txt 2>&1 set -x ------------------- tail /tmp/remsh.txt n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_6956 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 52589 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2 Run it like this to check: pbs_remsh n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_6956 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 52589 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2 tail /tmp/remsh.debug + PBS_EXEC=/opt/pbs/default/bin + logger 'pbs_remsh cooked options: /usr/bin/ssh -n -n pbs_attach -j 214742.pbsserver n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_9340 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 35698 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2' + /usr/bin/ssh -n -n /opt/pbs/default/bin/pbs_attach -j 214742.pbsserver n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_9340 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 35698 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2 ssh: illegal option -- j usage: 
ssh [-1246AaCfgKkMNnqsTtVvXxYy] [-b bind_address] [-c cipher_spec] [-D [bind_address:]port] [-e escape_char] [-F configfile] [-i identity_file] [-L [bind_address:]port:host:hostport] [-l login_name] [-m mac_spec] [-O ctl_cmd] [-o option] [-p port] [-R [bind_address:]port:host:hostport] [-S ctl_path] [-w local_tun[:remote_tun]] [user@]hostname [command] pbs_remsh n023 /bin/hostname mpirun -r pbs_remsh hostname mpirun -r pbs_remsh hostname mpirun -r pbs_remsh hostname sh -x /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpirun -r pbs_remsh hostname mpdboot -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh mpdboot -v -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh mpdboot -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh sh -x /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpirun -r pbs_remsh hostname sh -x /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpirun -r pbs_remsh hostname export MPD_CON_EXT=214742.pbsserver_8221 export MPD_CON_EXT=214742.pbsserver_8221 #5: Try submitting an interactive job qsub -I -l select=3:ncpus=4:mpiprocs=4 -l walltime=100:00:00 On the execution node: export I_MPI_MPD_RSH=ssh export I_MPI_DEBUG=100 NPROCS=12 module load intel-cc-11/11.1.072 module load intel-mpi/4.0.0.027 mpirun -n $NPROCS /bin/hostname |
An example using Open MPI 4
No Format |
---|
hello_mpi.c /*The Parallel Hello World Program*/ #include <stdio.h> #include <mpi.h> main(int argc, char **argv) { int node; MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &node); printf("Hello World from Node %d\n",node); MPI_Finalize(); } To compile this: module load mpi/openmpi/4.0.2 mpicc -o hello_mpi hello_mpi.c -lmpi ##mpirun -machinefile $PBS_NODEFILE -np $NPROCS hello_mpi |
...