
No Format
#1 ==> On all execution nodes, edit pbs_remsh and add the two lines marked below:

-->  vi /opt/pbs/default/bin/pbs_remsh

while [ $# -gt 1 ]; do
        if [ "XX$1" = "XX-j" ]; then
                shift;
                jobid=$1
                shift;
        elif [ "XX$1" = "XX-r" ]; then
                shift;
                rshcmd=$1
                shift;
# Add the following 2 lines here. They consume the -n option that the
# rsh arguments carry, which would otherwise be mistaken for the host name.
        elif [ "XX$1" = "XX-n" ]; then
                shift;
        else
                break;
        fi
done
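
A minimal check of the edit, run from inside a PBS job (pbs_remsh relies on the job environment for $PBS_JOBID; the node name n023 is just an example). With the -n branch in place, the option is silently consumed and the command still reaches the right host:

pbs_remsh -n n023 /bin/hostname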

#2 ==> vi /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpdboot.py

(search for ssh; you should find an entry similar to this)

 else:
        if rshCmd == 'ssh':
            #rshArgs = '-x -n -q'
            # Added the following entry
            rshArgs = '-x -n -q'
        elif rshCmd == 'pbs_tmrsh':
            rshArgs = ''
        elif rshCmd == 'pbs_remsh':
            rshArgs = ''
        else:
            rshArgs = ''
        mpdHost = hostsAndInfo[idxToStart]['host']
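
The branch that matters for PBS is the pbs_remsh one: rshArgs = '' means mpdboot adds no ssh-style flags when launching the remote mpd daemons. A quick hand-driven check from inside a job (2 nodes assumed here; $PBS_NODEFILE is the node list PBS hands the job):

mpdboot -v -n 2 -f $PBS_NODEFILE --rsh=pbs_remsh
mpdtrace      # should list both nodes
mpdallexit    # tear the ring down again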


#3 ==> Sample PBS script

>>>>>>>>>>>>>>>>>>>>>>>
#!/bin/bash -l
#PBS -m abe
#PBS -V
#PBS -M YOUREMAIL@griffith.edu.au
### Job name
#PBS -N Intel_mpi_test
#PBS -l walltime=100:00:00
### Number of nodes : CPUs per node : MPI processes per node
#PBS -l select=2:ncpus=4:mpiprocs=4
NPROCS=8
#cd $PBS_O_WORKDIR    # uncomment to run from the submission directory
source $HOME/.bashrc
module load intel-mpi/4.0.0.027
module load intel-cc-11/11.1.072
##export I_MPI_MPD_RSH=pbs_remsh
export I_MPI_MPD_RSH=ssh
export I_MPI_DEBUG=100
export I_MPI_PLATFORM=auto
PBS_EXEC=/opt/pbs/default
export PATH=${PATH}:${PBS_EXEC}/bin
mpirun -n $NPROCS /bin/hostname 2>&1

>>>>>>>>>>>>>>>>>>>>>>>
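
Save the script and submit it (the file name intel_mpi_test.pbs is just an example):

qsub intel_mpi_test.pbs
qstat -u $USER    # watch the job state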

#4 ==> Further troubleshooting needed?
If further troubleshooting is needed, add the following entries to /opt/pbs/default/bin/pbs_remsh
-----At the beginning --------
#!/bin/sh
exec >/tmp/remsh.debug 2>&1
set -x
---------------------

At the very bottom, add the following:

logger "pbs_remsh cooked options: $remsh $host pbs_attach -j $PBS_JOBID $*"
$remsh "$host" ${PBS_EXEC}/pbs_attach -j "$PBS_JOBID" $*

----------------------
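
(pbs_attach binds the remotely started processes to the PBS job so PBS can account for and clean them up; the logger line mirrors the cooked command into syslog.)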

Look at /tmp/remsh.debug and /var/log/messages to identify issues.
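
For example (the logger line added above goes to syslog, assuming the stock syslog file location):

tail -f /tmp/remsh.debug
grep 'pbs_remsh cooked options' /var/log/messages | tail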

Add this entry into: /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpdboot.py
(note these are sh lines, not Python, so they only take effect if the mpdboot entry point on your install is a shell wrapper around mpdboot.py)

#!/bin/sh
exec >/tmp/remsh.txt 2>&1
set -x

-------------------

tail /tmp/remsh.txt
n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_6956 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 52589 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2


Run it like this to check:

 pbs_remsh n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_6956 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 52589 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2



tail /tmp/remsh.debug
+ PBS_EXEC=/opt/pbs/default/bin
+ logger 'pbs_remsh cooked options: /usr/bin/ssh -n -n pbs_attach -j 214742.pbsserver n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_9340 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 35698 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2'
+ /usr/bin/ssh -n -n /opt/pbs/default/bin/pbs_attach -j 214742.pbsserver n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_9340 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 35698 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2
ssh: illegal option -- j
usage: ssh [-1246AaCfgKkMNnqsTtVvXxYy] [-b bind_address] [-c cipher_spec]
           [-D [bind_address:]port] [-e escape_char] [-F configfile]
           [-i identity_file] [-L [bind_address:]port:host:hostport]
           [-l login_name] [-m mac_spec] [-O ctl_cmd] [-o option] [-p port]
           [-R [bind_address:]port:host:hostport] [-S ctl_path]
           [-w local_tun[:remote_tun]] [user@]hostname [command]
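
The trace shows the actual failure: pbs_remsh has taken the unconsumed -n option as the host ($host is -n, while the real host n023 is still buried in $*), so ssh never receives a valid host name and chokes on the -j flag meant for pbs_attach. This is exactly what the two lines added in step #1 prevent.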

Other commands worth trying by hand while debugging (the job id 214742 and node file path come from the job being traced):

pbs_remsh n023 /bin/hostname
mpirun -r pbs_remsh hostname
sh -x /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpirun -r pbs_remsh hostname
mpdboot -v -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh
export MPD_CON_EXT=214742.pbsserver_8221

(MPD_CON_EXT makes the mpd console name unique per job; mpirun normally sets it itself, so export it manually only when driving mpdboot and friends by hand.)

#5 ==> Try submitting an interactive job

qsub -I -l select=3:ncpus=4:mpiprocs=4 -l walltime=100:00:00

On the execution node:
export I_MPI_MPD_RSH=ssh
export I_MPI_DEBUG=100
NPROCS=12
module load intel-cc-11/11.1.072
module load intel-mpi/4.0.0.027
mpirun -n $NPROCS /bin/hostname
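
If everything is working, this prints one hostname per rank: each of the three allocated nodes appears four times, 12 lines in total (though not necessarily grouped in order).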

An example using Open MPI 4

No Format
hello_mpi.c

/* The Parallel Hello World Program */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
   int node;

   MPI_Init(&argc, &argv);
   MPI_Comm_rank(MPI_COMM_WORLD, &node);   /* rank of this process */

   printf("Hello World from Node %d\n", node);

   MPI_Finalize();
   return 0;
}

To compile this:
module load mpi/openmpi/4.0.2

mpicc -o hello_mpi hello_mpi.c -lmpi


To run it inside a PBS job, use the node file PBS provides:

mpirun -machinefile $PBS_NODEFILE -np $NPROCS ./hello_mpi
