mpi

MPI users MUST now request mpiprocs in their resource specification

To get 32 cores running across 4 medium nodes:
#PBS -l select=4:ncpus=8:NodeType=medium:mpiprocs=8

To get 24 cores running across 2 extra large nodes, with only 6 MPI processes per node and without sharing the nodes with other jobs, you could use:
#PBS -l select=2:ncpus=12:NodeType=xl:mpiprocs=6
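
The total number of MPI processes is select multiplied by mpiprocs. Rather than hard-coding that number, a job script can derive it from $PBS_NODEFILE, which PBS populates with one entry per MPI process; a minimal sketch:

NPROCS=$(wc -l < $PBS_NODEFILE)   # total MPI processes = select x mpiprocs
echo "Running $NPROCS MPI processes"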

See the "PBS select line" section further down this page for more combinations.

Intel MPI Usage

compiler to be used

Binaries need to be compiled using the MPI compiler wrappers.
We have Intel MPI and SGI MPI (MPT).
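
If you are unsure which MPI stack is currently active, a quick check is to see which wrapper the loaded module puts on your PATH; a small sketch using module names from this page:

module load intel-mpi/4.0.1.007
which mpicc      # should resolve inside the Intel MPI installation
mpicc -show      # MPICH-derived wrappers such as Intel MPI's print the underlying compile command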

intel mpi
=========


####For intel mpi, use the following syntax####

#module load intel/2019up5/mpi

#mpiexec -n $PROCS $PROGRAM_NAME < $INPUT_FILE >& $OUTPUT_FILE


module load intel-fc-11/12.0.2.137
module load intel-mpi/4.0.1.007

sgi
===
module load mpt/2.02

Loading the Intel MPI module puts the following wrapper and launcher commands on your PATH:

mpicc          mpiexec        mpiexec.py     mpif90         mpigcc         mpiicc         mpiifort       mpitune        mpivars.sh
mpicxx         mpiexec.hydra  mpif77         mpifc          mpigxx         mpiicpc        mpirun.actual  mpivars.csh

sample code

cat p1.c

/* Sum processor numbers by passing them around in a ring. */

#include "mpi.h"
#include <stdio.h>

/* Set up communication tags (these can be anything) */
#define to_right 201
#define to_left  102

int main (int argc, char *argv[]) {
  int ierror, value, new_value, procnum, numprocs;
  int right, left, other, sum, i;
  MPI_Status  recv_status;

    /* Initialize MPI */
    MPI_Init(&argc, &argv);

    /* Find out this processor number */
    MPI_Comm_rank(MPI_COMM_WORLD, &procnum);
    /* Find out the number of processors */
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

    /* Compute rank (number) of processors to the left and right */
    right = procnum + 1;
    if (right == numprocs) right = 0;
    left = procnum - 1;
    if (left == -1) left = numprocs-1;

    sum = 0;
    value = procnum;
    for( i = 0; i < numprocs; i++) {

        /* Send to the right */
        MPI_Send(&value, 1, MPI_INT, right, to_right,
                 MPI_COMM_WORLD);
        /* Receive from the left */
        MPI_Recv(&new_value, 1, MPI_INT, left, to_right,
                 MPI_COMM_WORLD, &recv_status);
    
        /* Sum the new value */
        sum = sum + new_value;
        /* Update the value to be passed */
        value = new_value;

        /* Print out the partial sums at each step */
        printf ("PE %d:\t Partial sum = %d\n", procnum, sum);
    }
    /* Print out the final result */
    if (procnum == 0) {
       printf ("Sum of all processor numbers = %d\n", sum);
    }
    /* Shut down MPI */
    MPI_Finalize();

    return 0;
}

compilation

Use the MPI compiler wrappers for compilation: mpiicc, mpiifort, and mpiicpc use the Intel compilers, while mpicc, mpif90, etc. use the GNU compilers. The example below uses mpicc.

module load intel-fc-11/12.0.2.137
module load intel-mpi/4.0.1.007
module load mpi/intel-4.0

mpicc -o p1.out p1.c -lmpi
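
If you would rather use the Intel compiler backend than GCC, the equivalent compile with the Intel-specific wrapper (same modules loaded, output name kept the same for illustration) would look something like:

mpiicc -o p1.out p1.c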

ls /export/home/snumber/pbs/mpi
mpd.hosts p1.c p1.out

running the job as an mpi job

Manual test run

Please make sure the mpd.hosts file exists. There is a sample mpd.hosts file further down this page. It is only needed for a manual run; it is not needed when the script is run through PBS, either interactively or in batch.

The following syntax will run it with 2 MPI processes (specified by the "-n 2" option):

mpirun -r ssh -f /export/home/snumber/pbs/mpi/mpd.hosts -n 2 /export/home/snumber/pbs/mpi/p1.out
PE 0:    Partial sum = 1
PE 0:    Partial sum = 1
Sum of all processor numbers = 1
PE 1:    Partial sum = 0
PE 1:    Partial sum = 1
pbs run

cat pbs.run1

#!/bin/bash -l
#PBS -m abe
#PBS -V
### Mail to user
#PBS -M YOUREMAIL@griffith.edu.au
### Job name
#PBS -N mpi_test
#PBS -l walltime=01:00:00
### Number of nodes : Number of CPUs per node : Number of MPI processes per node
#PBS -l select=2:ncpus=4:mpiprocs=4
NPROCS=8
source $HOME/.bashrc
##module load intel-fc-11/12.0.2.137
module load intel-mpi/4.0.0.027
##module load  intel-cc-11/11.1.072
##module load  intel-mpi/4.0.1.007
##export I_MPI_DEBUG=100
export I_MPI_PLATFORM=auto
PBS_EXEC=/opt/pbs/default
export I_MPI_MPD_RSH=ssh
echo "This jobs runs on the following processors:"
echo `cat $PBS_NODEFILE`
mpirun -n $NPROCS  /export/home/s2594054/pbs/mpi/intelMPI/p1.out 2>&1
echo "Done with job"

 qsub pbs.run1
828.pbsserver

[snumber@n027 mpi]$ qstat
Job id            Name             User              Time Use S Queue
----------------  ---------------- ----------------  -------- - -----
818.pbsserver     1ivf_apo         s2795116          00:00:01 R workq
819.pbsserver     1nn2             s2795116          00:00:01 R workq
821.pbsserver     1ivg             s2795116          00:00:00 R workq
825.pbsserver     3nss             s2795116          00:00:00 R workq
826.pbsserver     1ivf_naen        s2795116          00:00:00 R workq
828.pbsserver     mpi              s123456          00:00:00 R workq


more  mpi.o828

Starting job
PE 8:    Partial sum = 7
PE 9:    Partial sum = 8
PE 9:    Partial sum = 15
PE 14:   Partial sum = 13
PE 3:    Partial sum = 2
PE 12:   Partial sum = 11
PE 0:    Partial sum = 17
PE 16:   Partial sum = 15
PE 5:    Partial sum = 4
PE 10:   Partial sum = 9
PE 10:   Partial sum = 17
PE 10:   Partial sum = 24
PE 11:   Partial sum = 10
<snip>

Important Note

It looks like the wrapper script, mpirun, might not be working properly. The mpirun wrapper script actually executes 3 other commands:
mpdboot (to start the MPD daemons), then mpiexec (to run your app), and finally mpdallexit (to close out the MPD ring). You may use the following PBS script as a workaround.

Update: This issue has been resolved by editing the mpdboot.py and pbs_remsh files, so the workaround is no longer needed. The documentation is kept here in case the wrapper breaks again after a future upgrade.

#PBS -m abe
#PBS -M myemail@griffith.edu.au
#PBS -N romsrun-intel
#PBS -l select=3:ncpus=4:mem=4g:mpiprocs=4
#source $HOME/.bashrc
module load intel-fc-11/12.0.2.137
module load intel-mpi/4.0.0.027
module load NetCDF/3.6.3
ROMS_DIR=/export/home/s123456/modelling/roms/forecast/work
HOME_DIR=/export/home/123456
## The number of nodes is given by the select=<NUM> above
NODES=3
#$PBS_NODEFILE is a node-list file created with select and mpiprocs options by PBS
# The number of MPI processes available is mpiprocs * nodes (=NPROCS)
NPROCS=12
export I_MPI_MPD_RSH=ssh
export I_MPI_DEBUG=5

echo "Starting job"
date
## Input file: ${ROMS_DIR}/ocean_eau_forecast_test.in
mpdboot  -n $NODES -f $PBS_NODEFILE -r ssh -v
mpdtrace
mpiexec -np $NPROCS      ${ROMS_DIR}/oceanG      ${ROMS_DIR}/ocean_eau_forecast_test.in
mpdallexit
date
echo "Done with job"

Another intel mpi example


This program calls MPI_Allgather to gather one int from every host:

/*
 * Unit test for Intel MPI failure.
 * mpicc -mt_mpi -o lgcGab lgcGab.c
 */

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char** argv)
{
  int provided;
  MPI_Comm comm;
  int rank;
  int size;
  int* sendbuf;
  int* recvbuf;

  MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided);
  if (provided<MPI_THREAD_MULTIPLE) {
    printf("Need MPI_THREAD_MULTIPLE=%d but got %d\n",
           MPI_THREAD_MULTIPLE,provided);
    MPI_Abort(MPI_COMM_WORLD,1);
  }

  MPI_Comm_dup(MPI_COMM_WORLD,&comm);
  comm = MPI_COMM_WORLD;
  MPI_Comm_rank(comm,&rank);
  MPI_Comm_size(comm,&size);

  sendbuf = (int*)malloc(sizeof(int));
  recvbuf = (int*)malloc(size*sizeof(int));
  sendbuf[0] = rank;

  //MPI_Gather(sendbuf,1,MPI_INT, recvbuf,1,MPI_INT, 0,comm);

  MPI_Allgather(sendbuf,1,MPI_INT, recvbuf,1,MPI_INT, comm);

  //MPI_Barrier(comm);

  MPI_Finalize();

  return 0;
}

mpicc -mt_mpi test.c -o test.o

#!/bin/bash -l
#PBS -m abe
#PBS -V
### Mail to user
####PBS -M YOUREMAIL@griffith.edu.au
### Job name
#PBS -N mpi_test
#PBS -l walltime=100:00:00
### Number of nodes : Number of CPUs per node : Number of MPI processes per node
#PBS -l select=3:ncpus=2:mpiprocs=2
# This job's working directory
echo Working directory is $PBS_O_WORKDIR
##cd $PBS_O_WORKDIR
source $HOME/.bashrc
module load intel-mpi/4.0.0.027
module load  intel-cc-11/11.1.072
echo "Starting job"
echo Running on host `hostname`
echo Time is `date`
echo Directory is `pwd`
export I_MPI_MPD_RSH=ssh
export I_MPI_DEBUG=100
export I_MPI_PLATFORM=auto
PBS_EXEC=/opt/pbs/default
export PATH=${PATH}:${PBS_EXEC}/bin
export PBS_RSHCOMMAND=/usr/bin/ssh
NPROCS=6

#sh /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin/mpivars.sh

#echo This job runs on the following processors:
echo `cat $PBS_NODEFILE`
mpirun -n $NPROCS  env I_MPI_MPD_RSH=ssh env I_MPI_DEBUG=100 env PATH=$PATH  env LD_LIBRARY_PATH=$LD_LIBRARY_PATH /export/home/s123456/pbs/mpi/intelMPI/test.o 2>&1
echo "Done with job"
 mpirun -n 6  env I_MPI_MPD_RSH=ssh env I_MPI_DEBUG=100 env PATH=$PATH  env LD_LIBRARY_PATH=$LD_LIBRARY_PATH /export/home/s123456/pbs/mpi/intelMPI/test.o
[0] MPI startup(): Intel(R) MPI Library, Version 4.0  Build 20100224
[0] MPI startup(): Copyright (C) 2003-2010 Intel Corporation.  All rights reserved.
[0] MPI startup(): Found 1 IB devices
[1] MPI startup(): Found 1 IB devices
[4] MPI startup(): Found 1 IB devices
[5] MPI startup(): Found 1 IB devices
[2] MPI startup(): Found 1 IB devices
[3] MPI startup(): Found 1 IB devices
[1] MPI startup(): Open 0 IB device: mlx4_0
[0] MPI startup(): Open 0 IB device: mlx4_0
[4] MPI startup(): Open 0 IB device: mlx4_0
[5] MPI startup(): Open 0 IB device: mlx4_0
[1] MPI startup(): Start 1 ports per adapter
[2] MPI startup(): Open 0 IB device: mlx4_0
[3] MPI startup(): Open 0 IB device: mlx4_0
[0] MPI startup(): Start 1 ports per adapter
[4] MPI startup(): Start 1 ports per adapter
[5] MPI startup(): Start 1 ports per adapter
[3] MPI startup(): Start 1 ports per adapter
[2] MPI startup(): Start 1 ports per adapter
[0] MPI startup(): shm and ofa data transfer modes
[1] MPI startup(): shm and ofa data transfer modes
[2] MPI startup(): shm and ofa data transfer modes
[3] MPI startup(): shm and ofa data transfer modes
[4] MPI startup(): shm and ofa data transfer modes
[5] MPI startup(): shm and ofa data transfer modes
[0] MPI startup(): I_MPI_DEBUG=100
[0] MPI startup(): I_MPI_FABRICS=shm:ofa
[0] MPI startup(): set domain {0,1,2,3,4,5} fails on node n006
[1] MPI startup(): set domain {0,1,2,3,4,5} fails on node n006
[3] MPI startup(): set domain to {6,7,8,9,10,11,18,19,20,21,22,23} on node n007
[2] MPI startup(): set domain to {0,1,2,3,4,5,12,13,14,15,16,17} on node n007
[5] MPI startup(): set domain to {6,7,8,9,10,11,18,19,20,21,22,23} on node n009
[4] MPI startup(): set domain to {0,1,2,3,4,5,12,13,14,15,16,17} on node n009
[0] MPI startup(): Recognition level=2. Platform code=1. Device=8
[1] MPI startup(): Recognition level=2. Platform code=1. Device=8
[1] MPI startup(): Parent configuration:(intra=1 inter=2 flags=0), (code=1 ppn=1)
[3] MPI startup(): Heterogeneous cluster. Generic defauts are selected
[3] MPI startup(): Recognition level=2. Platform code=1. Device=8
[3] MPI startup(): Parent configuration:(intra=1 inter=2 flags=0), (code=1 ppn=1)
[2] MPI startup(): Heterogeneous cluster. Generic defauts are selected
[2] MPI startup(): Recognition level=2. Platform code=1. Device=8
[2] MPI startup(): Parent configuration:(intra=1 inter=2 flags=0), (code=1 ppn=1)
[0] MPI startup(): Parent configuration:(intra=1 inter=2 flags=0), (code=1 ppn=1)
[0] Allgather: 1: 0-512 & 3-16
[4] MPI startup(): Heterogeneous cluster. Generic defauts are selected
[4] MPI startup(): Recognition level=2. Platform code=1. Device=8
[4] MPI startup(): Parent configuration:(intra=1 inter=2 flags=0), (code=1 ppn=1)
[0] Allgather: 1: 0-2048 & 9-16
[5] MPI startup(): Heterogeneous cluster. Generic defauts are selected
[5] MPI startup(): Recognition level=2. Platform code=1. Device=8
[5] MPI startup(): Parent configuration:(intra=1 inter=2 flags=0), (code=1 ppn=1)
[0] Allgather: 1: 0-512 & 33-2147483647
[0] Allgather: 2: 0-1024 & 17-32
[0] Allgather: 3: 0-2147483647 & 0-2147483647
[0] Allgatherv: 0: 0-2147483647 & 0-2147483647
[0] Allreduce: 1: 0-1024 & 0-2147483647
[0] Allreduce: 1: 0-4096 & 0-4
[0] Allreduce: 1: 0-2048 & 0-8
[0] Allreduce: 1: 0-16384 & 33-2147483647
[0] Allreduce: 2: 1025-2097152 & 3-32
[0] Allreduce: 4: 16385-524288 & 33-2147483647
[0] Allreduce: 3: 0-2147483647 & 0-2147483647
[0] Alltoall: 1: 0-32 & 17-2147483647
[0] Alltoall: 1: 0-128 & 33-2147483647
[0] Alltoall: 2: 0-32768 & 3-32
[0] Alltoall: 2: 32768-2147483647 & 3-4
[0] Alltoall: 3: 0-262144 & 0-2
[0] Alltoall: 3: 513-2147483647 & 33-2147483647
[0] Alltoall: 4: 0-2147483647 & 0-2147483647
[0] Alltoallv: 1: 0-2147483647 & 0-32
[0] Alltoallv: 2: 0-2147483647 & 0-2147483647
[0] Alltoallw: 0: 0-2147483647 & 0-2147483647
[0] Barrier: 2: 0-2147483647 & 0-32
[0] Barrier: 4: 0-2147483647 & 0-2147483647
[0] Bcast: 1: 0-1024 & 0-8
[0] Bcast: 1: 0-8192 & 3-4
[0] Bcast: 7: 0-2147483647 & 0-2147483647
[0] Exscan: 0: 0-2147483647 & 0-2147483647
[0] Gather: 1: 0-64 & 0-2
[0] Gather: 3: 0-2147483647 & 0-2147483647
[0] Gatherv: 1: 0-2147483647 & 0-2147483647
[0] Reduce_scatter: 5: 17-32 & 9-16
[0] Reduce_scatter: 5: 17-64 & 17-32
[0] Reduce_scatter: 5: 0-128 & 33-2147483647
[0] Reduce_scatter: 1: 0-32768 & 3-2147483647
[0] Reduce_scatter: 2: 0-2147483647 & 0-2147483647
[0] Reduce: 3: 524289-2097152 & 0-2
[0] Reduce: 3: 129-8192 & 3-4
[0] Reduce: 1: 0-2147483647 & 0-2147483647
[0] Scan: 0: 0-2147483647 & 0-2147483647
[0] Scatter: 1: 0-2048 & 0-2
[0] Scatter: 3: 0-2147483647 & 0-2147483647
[0] Scatterv: 1: 0-2147483647 & 0-2
[0] Scatterv: 2: 0-2147483647 & 0-2147483647
[0] Rank    Pid      Node name  Pin cpu
[0] 0       12100    n006       n/a
[0] 1       12099    n006       n/a
[0] 2       32239    n007       {0,1,2,3,4,5,12,13,14,15,16,17}
[0] 3       32240    n007       {6,7,8,9,10,11,18,19,20,21,22,23}
[0] 4       3951     n009       {0,1,2,3,4,5,12,13,14,15,16,17}
[0] 5       3952     n009       {6,7,8,9,10,11,18,19,20,21,22,23}

benchmarking intel mpi
module load intel-mpi/4.0.0.027
module load  intel-cc-11/11.1.072
NPROCS=4
mpirun -n $NPROCS  env I_MPI_MPD_RSH=ssh env I_MPI_DEBUG=5 env PATH=$PATH  env LD_LIBRARY_PATH=$LD_LIBRARY_PATH /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/IMB-MPI1
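
To run the same benchmark as a batch job rather than interactively, a minimal PBS sketch reusing the select/mpiprocs pattern from the scripts above (job name and walltime are placeholders) could be:

#!/bin/bash -l
#PBS -N imb_bench
#PBS -l walltime=01:00:00
#PBS -l select=2:ncpus=2:mpiprocs=2
source $HOME/.bashrc
module load intel-mpi/4.0.0.027
module load intel-cc-11/11.1.072
export I_MPI_MPD_RSH=ssh
NPROCS=4
mpirun -n $NPROCS /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/IMB-MPI1 2>&1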

openMPI usage

Please note that the syntax for openMPI's mpirun is a little different from Intel MPI's mpirun. Please see a sample below.

module load mpi/openMPI/1.4.3-gnu

###For openmpi, use the following syntax####
#module load mpi/openmpi/4.0.2
#mpiexec $PROGRAM_NAME < $INPUT_FILE >& $OUTPUT_FILE

Sample run

openMPI has a different mechanism for passing the local environment variables from the master node to the slave nodes. You have three options:

1. source ~/.bashrc
2. Use mpirun's --prefix command line option (described below).
3. Modify the wrapper compilers to include directives for the run-time search locations of the Open MPI libraries (see the Open MPI FAQ).

mpirun's --prefix command line option takes as an argument the top-level directory where Open MPI was installed. While relative directory names are possible, they can become ambiguous depending on the job launcher used; using absolute directory names is strongly recommended.

For example, say that Open MPI was installed into /sw/openMPI/1.4.3-gnu/. You would use the --prefix option like this:

shell$ mpirun --prefix /sw/openMPI/1.4.3-gnu/ -np 4 a.out

This will prefix the PATH and LD_LIBRARY_PATH on both the local and remote hosts with /sw/openMPI/1.4.3-gnu/bin and /sw/openMPI/1.4.3-gnu/lib, respectively.

Beginning with the 1.2 series, it is possible to make this the default behavior by passing to configure the flag --enable-mpirun-prefix-by-default. This will make mpirun behave exactly the same as "mpirun --prefix $prefix ...", where $prefix is the value given to --prefix in configure.
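
For example, if you were building Open MPI yourself, the configure step might look like the following (install prefix taken from the example above, shown for illustration only):

./configure --prefix=/sw/openMPI/1.4.3-gnu --enable-mpirun-prefix-by-default
make all install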

Finally, note that specifying the absolute pathname to mpirun is equivalent to using the --prefix argument. For example, the following is equivalent to the above command line that uses --prefix:

/sw/openMPI/1.4.3-gnu/bin/mpirun -np 4 a.out

A sample PBS script that uses this approach:

#PBS -m e
#PBS -M <YourEmail>@griffith.edu.au
#PBS -N Test
#PBS -l select=6:ncpus=4:mem=12g:mpiprocs=4
source $HOME/.bashrc
module load swan/mpi/4085-gnu
## The number of nodes is given by the select=<NUM> above
NODES=6

###$PBS_NODEFILE is a node-list file created with select and mpiprocs options by PBS
###### The number of MPI processes available is mpiprocs * nodes (=NPROCS)
NPROCS=24

echo "Starting job"

/sw/openMPI/1.4.3-gnu/bin/mpirun -machinefile $PBS_NODEFILE -np $NPROCS  env PATH=$PATH  env LD_LIBRARY_PATH=$LD_LIBRARY_PATH swanrun -input ${SWAN_FORECAST_WORK}/swan_eau_forecast.swn
#
echo "Done with job"

mpd.host file

All compute Nodes

cat mpd.hosts
n001
n002
n003
n004
n005
n006
n007
n008
n009
n010
n011
n012
n013
n014
n015
n016
n017
n018
n019
n020
n021
n022
n023

GPU nodes

cat mpd.hosts
n020
n021
n022
n023
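
Inside a PBS job you do not have to maintain mpd.hosts by hand; it can be generated from the node list PBS provides, for example:

sort -u $PBS_NODEFILE > mpd.hosts   # PBS_NODEFILE repeats each node once per MPI process; keep unique names
cat mpd.hosts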

SGI mpi - SGI-mpt

Sample code

cat pingpong.c

#include <stdio.h>
#include "mpi.h"

#define ONE_MIL      (1000000)
#define MAX_SIZE_LOG (21)
#define MAX_SIZE     (1<<MAX_SIZE_LOG)

char sdata[MAX_SIZE];
char rdata[MAX_SIZE];

int main( int argc, char **argv )
{
    int              rank, i, size, j, next, prev;
    long             rounds, msglen;
    MPI_Status       status;
    double           startt,endt;
    char             buf[128];

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    if (rank == 0) {
        printf("STARTING LATENCY AND BANDWIDTH BENCHMARK\n");
        fflush(stdout);
    }

    next = (rank + 1 ) % size;
    prev = (rank + size - 1 ) % size;
    msglen = 0;

    while (msglen <= MAX_SIZE) {

        if (msglen <= (1<<9))       rounds = 1 << 15;
        else if (msglen <= (1<<12)) rounds = 1 << 12;
        else if (msglen <= (1<<18)) rounds = 1 << 10;
        else                        rounds = 1 << 8;

        MPI_Barrier ( MPI_COMM_WORLD );

        startt = MPI_Wtime();

        for (j=rounds; j--; ) {
            if (rank == 0) {
                MPI_Send(sdata,msglen,MPI_BYTE,next,0,MPI_COMM_WORLD);
                MPI_Recv(rdata,msglen,MPI_BYTE,prev,0,MPI_COMM_WORLD,&status);
            } else {
                MPI_Recv(rdata,msglen,MPI_BYTE,prev,0,MPI_COMM_WORLD,&status);
                MPI_Send(sdata,msglen,MPI_BYTE,next,0,MPI_COMM_WORLD);
            }
        }
        if (rank == 0) {
            double latency,bandwidth, elapse;

            endt = MPI_Wtime();
            elapse = endt - startt;

            latency   = elapse / (double)(size*rounds) * (double)ONE_MIL;
            bandwidth = ((double)(size*rounds*msglen))/elapse/(double)ONE_MIL;
            printf("%3d %8.3f %8.3f\n",(int)msglen,latency,bandwidth);
            fflush(stdout);
        }
        if (msglen == 0) msglen = 1;
        else msglen = 2 * msglen;
    }
    MPI_Finalize();
    return 0;
}


module setup

module avail
module load mpt

module list
Currently Loaded Modulefiles:
  1) mpt/2.04



Compile the program

gcc -o pingpong_mpt pingpong.c -lmpi

sample run - master node

mpirun 2 ./pingpong_mpt


cat mpd.hosts
n005
n006

 mpirun -v n005,n006 2 ./pingpong_mpt

mpirun -v n003,n004 2 ./pingpong_mpt
MPI: libxmpi.so 'SGI MPT 2.04  03/15/11 05:36:27'
MPI: libmpi.so  'SGI MPT 2.04  03/15/11 05:34:18'
MPI: libmpi.so  'SGI MPT 2.04  03/15/11 05:34:18'
MPI: Cluster collective optimizations enabled.
	MPI Environmental Settings
MPI: MPI_VERBOSE (default: 0) : 1
MPI: Using the InfiniBand interconnect.
STARTING LATENCY AND BANDWIDTH BENCHMARK
  0    3.770    0.000
  1    3.147    0.318
  2    3.120    0.641
  4    3.126    1.279
  8    3.141    2.547
 16    3.185    5.024
 32    3.213    9.959
 64    3.603   17.765
128    3.891   32.896
256    4.243   60.342
512    4.937  103.709
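
To run the same MPT binary under PBS, a sketch of a job script (assuming the host-list mpirun syntax shown above and deriving the hosts from $PBS_NODEFILE) could be:

#!/bin/bash -l
#PBS -N pingpong_mpt
#PBS -l select=2:ncpus=1:mpiprocs=1
#PBS -l walltime=00:30:00
cd $PBS_O_WORKDIR
module load mpt/2.04
# Comma-separated list of the unique nodes PBS allocated
HOSTS=$(sort -u $PBS_NODEFILE | paste -sd, -)
mpirun -v $HOSTS 2 ./pingpong_mpt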



More Examples

hello_mpi

cat hello_mpi.c

/*The Parallel Hello World Program*/
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
   int node;
   
   MPI_Init(&argc,&argv);
   MPI_Comm_rank(MPI_COMM_WORLD, &node);
     
   printf("Hello World from Node %d\n",node);
            
   MPI_Finalize();
   return 0;
}

To compile this:

module load intel-fc-11/12.0.2.137
module load  intel-mpi/4.0.1.007
module load mpi/intel-4.0

mpicc -o hello_mpi hello_mpi.c -lmpi

Sample PBS script
cat pbs.run7

#!/bin/bash -l
#PBS -m abe
### Mail to user
#PBS -M <yourEmail>@griffith.edu.au
### Job name
#PBS -N mpi
#PBS -l walltime=60:00:00
## Please note the walltime above. This value *must* be set so that if the
## MPI program runs in an infinite loop, or something similar, it will be
## killed after the given wall time.
### Number of nodes : Number of CPUs per node : Number of MPI processes per node
#PBS -l select=2:ncpus=12:mpiprocs=7
## The number of nodes is given by the select=<NUM> above
NODES=2
##$PBS_NODEFILE is a node-list file created with select and mpiprocs options by PBS
### The number of MPI processes available is mpiprocs * nodes
NPROCS=14
# This job's working directory
echo "Working directory is $PBS_O_WORKDIR"
cd $PBS_O_WORKDIR
source $HOME/.bashrc
module load intel-fc-11/12.0.2.137
module load  intel-mpi/4.0.1.007
module load mpi/intel-4.0
echo "Starting job"
echo Running on host `hostname`
echo Time is `date`
echo Directory is `pwd`
#echo This job runs on the following processors:
echo `cat $PBS_NODEFILE`
mpirun -f $PBS_NODEFILE -n "$NODES" -r ssh -n "$NPROCS"    /export/home/snumber/pbs/mpi/2/hello_mpi
echo "Done with job"


qsub  pbs.run7
2659.pbsserver

qstat
2659.pbsserver    mpi              s123456          00:00:00 R workq

cat  mpi.o2659
Working directory is /export/home/SNUMBER/pbs/mpi/2
Starting job
Running on host n010
Time is Wed Jul 27 08:37:14 EST 2011
Directory is /export/home/SNUMBER/pbs/mpi/2
n010 n010 n010 n010 n010 n010 n010 n020 n020 n020 n020 n020 n020 n020
Hello World from Node 0
Hello World from Node 2
Hello World from Node 4
Hello World from Node 6
Hello World from Node 3
Hello World from Node 1
Hello World from Node 5
Hello World from Node 9
Hello World from Node 11
Hello World from Node 7
Hello World from Node 13
Hello World from Node 8
Hello World from Node 10
Hello World from Node 12
Done with job

qsub -I pbs.run7
cd $PBS_O_WORKDIR
module load intel-fc-11/12.0.2.137
module load  intel-mpi/4.0.1.007
module load mpi/intel-4.0

mpirun -f $PBS_NODEFILE -n 2 -r ssh -n 14    /export/home/SNUMBER/pbs/mpi/2/hello_mpi
Hello World from Node 0
Hello World from Node 2
Hello World from Node 1
Hello World from Node 4
Hello World from Node 6
Hello World from Node 3
Hello World from Node 8
Hello World from Node 5
Hello World from Node 13
Hello World from Node 7
Hello World from Node 12
Hello World from Node 10
Hello World from Node 11
Hello World from Node 9

OR

 mpirun -r ssh -f $PBS_NODEFILE --totalnum=$NPROCS --verbose -l -machinefile $PBS_NODEFILE -np $(wc -l  $PBS_NODEFILE | gawk '{print $1}')  /export/home/snumber/pbs/mpi/2/hello_mpi
running mpdallexit on n016
LAUNCHED mpd on n016  via
RUNNING: mpd on n016
LAUNCHED mpd on n020  via  n016
RUNNING: mpd on n020
3: Hello World from Node 3
2: Hello World from Node 2
0: Hello World from Node 0
6: Hello World from Node 6
4: Hello World from Node 4
1: Hello World from Node 1
5: Hello World from Node 5
7: Hello World from Node 7
11: Hello World from Node 11
12: Hello World from Node 12
10: Hello World from Node 10
13: Hello World from Node 13
8: Hello World from Node 8
9: Hello World from Node 9

The following could work as well

mpirun -r ssh -f $PBS_NODEFILE --totalnum=$NPROCS --verbose -l -machinefile $PBS_NODEFILE -np 8 /export/home/SNUMBER/pbs/mpi/p1.out

mpirun -r ssh -f $PBS_NODEFILE --totalnum=$NPROCS --verbose -l -machinefile $PBS_NODEFILE -np $(wc -l  $PBS_NODEFILE | gawk '{print $1}') /export/home/SNUMBER/pbs/mpi/p1.out

--totalnum           specifies the total number of mpds to start
-np <n> or -n <n>    number of processes to start

mpirun -f $PBS_NODEFILE -n  $(cat  $PBS_NODEFILE | gawk '{print $1}'|sort|uniq|wc -l) -r ssh -n  $(wc -l  $PBS_NODEFILE | gawk '{print $1}') /export/home/SNUMBER/pbs/mpi/p1.out

To check if mpd is working well:

mpdcheck -f $PBS_NODEFILE -v
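
You can also bring up an MPD ring by hand and verify it, using the same commands as the workaround script earlier on this page:

mpdboot -n $NODES -f $PBS_NODEFILE -r ssh -v   # start one mpd per node
mpdtrace                                       # list the nodes in the ring
mpdallexit                                     # shut the ring down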


PBS select line

#PBS -l select=2:ncpus=8:mpiprocs=8

The line "-l select=2:ncpus=8:mpiprocs=8" is the number of processors required for the mpi job. "select" specifies the number of nodes required; "ncpus" indicates the number of CPUs per node required; and "mpiprocs" represents the number of mpi processes to run per node (normally ncpus=mpiprocs).

As this is not the most intuitive command, the following table is provided as guidance as to how this command works:

select   ncpus   mpiprocs   description
4        8       8          32 processor job, using 4 nodes and 8 processors per node
4        4       4          16 processor job, using 4 nodes and 4 processors per node
16       1       1          16 processor job, using 16 nodes running 1 mpi process per processor and utilising 1 processor per node
16       8       8          128 processor job, using 16 nodes and 8 processors per node (each running an mpi process)
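
For instance, the last row of the table would be requested like this, with NPROCS set to select x mpiprocs in the job script:

#PBS -l select=16:ncpus=8:mpiprocs=8
NPROCS=128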

References

1. http://www.hpcu.uq.edu.au/hpc/content/view/225/34/
2. http://www.oerc.ox.ac.uk/computing-resources/osc/support/documentation-help/job-schedulers/pbs/pbs-job-submission-scripts
3. http://www.cardiff.ac.uk/arcca/services/equipment/User-Guide/user-guide.html

PS: Tight integration of Intel MPI and PBS

For tight integration of Intel MPI and PBS, the following needs to be done:

#1 ==> On all execution nodes, add the following:

-->  vi /opt/pbs/default/bin/pbs_remsh

while [ $# -gt 1 ]; do
        if [ "XX$1" = "XX-j" ]; then
                shift;
                jobid=$1
                shift;
        elif [ "XX$1" = "XX-r" ]; then
                shift;
                rshcmd=$1
                shift;
#Add the following 2 lines here.
        elif [ "XX$1" = "XX-n" ]; then
                shift;
        else
                break;
        fi
done

#2 ==> vi  /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpdboot.py

(search for ssh and find an entry similar to this)

 else:
        if rshCmd == 'ssh':
            #rshArgs = '-x -n -q'
            #Added the following entry 
            rshArgs = '-x -n -q'
        elif rshCmd == 'pbs_tmrsh':
            rshArgs = ''
        elif rshCmd == 'pbs_remsh':
            rshArgs = ''
        else:
            rshArgs = ''
        mpdHost = hostsAndInfo[idxToStart]['host']


#3 Sample PBS script

>>>>>>>>>>>>>>>>>>>>>>>
#!/bin/bash -l
#PBS -m abe
#PBS -V
#PBS -M YOUREMAIL@griffith.edu.au
### Job name
#PBS -N Intel_mpi_test
#PBS -l walltime=100:00:00
### Number of nodes : Number of CPUs per node : Number of MPI processes per node
#PBS -l select=2:ncpus=4:mpiprocs=4
NPROCS=8
#cd $PBS_O_WORKDIR
source $HOME/.bashrc
module load intel-mpi/4.0.0.027
module load  intel-cc-11/11.1.072
##export I_MPI_MPD_RSH=pbs_remsh
export I_MPI_MPD_RSH=ssh
export I_MPI_DEBUG=100
export I_MPI_PLATFORM=auto
PBS_EXEC=/opt/pbs/default
export PATH=${PATH}:${PBS_EXEC}/bin
mpirun -n $NPROCS /bin/hostname  2>&1

>>>>>>>>>>>>>>>>>>>>>>>

# 4: Further troubleshooting needed?
If further troubleshooting is needed, add the following entry to /opt/pbs/default/bin/pbs_remsh
-----At the beginning --------
#!/bin/sh
exec >/tmp/remsh.debug 2>&1
set -x
---------------------

At the very bottom, add the following

logger "pbs_remsh cooked options: $remsh $host pbs_attach -j $PBS_JOBID $*"
$remsh "$host" ${PBS_EXEC}/pbs_attach -j "$PBS_JOBID" $*

----------------------

Look at the logs /tmp/remsh.debug and tail -f /var/log/messages to identify issues

Add this entry into :  /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpdboot.py

#!/bin/sh
exec >/tmp/remsh.txt 2>&1
set -x

-------------------

tail /tmp/remsh.txt
n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_6956 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 52589 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2


Run it like this to check:

 pbs_remsh n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_6956 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 52589 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2



tail /tmp/remsh.debug
+ PBS_EXEC=/opt/pbs/default/bin
+ logger 'pbs_remsh cooked options: /usr/bin/ssh -n -n pbs_attach -j 214742.pbsserver n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_9340 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 35698 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2'
+ /usr/bin/ssh -n -n /opt/pbs/default/bin/pbs_attach -j 214742.pbsserver n023 env I_MPI_JOB_TAGGED_PORT_OUTPUT=1 HOSTNAME=n022 MPD_CON_EXT=214742.pbsserver_9340 TMPDIR=/scratch/pbs.214742.pbsserver /sw/sdev/intel/mpi/4.0.0.027/x86_64/intel64/bin/mpd.py -h n022 -p 35698 --ifhn=10.110.2.123 --ncpus=1 --myhost=n023 --myip=10.110.2.123 -e -d -s 2
ssh: illegal option -- j
usage: ssh [-1246AaCfgKkMNnqsTtVvXxYy] [-b bind_address] [-c cipher_spec]
           [-D [bind_address:]port] [-e escape_char] [-F configfile]
           [-i identity_file] [-L [bind_address:]port:host:hostport]
           [-l login_name] [-m mac_spec] [-O ctl_cmd] [-o option] [-p port]
           [-R [bind_address:]port:host:hostport] [-S ctl_path]
           [-w local_tun[:remote_tun]] [user@]hostname [command]

Other commands that are useful when testing the integration:

pbs_remsh n023 /bin/hostname
mpirun -r pbs_remsh hostname
sh -x /sw/sdev/intel/mpi/4.0.0.027/x86_64/bin64/mpirun -r pbs_remsh hostname
mpdboot -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh
mpdboot -v -n 2 -f /var/spool/PBS/aux/214742.pbsserver --rsh=pbs_remsh
export MPD_CON_EXT=214742.pbsserver_8221

#5: Try submitting an interactive job

qsub -I -l  select=3:ncpus=4:mpiprocs=4 -l walltime=100:00:00

On the execution node:
export I_MPI_MPD_RSH=ssh
export I_MPI_DEBUG=100
NPROCS=12
module load  intel-cc-11/11.1.072
module load intel-mpi/4.0.0.027
mpirun -n $NPROCS /bin/hostname

An example from openmpi 4

hello_mpi.c

/*The Parallel Hello World Program*/
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
   int node;
   
   MPI_Init(&argc,&argv);
   MPI_Comm_rank(MPI_COMM_WORLD, &node);
     
   printf("Hello World from Node %d\n",node);
            
   MPI_Finalize();
   return 0;
}

To compile this:
module load mpi/openmpi/4.0.2

mpicc -o hello_mpi hello_mpi.c -lmpi


##mpirun -machinefile $PBS_NODEFILE -np $NPROCS hello_mpi



#!/bin/bash 
###PBS -m abe
### Mail to user
##PBS -M <yourEmail>@griffith.edu.au
### Job name
#PBS -N mpi
### Number of nodes : Number of CPUs per node : Number of MPI processes per node
#PBS -l select=2:ncpus=2:mpiprocs=2:mem=1g,walltime=10:00:00
## The number of chunks is given by the select=<NUM> above
CHUNKS=2
##$PBS_NODEFILE is a node-list file created with select and mpiprocs options by PBS
### The number of MPI processes available is mpiprocs * CHUNKS
NPROCS=4
# This job's working directory
echo "Working directory is $PBS_O_WORKDIR"
cd $PBS_O_WORKDIR
source $HOME/.bashrc
module load  mpi/openmpi/4.0.2
echo "Starting job"
echo Running on host `hostname`
echo Time is `date`
echo Directory is `pwd`
#echo This job runs on the following processors:
echo `cat $PBS_NODEFILE`
mpirun -machinefile $PBS_NODEFILE -np $NPROCS ./hello_mpi
echo "Done with job"