#include <omp.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// a very simple fib number generator using openmp
// it is not the best way to compute fib, this is just an example
// A deliberately naive Fibonacci generator built on OpenMP tasks.
// This is not an efficient way to compute Fibonacci numbers -- it exists
// purely to demonstrate task creation and task synchronization.
int simple_fib(int n)
{
    int left, right;
    if (n < 2)
        return n;
    // Each recursive branch becomes an explicit task; n is captured
    // firstprivate by default, the results must be shared.
    #pragma omp task shared(left)
    left = simple_fib(n - 1);
    #pragma omp task shared(right)
    right = simple_fib(n - 2);
    // Wait for both child tasks before combining their results.
    #pragma omp taskwait
    return left + right;
}
// a simple demonstration of threading from OpenMP, useful to understand
// Slurm and mpirun submission options
// Print a greeting from every OpenMP thread on this process; thread 0
// additionally reports the team size. Useful for checking Slurm and
// mpirun thread-placement options.
void greeting(const char* host)
{
    #pragma omp parallel
    {
        // Declared inside the parallel region, so each thread gets its
        // own copy (equivalent to a private() clause).
        int tid = omp_get_thread_num();
        if (tid == 0)
            printf("on host %s %d threads will be launched\n", host, omp_get_num_threads());
        printf(" Hey from thread %d on %s\n", tid, host);
    }
}
// Entry point: hybrid MPI + OpenMP demo. Every rank computes one
// Fibonacci number with OpenMP tasks, then rank 0 gathers and prints
// all results.
int main (int argc, char *argv[])
{
    char host[1024];
    int rank, procs, host_len;
    int fib;
    int* fibs = NULL; // allocated on rank 0 only; MPI_Gather ignores recvbuf elsewhere
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &procs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Get_processor_name(host, &host_len);
    greeting(host);
    // demonstrate simple MPI communication, computes a fib number in each process
    // and send all results to rank 0 for display.
    // simple_fib() spawns OpenMP tasks; without an enclosing parallel
    // region those tasks would all execute serially on the calling
    // thread. Open a parallel region and let a single thread seed the
    // task tree so the team can actually run the tasks concurrently.
    #pragma omp parallel
    #pragma omp single
    fib = simple_fib(rank + 10);
    printf("on host %s, compute fib of %d and it is %d\n", host, rank + 10, fib);
    if (rank == 0) {
        fibs = malloc(sizeof(int) * procs);
        if (fibs == NULL) {
            fprintf(stderr, "rank 0: failed to allocate gather buffer\n");
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        }
    }
    MPI_Gather(&fib, 1, MPI_INT, fibs, 1, MPI_INT, 0, MPI_COMM_WORLD);
    if (rank == 0) {
        for (int i = 0; i < procs; i++) {
            printf("* from rank %d, gathered FIB number %d \n", i, fibs[i]);
        }
        free(fibs);
    }
    MPI_Finalize();
    return 0;
}
You should have the same environment as the simple MPI Example.
$ module list
Currently Loaded Modules:
1) slurm/17.02 2) gcc/4 3) openmpi/2.0
You can still compile it manually:
$ mpicc -fopenmp mpi_omp.c -o mpi_omp
The flag -fopenmp
instructs the compiler to build the code as an OpenMP program.
Do not forget it: without this flag the compilation may still succeed, but the compiler will silently ignore all OpenMP directives.
For a different compiler, you will also have to give different options to enable OpenMP. Here is a list of what you may see on Turing.
compiler | options |
---|---|
GCC | -fopenmp |
LLVM | -fopenmp |
ICC | -qopenmp |
PGI | -mp |
For a larger project, we still recommend using a Makefile to manage your build process.
# Discover sources under src/ and map them to objects under build/.
EXT = c
SRCS = $(shell find src -name '*.$(EXT)')
OBJS = $(SRCS:src/%.$(EXT)=build/%.o)
BIN = mpi_omp

# MPI wrapper compiler; -fopenmp must be passed at both compile and
# link time so OpenMP directives are honored and libgomp is linked.
CC = mpicc
LD = mpicc
CFLAGS = -fopenmp -O2
LDFLAGS = -fopenmp

# all/clean are commands, not files -- mark them phony.
.PHONY: all clean

all: $(BIN)

$(BIN): $(OBJS)
	$(LD) $(LDFLAGS) $(OBJS) -o $(BIN)

build/%.o: src/%.$(EXT)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -c $< -o $@

clean:
	rm -f build/*.o
	rm -f $(BIN)
The only change to this file from the simple MPI example is adding -fopenmp
to the compiler flags and linker flags.
Below is a list of changes to the submission script that are worth noting. This submission works for any MPI + multithreaded program
in general, such as MPI+OpenACC
or MPI+Pthreads.
CPUs Per Task --cpus-per-task
An OpenMP program requires a single process but multiple threads. This configuration can be requested with the --cpus-per-task
option to Slurm.
Environment Variable OMP_NUM_THREADS
OpenMP uses a number of environment variables to control its behavior. OMP_NUM_THREADS
is, perhaps, the most important one as it tells your application how many threads it can have. There are many more. You can find detailed information here.
#!/bin/bash
#SBATCH --job-name=simple_mpi_omp
#SBATCH --output=output
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=10
enable_lmod
module load gcc/4
module load openmpi/2.0
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
srun mpi_omp
Slurm manages CPU affinity on the user's behalf and will not allow a job to use more CPU resources than it was allocated. For instance, the
configuration below launches 2 processes and gives each process access to 10 cores:
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=10
If the user launches 10
threads per process (OMP_NUM_THREADS set to 10)
, then the correct configuration is achieved.
If the user launches fewer than 10
threads per process (OMP_NUM_THREADS set to less than 10)
, then CPU resources are wasted.
If the user launches more than 10
threads per process (OMP_NUM_THREADS set to more than 10)
, each process still only has access to 10 cores. Some threads must be suspended to let other threads work and then resume; this frequent context switching will likely cause a decrease in performance.