// modified saxpy example from https://devblogs.nvidia.com/parallelforall/easy-introduction-cuda-c-and-c/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

void saxpy(int n, float a, float *x, float *y)
{
    // Distribute the loop iterations across the available OpenMP threads.
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
        y[i] = a*x[i] + y[i];
}

int main(void)
{
    int N = 1<<20;
    float *x, *y;
    x = (float*) malloc(N * sizeof(float));
    y = (float*) malloc(N * sizeof(float));

    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    saxpy(N, 2.0f, x, y);

    // Every element should now be 2*1 + 2 = 4; report the largest deviation.
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmaxf(maxError, fabsf(y[i] - 4.0f));
    printf("Max error: %f\n", maxError);

    free(x);
    free(y);
    return 0;
}
To compile an OpenMP program, first make sure the following modules are loaded:
$ module list
Currently Loaded Modules:
  1) slurm/17.02   2) gcc/4
You can then compile the program manually:
$ gcc -fopenmp -std=c99 simple_omp.c -o simple_omp -lm
The -fopenmp flag instructs the compiler to build the code as an OpenMP program. Do not forget it: without the flag the compilation may still succeed, but the compiler will silently ignore all OpenMP directives and produce a serial executable.
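If you are unsure whether OpenMP was actually enabled, a minimal check like the following (not part of the original example) uses the standard _OPENMP macro, which the compiler defines only when OpenMP is on, together with omp_get_max_threads():

#include <stdio.h>
#ifdef _OPENMP
#include <omp.h>
#endif

int main(void)
{
#ifdef _OPENMP
    // _OPENMP is defined only when the compiler enables OpenMP.
    printf("OpenMP enabled, up to %d threads\n", omp_get_max_threads());
#else
    printf("OpenMP not enabled; directives were ignored\n");
#endif
    return 0;
}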
Different compilers require different options to enable OpenMP. Here is a list of what you may see on Turing.
Compiler | Option
---|---
GCC | -fopenmp
LLVM | -fopenmp
ICC | -qopenmp
PGI | -mp
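For example, assuming the corresponding compilers are installed, the equivalent compile commands for the program above would look like the following (note that the spelling of the C99 flag also varies by compiler, e.g. PGI uses -c99):
$ gcc -fopenmp -std=c99 simple_omp.c -o simple_omp -lm
$ clang -fopenmp -std=c99 simple_omp.c -o simple_omp -lm
$ icc -qopenmp -std=c99 simple_omp.c -o simple_omp -lm
$ pgcc -mp -c99 simple_omp.c -o simple_omp -lm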
For a larger project, we still recommend using a Makefile to manage your build process.
EXT = c
SRCS = $(shell find src -name '*.$(EXT)')
OBJS = $(SRCS:src/%.$(EXT)=build/%.o)
BIN  = simple_omp

CC = gcc
LD = gcc
CFLAGS  = -fopenmp -std=c99 -O2
LDFLAGS = -fopenmp -lm

all: $(BIN)

$(BIN): $(OBJS)
	$(LD) $(OBJS) -o $(BIN) $(LDFLAGS)

build/%.o: src/%.$(EXT)
	@mkdir -p $(dir $@)
	$(CC) $(CFLAGS) -c $< -o $@

clean:
	rm -f build/*.o
	rm -f $(BIN)

.PHONY: all clean
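With this Makefile at the top of the project (sources under src/), a typical build-and-run cycle looks like:
$ make
$ ./simple_omp
$ make clean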
Below is a list of changes to the submission script that are worth noting:
CPUs Per Task --cpus-per-task
An OpenMP program runs as a single process with multiple threads. This configuration is requested with the --cpus-per-task
option to Slurm.
Environment Variable OMP_NUM_THREADS
OpenMP uses a number of environment variables to control its behavior, and OMP_NUM_THREADS
is perhaps the most important one. It tells your application how many threads to use, and it should always match --cpus-per-task.
You can find more detailed information regarding OpenMP Environment Variables here. A complete submission script combining these settings is shown below.
#!/bin/bash
#SBATCH --job-name=simple_omp
#SBATCH --output=output
#SBATCH --ntasks=1             # one process ...
#SBATCH --cpus-per-task=8      # ... with eight CPUs (threads)
enable_lmod
module load gcc/4
# Match the OpenMP thread count to the CPUs Slurm allocated.
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
./simple_omp
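Assuming the script above is saved as simple_omp.sh (the file name is arbitrary), it can be submitted with:
$ sbatch simple_omp.sh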
The procedure for submitting a job and reviewing its status is the same as in the MPI section. Please read MPI Programming for details.