// modified saxpy example from https://devblogs.nvidia.com/parallelforall/easy-introduction-cuda-c-and-c/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/*
 * saxpy - single-precision "a times x plus y".
 *
 * Computes y[i] = a*x[i] + y[i] for every i in [0, n), updating y in place.
 *
 *   n  number of elements to process in x and y
 *   a  scalar multiplier applied to each element of x
 *   x  input vector (read only), at least n elements
 *   y  input/output vector, at least n elements
 *
 * When built with OpenACC enabled, the loop is offloaded to the accelerator.
 * When the pragmas are ignored, this is an ordinary sequential loop.
 */
void saxpy(int n, float a, float *x, float *y)
{
/* x and y are plain pointers, so the compiler cannot see their extents;
 * state the transfers explicitly: x is only read, y is read and written. */
#pragma acc kernels copyin(x[0:n]) copy(y[0:n])
/* Without restrict-qualified pointers the compiler must assume x and y may
 * overlap and would refuse to parallelize the loop. Each iteration touches
 * only element i, so the iterations are independent. */
#pragma acc loop independent
  for (int i = 0; i < n; ++i)
      y[i] = a*x[i] + y[i];
}
/*
 * Driver for the saxpy example.
 *
 * Allocates two vectors of 2^20 floats, fills x with 1.0 and y with 2.0,
 * runs saxpy with a = 2.0 (so every y[i] should become 4.0), and prints the
 * largest absolute deviation from 4.0.
 *
 * Returns 0 on success, EXIT_FAILURE if the host buffers cannot be allocated.
 */
int main(void)
{
        int N = 1<<20;
        float *x, *y;
        x = (float*) malloc(N * sizeof(float));
        y = (float*) malloc(N * sizeof(float));
        if (x == NULL || y == NULL) {
                fprintf(stderr, "Failed to allocate host buffers\n");
                /* free(NULL) is a no-op, so this is safe whichever call failed. */
                free(x);
                free(y);
                return EXIT_FAILURE;
        }
        for (int i = 0; i < N; i++) {
                x[i] = 1.0f;
                y[i] = 2.0f;
        }
        saxpy(N, 2.0f, x, y);
        float maxError = 0.0f;
        for (int i = 0; i < N; i++) {
                /* fabsf, not abs: abs() takes an int, so the float difference
                 * would be truncated and any error below 1.0 reported as 0. */
                maxError = fmaxf(maxError, fabsf(y[i] - 4.0f));
        }
        printf("Max error: %f\n", maxError);
        free(x);
        free(y);
        return 0;
}
To compile an OpenACC program, please load the following modules:
$ module load pgi/19 cuda/9.2
$ module list
Currently Loaded Modules:
  1) slurm/20.02   2) binutils/2.32   3) libstdcxx/4   4) pgi/19   5) cuda-supplement/9.2   6) cuda/9.2
You only need to load pgi/19 and cuda/9.2; the other modules will be loaded automatically.
You can still compile it manually:
$ pgcc -acc simple_acc.c -o simple_acc
The -acc flag instructs the compiler to build the code as an OpenACC program. Do not forget it: without the flag the compilation may still succeed, but the compiler will silently ignore all OpenACC directives.
For a larger project, we still recommend using a Makefile to manage your build process:
# Build simple_acc from every src/**/*.c file using the PGI compiler with
# OpenACC enabled. Objects are written under build/, mirroring the src/ layout.

# Source file extension.
EXT  := c
# Simple (:=) assignment so `find` runs once at parse time rather than on
# every expansion of SRCS.
SRCS := $(shell find src -name '*.$(EXT)')
OBJS := $(SRCS:src/%.$(EXT)=build/%.o)
BIN  := simple_acc

CC = pgcc
LD = pgcc

# -acc must be given at both compile and link time; without it the OpenACC
# directives are ignored.
CFLAGS  = -acc -O2
LDFLAGS = -acc

# all and clean are commands, not files; declaring them phony keeps them
# working even if a file with the same name exists.
.PHONY: all clean

all: $(BIN)

$(BIN): $(OBJS)
	$(LD) $(LDFLAGS) $^ -o $@

build/%.o: src/%.$(EXT)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -c $< -o $@

# $(RM) is `rm -f`, so clean succeeds even when nothing has been built yet.
# Removing $(OBJS) also covers objects in subdirectories of build/.
clean:
	$(RM) $(OBJS) $(BIN)
Since OpenACC requires GPU resources, the submission procedure is identical to that of a CUDA program. Please read the CUDA Programming section for details.
#!/bin/bash
# SLURM batch script that runs the simple_acc OpenACC example on a GPU node.
# Name under which the job appears in the queue.
#SBATCH --job-name=simple_acc
# Job output is written to a file named "output".
#SBATCH --output=output
# Submit to the "gpu" partition.
#SBATCH --partition=gpu
# Run a single task with four CPU cores allocated to it.
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
# Request one GPU for the job.
#SBATCH --gres=gpu:1
# NOTE(review): the build instructions load pgi/19, but this script loads
# gcc/4 -- confirm whether the PGI runtime libraries are found without the
# pgi module, or whether pgi/19 should be loaded here instead.
module load gcc/4
module load cuda/9.2
# Launch the executable from the current working directory.
./simple_acc
Submitting and reviewing your job follows the same procedure as described in the simple MPI program section. Please read the MPI Programming section for details.