// SAXPY example adapted from https://devblogs.nvidia.com/parallelforall/easy-introduction-cuda-c-and-c/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Kernel: each thread computes one element of y = a*x + y
__global__
void saxpy(int n, float a, float *x, float *y)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
}

int main(void)
{
    int N = 1 << 20;
    float *x, *y, *d_x, *d_y;

    // Allocate host and device memory
    x = (float*) malloc(N * sizeof(float));
    y = (float*) malloc(N * sizeof(float));
    cudaMalloc(&d_x, N * sizeof(float));
    cudaMalloc(&d_y, N * sizeof(float));

    // Initialize input data on the host
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Copy input data to the device
    cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);

    // Perform SAXPY on 1M elements: one thread per element, 256 threads per block
    saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, d_x, d_y);

    // Copy the result back to the host
    cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);

    // Every element should equal 2.0f * 1.0f + 2.0f = 4.0f
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmaxf(maxError, fabsf(y[i] - 4.0f));
    printf("Max error: %f\n", maxError);

    // Free device and host memory
    cudaFree(d_x);
    cudaFree(d_y);
    free(x);
    free(y);
}
The CUDA Toolkit only supports certain compilers as the nvcc host compiler.
On Turing, we offer:
On Wahab, we offer:
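If you need to tell nvcc explicitly which supported host compiler to use, the -ccbin option selects it. This is only a sketch; substitute one of the supported compilers listed above:
$ nvcc -ccbin g++ simple_cuda.cu -o simple_cuda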
To compile a CUDA program, please load the following modules:
$ module list
Currently Loaded Modules:
1) Slurm/17.02 2) gcc/4 3) cuda/8.0
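If these modules are not already loaded in your session, loading them explicitly would look like this (module versions follow the listing above):
$ module load gcc/4 cuda/8.0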
If you are planning on using the Fortran programming language with CUDA, please load the PGI compiler instead:
$ module load pgi/16
Lmod is automatically replacing "gcc/4" with "pgi/16"
Due to MODULEPATH changes the following have been reloaded:
1) cuda/8.0
$ module list
Currently Loaded Modules:
1) Slurm/17.02 2) binutils/2.28 3) pgi/16 4) cuda-supplement/8.0 5) cuda/8.0
Please notice that the Lmod system automatically unloads gcc and loads pgi for you. It also loads the required support modules for pgi, in this case binutils/2.28 and cuda-supplement/8.0.
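With the PGI toolchain loaded, a CUDA Fortran source file can be compiled with pgfortran and its -Mcuda flag. This is a minimal sketch; the file name saxpy.cuf is only an example:
$ pgfortran -Mcuda saxpy.cuf -o saxpy_fortran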
You can compile the example manually with nvcc:
$ nvcc simple_cuda.cu -o simple_cuda
For a larger project, we still recommend using a Makefile to manage your build process:
# Build every .cu source under src/ into build/ and link with nvcc
EXT = cu
SRCS = $(shell find src -name '*.$(EXT)')
OBJS = $(SRCS:src/%.$(EXT)=build/%.o)
BIN = simple_cuda

CC = nvcc
LD = nvcc
CFLAGS = -O2
LDFLAGS =

all: $(BIN)

$(BIN): $(OBJS)
	$(LD) $(LDFLAGS) $(OBJS) -o $(BIN)

build/%.o: src/%.$(EXT)
	@mkdir -p $(dir $@)
	$(CC) $(CFLAGS) -c $< -o $@

clean:
	rm -f build/*.o $(BIN)

.PHONY: all clean
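With this layout (CUDA sources under src/), building and running the program would look like this:
$ make
$ ./simple_cuda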
Below is a list of changes to the submission script that are worth noting. This submission script works for a single-host CUDA program.
--gres=gpu:1
request 1 GPU device
--partition=gpu
request that the job be submitted to the GPU partition
#!/bin/bash
#SBATCH --job-name=simple_cuda
#SBATCH --output=output
#SBATCH --partition=gpu
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --gres=gpu:1
module load gcc/4
module load cuda/9.2
./simple_cuda
Submitting and reviewing your job follows the same procedure as for the simple MPI program; please read the MPI Programming section.
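As a quick reminder, a typical submit-and-check sequence looks like this (the script file name simple_cuda.sh is only an example):
$ sbatch simple_cuda.sh
$ squeue -u $USER
$ cat output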