// Simple Hello World program with a gpu kernel call
//
// host - cpu and its memory (ram)
// device - gpu and its memory (vram)
//
// steps:
// 1.) copy input data from cpu memory to gpu memory
// 2.) load gpu code and execute it
// 3.) copy results from gpu memory to cpu memory
//
// * host/device communication travels across the PCIe bus
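// in API terms, the three steps above map onto calls like these (sketch only;
// h_buf, d_buf and kernel are placeholder names):
//   1.) cudaMemcpy(d_buf, h_buf, size, cudaMemcpyHostToDevice);
//   2.) kernel<<<blocks, threads>>>(d_buf);
//   3.) cudaMemcpy(h_buf, d_buf, size, cudaMemcpyDeviceToHost);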
#include <iostream>
#include <math.h>
#include <stdio.h> // for printf (not guaranteed to be pulled in by <iostream>)
// __global__ marks a function that runs on the device (gpu) and is compiled by the NVIDIA device compiler (nvcc)
// such a function is called (launched) from host code, here main(), which runs on the cpu and is compiled by the host compiler (gcc, cl.exe, etc.)
__global__ void mykernel(void) {
    // empty kernel: it executes on the device but does nothing
}
int main(void) {
    mykernel<<<1, 1>>>(); // the triple angle brackets mark a call from the host to the device ("kernel launch")
    printf("Hello World!\n"); // this printf runs on the host
    return 0;
}
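// A minimal variant (a sketch, assuming device-side printf is available, i.e. compute
// capability 2.0 or newer): the same Hello World, but the message is printed from the
// device itself, and cudaDeviceSynchronize() makes the host wait for the kernel to
// finish (and flush its output buffer) before exiting.
#include <stdio.h>
__global__ void hello_from_device(void) {
    printf("Hello World from the device!\n"); // this printf executes on the gpu
}
int main(void) {
    hello_from_device<<<1, 1>>>(); // kernel launch, as above
    cudaDeviceSynchronize();       // block the host until the device work is done
    return 0;
}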
// Adding two vectors - introduction to blocks
// (a separate, standalone program; each listing in these notes compiles on its own)
#include <iostream>
#include <math.h>
#include <memory>
#include <assert.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <time.h>
void random_ints(int* a, int n) {
    for (int i = 0; i < n; ++i)
        a[i] = rand() % 5000; // fill the array with pseudo-random values in [0, 4999]
}
// the arguments are pointers (if x holds the address 10, then *x is the value stored at memory address 10)
// because this function runs on the device, these pointers must refer to device memory
__global__ void add(int* a, int* b, int* c) {
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
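// with the launch configuration add<<<N, 1>>>(...) used in main below, blockIdx.x takes the
// values 0 .. N-1, so each of the N blocks computes exactly one element of the result vector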
#define N 512
int main(void) {
int* a, * b, * c; // host copies of a, b, c
int* d_a, * d_b, * d_c; // device copies of a, b, c
int size = N * sizeof(int);
    // Alloc space for device copies of a, b, c
    cudaMalloc((void**)&d_a, size);
    cudaMalloc((void**)&d_b, size);
    cudaMalloc((void**)&d_c, size);
    // Alloc space for host copies of a, b, c and set up input values
    srand((unsigned)time(NULL)); // seed rand() so the inputs differ between runs (<time.h> is included above)
    a = (int*)malloc(size); random_ints(a, N);
    b = (int*)malloc(size); random_ints(b, N);
    c = (int*)malloc(size);
// Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
// Launch add() kernel on GPU with N blocks
add << <N, 1 >> > (d_a, d_b, d_c);
// Copy result back to host
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
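    // Optional sanity check (a sketch): verify the gpu result on the host;
    // assert() comes from <assert.h>, which this example already includes
    for (int i = 0; i < N; ++i)
        assert(c[i] == a[i] + b[i]);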
    // Cleanup
    free(a);
    free(b);
    free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
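// Sketch: the CUDA runtime calls used above (cudaMalloc, cudaMemcpy, cudaFree) each return a
// cudaError_t, and errors from a kernel launch can be picked up afterwards with
// cudaGetLastError() / cudaDeviceSynchronize(). A small wrapper macro is one common way to
// surface those errors; the name CUDA_CHECK here is just illustrative.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",                   \
                    cudaGetErrorString(err_), __FILE__, __LINE__);          \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
// Example usage (hypothetical placement inside main above):
//   CUDA_CHECK(cudaMalloc((void**)&d_a, size));
//   add<<<N, 1>>>(d_a, d_b, d_c);
//   CUDA_CHECK(cudaGetLastError());       // reports launch-configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());  // reports errors raised while the kernel ran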