// Simple Hello World program with a gpu kernel call
//
// host - cpu and its memory (ram)
// device - gpu and its memory (vram)
//
// steps:
// 1.) copy input data from cpu memory to gpu memory
// 2.) load gpu code and execute it
// 3.) copy results from gpu memory to cpu memory
//
// * host/device communication travels across the PCIe bus
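// in API terms, the three steps above map onto calls like these (sketch only;
// h_buf, d_buf and kernel are placeholder names):
//   1.) cudaMemcpy(d_buf, h_buf, size, cudaMemcpyHostToDevice);
//   2.) kernel<<<blocks, threads>>>(d_buf);
//   3.) cudaMemcpy(h_buf, d_buf, size, cudaMemcpyDeviceToHost);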
#include <iostream>
#include <math.h>
#include <stdio.h> // for printf (not guaranteed to be pulled in by <iostream>)
// __global__ marks a function that runs on the device (gpu) and is compiled by the NVIDIA device compiler (nvcc)
// such a function is called (launched) from host code, here main(), which runs on the cpu and is compiled by the host compiler (gcc, cl.exe, etc.)
__global__ void mykernel(void) {
    // empty kernel: it executes on the device but does nothing
}
int main(void) {
    mykernel<<<1, 1>>>(); // the triple angle brackets mark a call from the host to the device ("kernel launch")
    printf("Hello World!\n"); // this printf runs on the host
    return 0;
}
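// A minimal variant (a sketch, assuming device-side printf is available, i.e. compute
// capability 2.0 or newer): the same Hello World, but the message is printed from the
// device itself, and cudaDeviceSynchronize() makes the host wait for the kernel to
// finish (and flush its output buffer) before exiting.
#include <stdio.h>
__global__ void hello_from_device(void) {
    printf("Hello World from the device!\n"); // this printf executes on the gpu
}
int main(void) {
    hello_from_device<<<1, 1>>>(); // kernel launch, as above
    cudaDeviceSynchronize();       // block the host until the device work is done
    return 0;
}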
// Adding two vectors - introduction to blocks
// (a separate, standalone program; each listing in these notes compiles on its own)
#include <iostream>
#include <math.h>
#include <memory>
#include <assert.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <time.h>
void random_ints(int* a, int n) {
    for (int i = 0; i < n; ++i)
        a[i] = rand() % 5000; // fill the array with pseudo-random values in [0, 4999]
}
// the arguments are pointers (if x holds the address 10, then *x is the value stored at memory address 10)
// because this function runs on the device, these pointers must refer to device memory
__global__ void add(int* a, int* b, int* c) {
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
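// with the launch configuration add<<<N, 1>>>(...) used in main below, blockIdx.x takes the
// values 0 .. N-1, so each of the N blocks computes exactly one element of the result vector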
#define N 512
int main(void) {
int* a, * b, * c; // host copies of a, b, c
int* d_a, * d_b, * d_c; // device copies of a, b, c
int size = N * sizeof(int);
    // Alloc space for device copies of a, b, c
    cudaMalloc((void**)&d_a, size);
    cudaMalloc((void**)&d_b, size);
    cudaMalloc((void**)&d_c, size);
    // Alloc space for host copies of a, b, c and set up input values
    srand((unsigned)time(NULL)); // seed rand() so the inputs differ between runs (<time.h> is included above)
    a = (int*)malloc(size); random_ints(a, N);
    b = (int*)malloc(size); random_ints(b, N);
    c = (int*)malloc(size);
// Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
// Launch add() kernel on GPU with N blocks
add << <N, 1 >> > (d_a, d_b, d_c);
// Copy result back to host
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
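    // Optional sanity check (a sketch): verify the gpu result on the host;
    // assert() comes from <assert.h>, which this example already includes
    for (int i = 0; i < N; ++i)
        assert(c[i] == a[i] + b[i]);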
    // Cleanup
    free(a);
    free(b);
    free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
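// Sketch: the CUDA runtime calls used above (cudaMalloc, cudaMemcpy, cudaFree) each return a
// cudaError_t, and errors from a kernel launch can be picked up afterwards with
// cudaGetLastError() / cudaDeviceSynchronize(). A small wrapper macro is one common way to
// surface those errors; the name CUDA_CHECK here is just illustrative.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",                   \
                    cudaGetErrorString(err_), __FILE__, __LINE__);          \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
// Example usage (hypothetical placement inside main above):
//   CUDA_CHECK(cudaMalloc((void**)&d_a, size));
//   add<<<N, 1>>>(d_a, d_b, d_c);
//   CUDA_CHECK(cudaGetLastError());       // reports launch-configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());  // reports errors raised while the kernel ran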