GPU Programming with CUDA

Program #1: Hello World

File name: helloworld.cu

Source Code:

#include<stdio.h>

// Empty device kernel. It performs no work; it exists only so that main()
// has something to launch on the GPU.
__global__ void kernel()
{
}

// Launches the empty kernel with one block containing one thread, then
// prints a greeting from the host.
int main(void)
{
    kernel<<<1,1>>>();

    // The string literal must be delimited by plain ASCII double quotes;
    // typographic (curly) quotes are not valid C/C++ and do not compile.
    printf("Hello World \n");

    return 0;
}

Compilation and Output: nvcc -o helloworld helloworld.cu

Output: ./helloworld

Hello World

Program #2: Parameter Passing

File name: add.cu

Source Code:

#include<stdio.h>

// Adds the two integers received by value and stores the sum in the int
// that c points to. c is written from device code, so it has to be an
// address the device can write to (main passes memory from cudaMalloc).
__global__ void add(int a, int b, int *c)
{
    const int sum = a + b;
    c[0] = sum;
}

// Computes 2+7 on the device and prints the result on the host.
//
// Steps: allocate one int on the device, launch add with a single thread,
// copy the result back, print it, release the device memory.
// Returns 0 on success and 1 if any CUDA call fails (the error text is
// written to stderr and nothing is printed to stdout).
int main(void)
{
    int c = 0;
    int *dev_c = NULL;

    cudaError_t status = cudaMalloc((void**)&dev_c, sizeof(int));

    if (status == cudaSuccess)
    {
        add<<<1,1>>>(2, 7, dev_c);
        // A kernel launch returns nothing; launch failures are reported
        // through cudaGetLastError().
        status = cudaGetLastError();
    }

    if (status == cudaSuccess)
    {
        // Blocking copy: it also waits for the kernel to finish.
        status = cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
    }

    if (status == cudaSuccess)
    {
        printf("2+7=%d\n", c);
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    // dev_c is still NULL if the allocation failed; cudaFree(NULL) does nothing.
    cudaFree(dev_c);

    return (status == cudaSuccess) ? 0 : 1;
}

Output:

2+7=9

Program #3: CUDA device properties

Source Code:

#include<stdio.h>

// Prints a summary of the properties of every CUDA device the runtime reports.
// Returns 0 on success, 1 if the device count or a device's properties
// cannot be queried.
int main(void)
{
    cudaDeviceProp prop;
    int count = 0;

    // Check the query: without it a failure would leave the loop bound undefined.
    cudaError_t status = cudaGetDeviceCount(&count);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    for (int i = 0; i < count; i++)
    {
        status = cudaGetDeviceProperties(&prop, i);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", i, cudaGetErrorString(status));
            return 1;
        }

        printf("General information for device %d\n", i);
        printf("Name: %s\n", prop.name);
        printf("Compute Capability : %d %d\n", prop.major, prop.minor);
        printf("Clock rate: %d\n", prop.clockRate);

        // The memory sizes are size_t, so they are printed with %zu.
        printf("Total global memory %zu\n", prop.totalGlobalMem);
        printf("Total constant memory %zu\n", prop.totalConstMem);
        printf("Multiprocessor count %d\n", prop.multiProcessorCount);

        // The labels say "per mp", but the fields printed are the per-block
        // limits (sharedMemPerBlock and regsPerBlock).
        printf("Shared memory per mp %zu\n", prop.sharedMemPerBlock);
        printf("Register per mp %d\n", prop.regsPerBlock);

        printf("Threads in warp %d\n", prop.warpSize);
        printf("Maximum Threads per block %d\n", prop.maxThreadsPerBlock);
        printf("Maximum Thread dimensions (%d,%d,%d)\n",
               prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("Maximum grid dimensions (%d,%d,%d)\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    }

    return 0;
}

Output:

General information for device 0

Name: Tesla M2050

Compute Capability : 2 0

Clock rate: 1147000

Total global memory 2817982464

Total constant memory 65536

Multiprocessor count 14

Shared memory per mp 49152

Threads in warp 32

Maximum Threads per block 1024

Maximum Thread dimensions (1024,1024,64)

Maximum grid dimensions (65535,65535,65535)

General information for device 1

Name: Tesla M2050

Compute Capability : 2 0

Clock rate: 1147000

Total global memory 2817982464

Total constant memory 65536

Multiprocessor count 14

Shared memory per mp 49152

Threads in warp 32

Maximum Threads per block 1024

Maximum Thread dimensions (1024,1024,64)

Maximum grid dimensions (65535,65535,65535)

Program #4: Print device properties

Source Code:

#include <stdio.h>

// Print device properties

// Prints the fields of one cudaDeviceProp record to stdout, one per line.
//
// devProp: properties of a single device, as filled in by
//          cudaGetDeviceProperties().
//
// The size_t fields (totalGlobalMem, sharedMemPerBlock, memPitch,
// totalConstMem, textureAlignment) are printed with %zu; %u expects an
// unsigned int and is not a valid match for size_t.
void printDevProp(cudaDeviceProp devProp)
{
    printf("Major revision number: %d\n", devProp.major);
    printf("Minor revision number: %d\n", devProp.minor);
    printf("Name: %s\n", devProp.name);
    printf("Total global memory: %zu\n", devProp.totalGlobalMem);
    printf("Total shared memory per block: %zu\n", devProp.sharedMemPerBlock);
    printf("Total registers per block: %d\n", devProp.regsPerBlock);
    printf("Warp size: %d\n", devProp.warpSize);
    printf("Maximum memory pitch: %zu\n", devProp.memPitch);
    printf("Maximum threads per block: %d\n", devProp.maxThreadsPerBlock);

    for (int i = 0; i < 3; ++i)
        printf("Maximum dimension %d of block: %d\n", i, devProp.maxThreadsDim[i]);

    for (int i = 0; i < 3; ++i)
        printf("Maximum dimension %d of grid: %d\n", i, devProp.maxGridSize[i]);

    printf("Clock rate: %d\n", devProp.clockRate);
    printf("Total constant memory: %zu\n", devProp.totalConstMem);
    printf("Texture alignment: %zu\n", devProp.textureAlignment);
    printf("Concurrent copy and execution: %s\n", (devProp.deviceOverlap ? "Yes" : "No"));
    printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
    printf("Kernel execution timeout: %s\n", (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
}

// Queries the number of CUDA devices, prints the properties of each one via
// printDevProp(), then waits for a character on stdin before exiting.
// Returns 0 on success, 1 if the device count cannot be queried.
int main()
{
    // Number of CUDA devices. Initialised and checked so that a failed query
    // never leaves the loop bound undefined.
    int devCount = 0;
    cudaError_t status = cudaGetDeviceCount(&devCount);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    printf("CUDA Device Query...\n");
    printf("There are %d CUDA devices.\n", devCount);

    // Iterate through devices
    for (int i = 0; i < devCount; ++i)
    {
        printf("\nCUDA Device #%d\n", i);

        // Get device properties; skip the device if they cannot be read
        // rather than printing an unfilled struct.
        cudaDeviceProp devProp;
        status = cudaGetDeviceProperties(&devProp, i);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", i, cudaGetErrorString(status));
            continue;
        }

        printDevProp(devProp);
    }

    printf("\nPress any key to exit...");

    // Block until a character is available on stdin.
    char c;
    scanf("%c", &c);

    return 0;
}

Output:

CUDA Device Query...

There are 2 CUDA devices.

CUDA Device #0

Major revision number : 2

Minor revision number : 0

Name : Tesla M2050

Total global memory : 2817982464

Total shared memory per block : 49152

Total registers per block : 32768

Warp size : 32

Maximum memory pitch : 2147483647

Maximum threads per block : 1024

Maximum dimension 0 of block : 1024

Maximum dimension 1 of block : 1024

Maximum dimension 2 of block : 64

Maximum dimension 0 of grid : 65535

Maximum dimension 1 of grid : 65535

Maximum dimension 2 of grid : 65535

Clock rate : 1147000

Total constant memory : 65536

Texture alignment : 512

Concurrent copy and execution : Yes

Number of multiprocessors : 14

Kernel execution timeout : No

CUDA Device #1

Major revision number : 2

Minor revision number : 0

Name : Tesla M2050

Total global memory : 2817982464

Total shared memory per block : 49152

Total registers per block : 32768

Warp size : 32

Maximum memory pitch : 2147483647

Maximum threads per block : 1024

Maximum dimension 0 of block : 1024

Maximum dimension 1 of block : 1024

Maximum dimension 2 of block : 64

Maximum dimension 0 of grid : 65535

Maximum dimension 1 of grid : 65535

Maximum dimension 2 of grid : 65535

Clock rate : 1147000

Total constant memory : 65536

Texture alignment : 512

Concurrent copy and execution : Yes

Number of multiprocessors : 14

Kernel execution timeout : No

Press any key to exit...

Program #5: CPU Vector sum

Source Code:

#include<stdio.h>

#include<time.h>

#define N 10000

// CPU vector sum: stores a[i] + b[i] in c[i] for i = 0 .. N-1, where N is
// the file-level macro.
//
// a, b: input arrays holding at least N ints each.
// c:    output array with room for at least N ints.
void add(int *a, int *b, int *c)
{
    for (int tid = 0; tid < N; tid++)
    {
        c[tid] = a[tid] + b[tid];
    }
}

// Fills two N-element vectors (a[i] = -i, b[i] = i*i), adds them on the CPU
// with add(), reports the CPU time taken as measured by clock(), and prints
// every element of the result.
int main(void)
{
    int a[N], b[N], c[N];

    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i * i;
    }

    clock_t begin = clock();
    add(a, b, c);
    clock_t end = clock();

    double runtime = ((double)(end - begin)) / CLOCKS_PER_SEC;

    // CLOCKS_PER_SEC has type clock_t, which need not be int; cast it to a
    // type with a known printf conversion.
    printf("Clocks per second %ld\n", (long)CLOCKS_PER_SEC);
    printf("Run time = %lf\n", runtime);

    for (int i = 0; i < N; i++)
    {
        printf("%d+%d=%d\n", a[i], b[i], c[i]);
    }

    return 0;
}

Output (only the first 10 of the 10000 result lines are shown):

Clocks per second 1000000

Run time = 0.000000

0+0=0

-1+1=0

-2+4=2

-3+9=6

-4+16=12

-5+25=20

-6+36=30

-7+49=42

-8+64=56

-9+81=72

Program #6: GPU Vector Sum (N blocks, one thread each)

Source Code:

#include<stdio.h>

#include<time.h>

#define N 10

// Vector-sum kernel indexed by block: the thread uses its block index as
// the element index and writes c[i] = a[i] + b[i]. Block indices at or
// beyond N (the file-level macro) do nothing.
__global__ void add(int *a, int *b, int *c)
{
    const int block = blockIdx.x;

    if (block >= N)
        return;

    c[block] = a[block] + b[block];
}

// Host driver for the block-indexed vector sum.
// Allocates three N-int device buffers, fills the host inputs
// (a[i] = -i, b[i] = i*i), then times (with clock()) the two host-to-device
// copies, a launch of N blocks of one thread, and the copy of the result
// back. Prints the elapsed time and every sum, and frees the device buffers.
int main(void)
{
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;
    const size_t bytes = N * sizeof(int);

    cudaMalloc((void**)&dev_a, bytes);
    cudaMalloc((void**)&dev_b, bytes);
    cudaMalloc((void**)&dev_c, bytes);

    for (int k = 0; k != N; ++k)
    {
        a[k] = -k;
        b[k] = k * k;
    }

    // The timed region covers both uploads, the kernel and the download.
    const clock_t begin = clock();

    cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    add<<<N,1>>>(dev_a, dev_b, dev_c);
    cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);

    const clock_t end = clock();

    const double runtime = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("runtime = %lf seconds\n", runtime);

    for (int k = 0; k != N; ++k)
        printf("%d+%d=%d\n", a[k], b[k], c[k]);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return 0;
}

Output:

runtime = 0.000000 seconds

0+0=0

-1+1=0

-2+4=2

-3+9=6

-4+16=12

-5+25=20

-6+36=30

-7+49=42

-8+64=56

-9+81=72

Program #7: GPU Vector Sum using threads (one block, N threads)

Source Code:

#include<stdio.h>

#include<time.h>

#define N 10

// Vector-sum kernel indexed by thread: each thread of the block computes
// one element, c[tid] = a[tid] + b[tid].
//
// Expected launch: a single block of at least N threads (main uses
// add<<<1,N>>>). The index must come from threadIdx.x: with one block,
// blockIdx.x is 0 for every thread, so indexing by block would make all
// threads write c[0] and leave the other elements uncomputed.
// Threads with tid >= N (the file-level macro) do nothing.
__global__ void add(int *a, int *b, int *c)
{
    int tid = threadIdx.x;

    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

// Host driver for the single-block vector sum.
// Prepares the inputs (a[i] = -i, b[i] = i*i), then times with clock() the
// upload of a and b, one launch of a single block with N threads, and the
// download of c. Prints the elapsed time followed by every sum.
int main(void)
{
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // Device buffers have the same size as the host arrays.
    cudaMalloc((void**)&dev_a, sizeof(a));
    cudaMalloc((void**)&dev_b, sizeof(b));
    cudaMalloc((void**)&dev_c, sizeof(c));

    int i = 0;
    while (i < N)
    {
        a[i] = -i;
        b[i] = i * i;
        ++i;
    }

    clock_t begin = clock();

    cudaMemcpy(dev_a, a, sizeof(a), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, sizeof(b), cudaMemcpyHostToDevice);

    add<<<1,N>>>(dev_a, dev_b, dev_c);

    cudaMemcpy(c, dev_c, sizeof(c), cudaMemcpyDeviceToHost);

    clock_t end = clock();

    clock_t ticks = end - begin;
    double runtime = ((double)ticks) / CLOCKS_PER_SEC;

    printf("runtime = %lf seconds\n", runtime);

    for (i = 0; i < N; ++i)
    {
        printf("%d+%d=%d\n", a[i], b[i], c[i]);
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return 0;
}

Output:

runtime = 0.000000 seconds

0+0=0

-1+1=0

-2+4=2

-3+9=6

-4+16=12

-5+25=20

-6+36=30

-7+49=42

-8+64=56

-9+81=72

Program #8: Block-Thread demo1

Source Code:

#include<stdio.h>

// Each thread computes its global index (block index * block size + thread
// index within the block) and stores that index in its own slot of a.
// There is no bounds check, so a must have one int per launched thread.
__global__ void demo(int *a)
{
    const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;

    a[globalIdx] = globalIdx;
}

// Launches demo with 4 blocks of 3 threads, copies the 12 values written by
// the kernel back to the host, and prints them separated by spaces.
int main()
{
    // The kernel has no bounds check, so the buffer size is derived from
    // the launch shape instead of being a separate literal.
    const int blocks = 4;
    const int threadsPerBlock = 3;
    const int count = blocks * threadsPerBlock;

    int a[count];
    int *dev_a;

    cudaMalloc((void**)&dev_a, count * sizeof(int));

    demo<<<blocks, threadsPerBlock>>>(dev_a);

    // The array name already denotes the destination address.
    cudaMemcpy(a, dev_a, count * sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < count; i++)
        printf("%d ", a[i]);

    // Release the device buffer.
    cudaFree(dev_a);

    return 0;
}

Output:

0 1 2 3 4 5 6 7 8 9 10 11

Program #9: Block-Thread demo2

Source Code:

#include<stdio.h>

// Each thread stores the index of the block it belongs to in its own slot
// of a; the slot is the thread's global index (block index * block size +
// thread index within the block). There is no bounds check, so a must have
// one int per launched thread.
__global__ void demo(int *a)
{
    const int slot = threadIdx.x + blockDim.x * blockIdx.x;

    a[slot] = blockIdx.x;
}

// Launches demo with 4 blocks of 3 threads, copies the 12 block indices
// written by the kernel back to the host, and prints them separated by
// spaces.
int main()
{
    // The kernel has no bounds check, so the buffer size is derived from
    // the launch shape instead of being a separate literal.
    const int blocks = 4;
    const int threadsPerBlock = 3;
    const int count = blocks * threadsPerBlock;

    int a[count];
    int *dev_a;

    cudaMalloc((void**)&dev_a, count * sizeof(int));

    demo<<<blocks, threadsPerBlock>>>(dev_a);

    // The array name already denotes the destination address.
    cudaMemcpy(a, dev_a, count * sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < count; i++)
        printf("%d ", a[i]);

    // Release the device buffer.
    cudaFree(dev_a);

    return 0;
}

Output:

0 0 0 1 1 1 2 2 2 3 3 3

Program #10: Block-Thread demo3

Source Code:

#include<stdio.h>

// Each thread stores its index within its block in its own slot of a; the
// slot is the thread's global index (block index * block size + thread
// index within the block). There is no bounds check, so a must have one
// int per launched thread.
__global__ void demo(int *a)
{
    const int slot = threadIdx.x + blockDim.x * blockIdx.x;

    a[slot] = threadIdx.x;
}

// Launches demo with 4 blocks of 3 threads, copies the 12 thread indices
// written by the kernel back to the host, and prints them separated by
// spaces.
int main()
{
    // The kernel has no bounds check, so the buffer size is derived from
    // the launch shape instead of being a separate literal.
    const int blocks = 4;
    const int threadsPerBlock = 3;
    const int count = blocks * threadsPerBlock;

    int a[count];
    int *dev_a;

    cudaMalloc((void**)&dev_a, count * sizeof(int));

    demo<<<blocks, threadsPerBlock>>>(dev_a);

    // The array name already denotes the destination address.
    cudaMemcpy(a, dev_a, count * sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < count; i++)
        printf("%d ", a[i]);

    // Release the device buffer.
    cudaFree(dev_a);

    return 0;
}

Output:

0 1 2 0 1 2 0 1 2 0 1 2

Program #11: Squaring each number in an array

Source Code:

#include <stdio.h>

// Squares the elements of a in place, one element per thread.
//
// a: array of floats that the device can read and write.
// N: number of elements to process.
//
// A thread's element is its global index (block index * block size +
// thread index); threads whose index is N or more do nothing, so the grid
// may contain more threads than elements.
__global__ void square_array(float *a, int N)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    if (idx >= N)
        return;

    a[idx] *= a[idx];
}

// Squares the numbers 0..N-1 on the device and prints "index value" lines.
//
// Steps: allocate host and device arrays, fill the host array with i,
// upload it, launch square_array with enough 4-thread blocks to cover N,
// download the result, print it, free both arrays.
// Returns 0 on success and 1 if the host allocation or any CUDA call fails
// (the reason is written to stderr).
int main(void)
{
    const int N = 10;
    size_t size = N * sizeof(float);

    float *a_d = NULL;
    float *a_h = (float *)malloc(size);
    if (a_h == NULL)
    {
        fprintf(stderr, "malloc of %zu bytes failed\n", size);
        return 1;
    }

    for (int i = 0; i < N; i++)
        a_h[i] = (float)i;

    cudaError_t status = cudaMalloc((void **)&a_d, size);

    if (status == cudaSuccess)
        status = cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

    if (status == cudaSuccess)
    {
        int block_size = 4;
        // Ceiling division: the last block may be only partly used, which
        // the kernel's idx < N guard allows for.
        int n_blocks = (N + block_size - 1) / block_size;

        square_array <<< n_blocks, block_size >>> (a_d, N);

        // Launch failures are only visible through cudaGetLastError().
        status = cudaGetLastError();
    }

    if (status == cudaSuccess)
        status = cudaMemcpy(a_h, a_d, sizeof(float) * N, cudaMemcpyDeviceToHost);

    if (status == cudaSuccess)
    {
        for (int i = 0; i < N; i++)
            printf("%d %f\n", i, a_h[i]);
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    free(a_h);
    // a_d is still NULL if cudaMalloc failed; cudaFree(NULL) does nothing.
    cudaFree(a_d);

    return (status == cudaSuccess) ? 0 : 1;
}

Output:

0 0.000000

1 1.000000

2 4.000000

3 9.000000

4 16.000000

5 25.000000

6 36.000000

7 49.000000

8 64.000000

9 81.000000

Program #14: GPU Timer

#include <stdio.h>

#include "gputimer.h"

#define N 10

// Vector-sum kernel indexed by block: the thread takes its block index as
// the element index and writes c[i] = a[i] + b[i]. Block indices at or
// beyond N (the file-level macro) do nothing.
__global__ void add(int *a, int *b, int *c)
{
    const int element = blockIdx.x;

    if (element >= N)
        return;

    c[element] = a[element] + b[element];
}

// Host driver that times only the kernel launch with GpuTimer (declared in
// gputimer.h, not shown here).
// Uploads a[i] = -i and b[i] = i*i, brackets the launch of N one-thread
// blocks with timer.Start()/timer.Stop(), downloads the result, prints the
// value returned by timer.Elapsed() labelled as milliseconds, then prints
// every sum and frees the device buffers.
int main(void)
{
    GpuTimer timer;

    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;
    const size_t bytes = N * sizeof(int);

    cudaMalloc((void**)&dev_a, bytes);
    cudaMalloc((void**)&dev_b, bytes);
    cudaMalloc((void**)&dev_c, bytes);

    for (int k = 0; k != N; ++k)
    {
        a[k] = -k;
        b[k] = k * k;
    }

    // The uploads happen before the timer starts, so they are not measured.
    cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);

    timer.Start();
    add<<<N,1>>>(dev_a, dev_b, dev_c);
    timer.Stop();

    cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);

    printf("Time elapsed = %g ms\n", timer.Elapsed());

    for (int k = 0; k != N; ++k)
        printf("%d+%d=%d\n", a[k], b[k], c[k]);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return 0;
}

******** This page is under construction ********

Google Sites

Report abuse