Program #1: Hello World
File name: helloworld.cu
Source Code:
#include<stdio.h>
/* Empty device kernel: it takes no arguments and does no work.
 * It exists only so that main() has something to launch on the GPU. */
__global__ void kernel(void)
{
}
/* Evaluates a CUDA runtime call; on failure reports it on stderr and makes main() return 1. */
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        cudaError_t status_ = (call);                                    \
        if (status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,     \
                    cudaGetErrorString(status_));                        \
            return 1;                                                    \
        }                                                                \
    } while (0)

/*
 * Launches the empty kernel with one block of one thread, then prints the
 * greeting from the host.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main(void)
{
    kernel<<<1,1>>>();
    /* A launch returns no status; fetch launch-configuration errors explicitly. */
    CUDA_TRY(cudaGetLastError());
    /* Wait for the kernel so that execution errors are reported before exit. */
    CUDA_TRY(cudaDeviceSynchronize());
    /* Plain ASCII quotes: the typographic quotes used before do not compile. */
    printf("Hello World \n");
    return 0;
}
Compilation: nvcc -o helloworld helloworld.cu
Run: ./helloworld
Output:
Hello World
Program #2: Parameter Passing
File name: add.cu
Source Code:
#include<stdio.h>
/*
 * add - stores the sum of the two by-value integers a and b into the int
 * that c points to. c is dereferenced on the device, so it must be a
 * device-accessible pointer.
 */
__global__ void add(int a, int b, int *c)
{
    const int sum = a + b;
    c[0] = sum;
}
/* Evaluates a CUDA runtime call; on failure reports it on stderr and makes main() return 1. */
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        cudaError_t status_ = (call);                                    \
        if (status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,     \
                    cudaGetErrorString(status_));                        \
            return 1;                                                    \
        }                                                                \
    } while (0)

/*
 * Allocates one int on the device, launches add(2, 7, dev_c) with a single
 * thread, copies the result back and prints it.
 *
 * Returns 0 on success, 1 if any CUDA call fails (device memory still held
 * at that point is reclaimed when the process exits).
 */
int main(void)
{
    int c = 0;          /* host copy of the result; defined even if a copy fails */
    int *dev_c = NULL;  /* device buffer that receives the kernel's result */

    CUDA_TRY(cudaMalloc((void**)&dev_c, sizeof(int)));

    add<<<1,1>>>(2, 7, dev_c);
    /* A launch returns no status; fetch launch-configuration errors explicitly. */
    CUDA_TRY(cudaGetLastError());

    /* Blocking copy: also waits for the kernel and surfaces its execution errors. */
    CUDA_TRY(cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost));
    printf("2+7=%d\n", c);

    CUDA_TRY(cudaFree(dev_c));
    return 0;
}
Output:
2+7=9
Program #3: CUDA device properties
Source Code:
#include<stdio.h>
/* Evaluates a CUDA runtime call; on failure reports it on stderr and makes main() return 1. */
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        cudaError_t status_ = (call);                                    \
        if (status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,     \
                    cudaGetErrorString(status_));                        \
            return 1;                                                    \
        }                                                                \
    } while (0)

/*
 * Queries every CUDA device and prints a selection of its cudaDeviceProp
 * fields.
 *
 * Returns 0 on success, 1 if the device count or a property query fails.
 */
int main(void)
{
    cudaDeviceProp prop;
    int count = 0;  /* stays 0 if the query fails, so the loop is skipped */

    CUDA_TRY(cudaGetDeviceCount(&count));
    for (int i = 0; i < count; i++)
    {
        CUDA_TRY(cudaGetDeviceProperties(&prop, i));
        printf("General information for device %d\n", i);
        printf("Name: %s\n", prop.name);
        printf("Compute Capability : %d %d\n", prop.major, prop.minor);
        printf("Clock rate: %d\n", prop.clockRate);
        /* totalGlobalMem, totalConstMem and sharedMemPerBlock are size_t: use %zu. */
        printf("Total global memory %zu\n", prop.totalGlobalMem);
        printf("Total constant memory %zu\n", prop.totalConstMem);
        printf("Multiprocessor count %d\n", prop.multiProcessorCount);
        /* NOTE: the next two labels say "per mp" but the fields printed are the
         * per-block limits (sharedMemPerBlock, regsPerBlock). */
        printf("Shared memory per mp %zu\n", prop.sharedMemPerBlock);
        printf("Register per mp %d\n", prop.regsPerBlock);
        printf("Threads in warp %d\n", prop.warpSize);
        printf("Maximum Threads per block %d\n", prop.maxThreadsPerBlock);
        printf("Maximum Thread dimensions (%d,%d,%d)\n",
               prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("Maximum grid dimensions (%d,%d,%d)\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    }
    return 0;
}
Output:
General information for device 0
Name: Tesla M2050
Compute Capability : 2 0
Clock rate: 1147000
Total global memory 2817982464
Total constant memory 65536
Multiprocessor count 14
Shared memory per mp 49152
Register per mp 32768
Threads in warp 32
Maximum Threads per block 1024
Maximum Thread dimensions (1024,1024,64)
Maximum grid dimensions (65535,65535,65535)
General information for device 1
Name: Tesla M2050
Compute Capability : 2 0
Clock rate: 1147000
Total global memory 2817982464
Total constant memory 65536
Multiprocessor count 14
Shared memory per mp 49152
Register per mp 32768
Threads in warp 32
Maximum Threads per block 1024
Maximum Thread dimensions (1024,1024,64)
Maximum grid dimensions (65535,65535,65535)
Program #4: Print device properties
Source Code:
#include <stdio.h>
// Print device properties
//
// Writes one labelled line per field of devProp to stdout.
// The size_t members (totalGlobalMem, sharedMemPerBlock, memPitch,
// totalConstMem, textureAlignment) are printed with %zu; the previous %u
// mismatched the argument type and truncates where size_t is wider than
// unsigned int.
void printDevProp(cudaDeviceProp devProp)
{
    printf("Major revision number: %d\n", devProp.major);
    printf("Minor revision number: %d\n", devProp.minor);
    printf("Name: %s\n", devProp.name);
    printf("Total global memory: %zu\n", devProp.totalGlobalMem);
    printf("Total shared memory per block: %zu\n", devProp.sharedMemPerBlock);
    printf("Total registers per block: %d\n", devProp.regsPerBlock);
    printf("Warp size: %d\n", devProp.warpSize);
    printf("Maximum memory pitch: %zu\n", devProp.memPitch);
    printf("Maximum threads per block: %d\n", devProp.maxThreadsPerBlock);
    // Block limits first, then grid limits, one line per dimension index.
    for (int i = 0; i < 3; ++i)
        printf("Maximum dimension %d of block: %d\n", i, devProp.maxThreadsDim[i]);
    for (int i = 0; i < 3; ++i)
        printf("Maximum dimension %d of grid: %d\n", i, devProp.maxGridSize[i]);
    printf("Clock rate: %d\n", devProp.clockRate);
    printf("Total constant memory: %zu\n", devProp.totalConstMem);
    printf("Texture alignment: %zu\n", devProp.textureAlignment);
    printf("Concurrent copy and execution: %s\n", (devProp.deviceOverlap ? "Yes" : "No"));
    printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
    printf("Kernel execution timeout: %s\n", (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
}
/* Evaluates a CUDA runtime call; on failure reports it on stderr and makes main() return 1. */
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        cudaError_t status_ = (call);                                    \
        if (status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,     \
                    cudaGetErrorString(status_));                        \
            return 1;                                                    \
        }                                                                \
    } while (0)

/*
 * Prints the number of CUDA devices, then the properties of each one via
 * printDevProp(), and finally waits for one character on stdin.
 *
 * Returns 0 on success, 1 if a CUDA query fails.
 */
int main()
{
    // Number of CUDA devices (0 until the query succeeds)
    int devCount = 0;
    CUDA_TRY(cudaGetDeviceCount(&devCount));
    printf("CUDA Device Query...\n");
    printf("There are %d CUDA devices.\n", devCount);

    // Iterate through devices
    for (int i = 0; i < devCount; ++i)
    {
        // Get device properties
        printf("\nCUDA Device #%d\n", i);
        cudaDeviceProp devProp;
        CUDA_TRY(cudaGetDeviceProperties(&devProp, i));
        printDevProp(devProp);
    }

    printf("\nPress any key to exit...");
    // Read a single character; end-of-file is as good a reason to exit as a key.
    (void)getchar();
    return 0;
}
Output:
CUDA Device Query...
There are 2 CUDA devices.
CUDA Device #0
Major revision number : 2
Minor revision number : 0
Name : Tesla M2050
Total global memory : 2817982464
Total shared memory per block : 49152
Total registers per block : 32768
Warp size : 32
Maximum memory pitch : 2147483647
Maximum threads per block : 1024
Maximum dimension 0 of block : 1024
Maximum dimension 1 of block : 1024
Maximum dimension 2 of block : 64
Maximum dimension 0 of grid : 65535
Maximum dimension 1 of grid : 65535
Maximum dimension 2 of grid : 65535
Clock rate : 1147000
Total constant memory : 65536
Texture alignment : 512
Concurrent copy and execution : Yes
Number of multiprocessors : 14
Kernel execution timeout : No
CUDA Device #1
Major revision number : 2
Minor revision number : 0
Name : Tesla M2050
Total global memory : 2817982464
Total shared memory per block : 49152
Total registers per block : 32768
Warp size : 32
Maximum memory pitch : 2147483647
Maximum threads per block : 1024
Maximum dimension 0 of block : 1024
Maximum dimension 1 of block : 1024
Maximum dimension 2 of block : 64
Maximum dimension 0 of grid : 65535
Maximum dimension 1 of grid : 65535
Maximum dimension 2 of grid : 65535
Clock rate : 1147000
Total constant memory : 65536
Texture alignment : 512
Concurrent copy and execution : Yes
Number of multiprocessors : 14
Kernel execution timeout : No
Press any key to exit...
Program #5: CPU Vector sum
Source Code:
#include<stdio.h>
#include<time.h>
#define N 10000
/*
 * add - element-wise sum on the host: c[i] = a[i] + b[i] for i in [0, N).
 * All three arrays must hold at least N ints.
 */
void add(int *a, int *b, int *c)
{
    for (int i = 0; i < N; ++i)
        c[i] = a[i] + b[i];
}
/*
 * Fills a[i] = -i and b[i] = i*i, times one call to add() with clock(),
 * then prints the clock resolution, the measured CPU time and every sum.
 */
int main(void)
{
    int a[N], b[N], c[N];

    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i * i;
    }

    clock_t begin = clock();
    add(a, b, c);
    clock_t end = clock();
    double runtime = ((double)(end - begin)) / CLOCKS_PER_SEC;

    /* CLOCKS_PER_SEC has type clock_t, not int: convert explicitly so the
     * argument matches the conversion specifier. */
    printf("Clocks per second %ld\n", (long)CLOCKS_PER_SEC);
    printf("Run time = %lf\n", runtime);

    for (int i = 0; i < N; i++)
    {
        printf("%d+%d=%d\n", a[i], b[i], c[i]);
    }
    return 0;
}
Output:
Clocks per second 1000000
Run time = 0.000000
0+0=0
-1+1=0
-2+4=2
-3+9=6
-4+16=12
-5+25=20
-6+36=30
-7+49=42
-8+64=56
-9+81=72
... (the listing continues in the same pattern up to i = 9999; only the first ten lines are shown)
Program #6: GPU Vector Sum (N blocks, one thread each)
Source Code:
#include<stdio.h>
#include<time.h>
#define N 10
/*
 * add - each block handles the element whose index equals its blockIdx.x:
 * c[b] = a[b] + b_in[b]. Blocks whose index is N or more do nothing.
 * Only blockIdx.x is used for indexing, so every thread of a block
 * targets the same element.
 */
__global__ void add(int *a, int *b, int *c)
{
    const int block = blockIdx.x;
    if (block >= N)
        return;
    c[block] = a[block] + b[block];
}
/* Evaluates a CUDA runtime call; on failure reports it on stderr and makes main() return 1. */
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        cudaError_t status_ = (call);                                    \
        if (status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,     \
                    cudaGetErrorString(status_));                        \
            return 1;                                                    \
        }                                                                \
    } while (0)

/*
 * Vector sum on the GPU with N blocks of one thread each.
 * Fills a[i] = -i and b[i] = i*i, copies both to the device, launches add,
 * copies c back, and prints the clock() time spent on copies plus kernel.
 *
 * Returns 0 on success, 1 if any CUDA call fails (device memory still held
 * at that point is reclaimed when the process exits).
 */
int main(void)
{
    int a[N], b[N], c[N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = N * sizeof(int);

    CUDA_TRY(cudaMalloc((void**)&dev_a, bytes));
    CUDA_TRY(cudaMalloc((void**)&dev_b, bytes));
    CUDA_TRY(cudaMalloc((void**)&dev_c, bytes));

    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i * i;
    }

    clock_t begin = clock();
    CUDA_TRY(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));
    add<<<N,1>>>(dev_a, dev_b, dev_c);
    /* A launch returns no status; fetch launch-configuration errors explicitly. */
    CUDA_TRY(cudaGetLastError());
    /* Blocking copy: also waits for the kernel and surfaces its execution errors. */
    CUDA_TRY(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));
    clock_t end = clock();

    double runtime = ((double)(end - begin)) / CLOCKS_PER_SEC;
    printf("runtime = %lf seconds\n", runtime);
    for (int i = 0; i < N; i++)
    {
        printf("%d+%d=%d\n", a[i], b[i], c[i]);
    }

    CUDA_TRY(cudaFree(dev_a));
    CUDA_TRY(cudaFree(dev_b));
    CUDA_TRY(cudaFree(dev_c));
    return 0;
}
Output:
runtime = 0.000000 seconds
0+0=0
-1+1=0
-2+4=2
-3+9=6
-4+16=12
-5+25=20
-6+36=30
-7+49=42
-8+64=56
-9+81=72
Program #7: GPU Vector Sum using threads (one block, N threads)
Source Code:
#include<stdio.h>
#include<time.h>
#define N 10
/*
 * add - element-wise vector sum: c[tid] = a[tid] + b[tid] for tid < N.
 *
 * The element index is the global thread index. The previous version used
 * blockIdx.x alone, which is 0 for every thread of the <<<1,N>>> launch in
 * main(), so all threads wrote c[0] and c[1..N-1] were never computed.
 * With blockIdx.x * blockDim.x + threadIdx.x the kernel is correct for the
 * one-block/N-threads launch and for any other 1-D configuration.
 */
__global__ void add(int *a,int *b,int *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}
/* Evaluates a CUDA runtime call; on failure reports it on stderr and makes main() return 1. */
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        cudaError_t status_ = (call);                                    \
        if (status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,     \
                    cudaGetErrorString(status_));                        \
            return 1;                                                    \
        }                                                                \
    } while (0)

/*
 * Vector sum on the GPU launched as one block of N threads.
 * Fills a[i] = -i and b[i] = i*i, copies both to the device, launches add,
 * copies c back, and prints the clock() time spent on copies plus kernel.
 *
 * Returns 0 on success, 1 if any CUDA call fails (device memory still held
 * at that point is reclaimed when the process exits).
 */
int main(void)
{
    int a[N], b[N], c[N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = N * sizeof(int);

    CUDA_TRY(cudaMalloc((void**)&dev_a, bytes));
    CUDA_TRY(cudaMalloc((void**)&dev_b, bytes));
    CUDA_TRY(cudaMalloc((void**)&dev_c, bytes));

    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i * i;
    }

    clock_t begin = clock();
    CUDA_TRY(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));
    add<<<1,N>>>(dev_a, dev_b, dev_c);
    /* A launch returns no status; fetch launch-configuration errors explicitly. */
    CUDA_TRY(cudaGetLastError());
    /* Blocking copy: also waits for the kernel and surfaces its execution errors. */
    CUDA_TRY(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));
    clock_t end = clock();

    double runtime = ((double)(end - begin)) / CLOCKS_PER_SEC;
    printf("runtime = %lf seconds\n", runtime);
    for (int i = 0; i < N; i++)
    {
        printf("%d+%d=%d\n", a[i], b[i], c[i]);
    }

    CUDA_TRY(cudaFree(dev_a));
    CUDA_TRY(cudaFree(dev_b));
    CUDA_TRY(cudaFree(dev_c));
    return 0;
}
Output:
runtime = 0.000000 seconds
0+0=0
-1+1=0
-2+4=2
-3+9=6
-4+16=12
-5+25=20
-6+36=30
-7+49=42
-8+64=56
-9+81=72
Program #8: Block-Thread demo1
Source Code:
#include<stdio.h>
/*
 * demo - every thread writes its own global index into the matching slot:
 * a[g] = g, where g = blockIdx.x * blockDim.x + threadIdx.x.
 * There is no bounds check, so a must hold at least
 * gridDim.x * blockDim.x ints.
 */
__global__ void demo(int *a)
{
    const int firstInBlock = blockIdx.x * blockDim.x;
    const int global = firstInBlock + threadIdx.x;
    a[global] = global;
}
/* Evaluates a CUDA runtime call; on failure reports it on stderr and makes main() return 1. */
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        cudaError_t status_ = (call);                                    \
        if (status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,     \
                    cudaGetErrorString(status_));                        \
            return 1;                                                    \
        }                                                                \
    } while (0)

/*
 * Launches demo with 4 blocks of 3 threads over a 12-int device buffer,
 * copies the buffer back and prints the 12 values on one line.
 *
 * Returns 0 on success, 1 if any CUDA call fails.
 */
int main()
{
    int a[12];
    int *dev_a = NULL;
    const size_t bytes = sizeof a;  /* 12 * sizeof(int) */

    CUDA_TRY(cudaMalloc((void**)&dev_a, bytes));
    demo<<<4,3>>>(dev_a);
    /* A launch returns no status; fetch launch-configuration errors explicitly. */
    CUDA_TRY(cudaGetLastError());
    /* Pass the array itself (decays to int*) rather than its address. */
    CUDA_TRY(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < 12; i++)
        printf("%d ", a[i]);

    /* The device buffer was previously never released. */
    CUDA_TRY(cudaFree(dev_a));
    return 0;
}
Output:
0 1 2 3 4 5 6 7 8 9 10 11
Program #9: Block-Thread demo2
Source Code:
#include<stdio.h>
/*
 * demo - every thread stores the index of its block into the slot selected
 * by its global index g = blockIdx.x * blockDim.x + threadIdx.x.
 * There is no bounds check, so a must hold at least
 * gridDim.x * blockDim.x ints.
 */
__global__ void demo(int *a)
{
    const int block = blockIdx.x;
    const int global = block * blockDim.x + threadIdx.x;
    a[global] = block;
}
/* Evaluates a CUDA runtime call; on failure reports it on stderr and makes main() return 1. */
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        cudaError_t status_ = (call);                                    \
        if (status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,     \
                    cudaGetErrorString(status_));                        \
            return 1;                                                    \
        }                                                                \
    } while (0)

/*
 * Launches demo with 4 blocks of 3 threads over a 12-int device buffer,
 * copies the buffer back and prints the 12 values on one line.
 *
 * Returns 0 on success, 1 if any CUDA call fails.
 */
int main()
{
    int a[12];
    int *dev_a = NULL;
    const size_t bytes = sizeof a;  /* 12 * sizeof(int) */

    CUDA_TRY(cudaMalloc((void**)&dev_a, bytes));
    demo<<<4,3>>>(dev_a);
    /* A launch returns no status; fetch launch-configuration errors explicitly. */
    CUDA_TRY(cudaGetLastError());
    /* Pass the array itself (decays to int*) rather than its address. */
    CUDA_TRY(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < 12; i++)
        printf("%d ", a[i]);

    /* The device buffer was previously never released. */
    CUDA_TRY(cudaFree(dev_a));
    return 0;
}
Output:
0 0 0 1 1 1 2 2 2 3 3 3
Program #10: Block-Thread demo3
Source Code:
#include<stdio.h>
/*
 * demo - every thread stores its index within the block (threadIdx.x) into
 * the slot selected by its global index
 * g = blockIdx.x * blockDim.x + threadIdx.x.
 * There is no bounds check, so a must hold at least
 * gridDim.x * blockDim.x ints.
 */
__global__ void demo(int *a)
{
    const int lane = threadIdx.x;
    const int global = blockIdx.x * blockDim.x + lane;
    a[global] = lane;
}
/* Evaluates a CUDA runtime call; on failure reports it on stderr and makes main() return 1. */
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        cudaError_t status_ = (call);                                    \
        if (status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,     \
                    cudaGetErrorString(status_));                        \
            return 1;                                                    \
        }                                                                \
    } while (0)

/*
 * Launches demo with 4 blocks of 3 threads over a 12-int device buffer,
 * copies the buffer back and prints the 12 values on one line.
 *
 * Returns 0 on success, 1 if any CUDA call fails.
 */
int main()
{
    int a[12];
    int *dev_a = NULL;
    const size_t bytes = sizeof a;  /* 12 * sizeof(int) */

    CUDA_TRY(cudaMalloc((void**)&dev_a, bytes));
    demo<<<4,3>>>(dev_a);
    /* A launch returns no status; fetch launch-configuration errors explicitly. */
    CUDA_TRY(cudaGetLastError());
    /* Pass the array itself (decays to int*) rather than its address. */
    CUDA_TRY(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < 12; i++)
        printf("%d ", a[i]);

    /* The device buffer was previously never released. */
    CUDA_TRY(cudaFree(dev_a));
    return 0;
}
Output:
0 1 2 0 1 2 0 1 2 0 1 2
Program #11: Squaring each number in an array
Source Code:
#include <stdio.h>
#include <stdlib.h>
/*
 * square_array - replaces a[i] with a[i] * a[i] in place, where i is the
 * thread's global index blockIdx.x * blockDim.x + threadIdx.x.
 * Threads whose index is N or more leave the array untouched, so the grid
 * may contain more threads than elements.
 */
__global__ void square_array(float *a, int N)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N)
        return;
    const float value = a[i];
    a[i] = value * value;
}
/* Evaluates a CUDA runtime call; on failure reports it on stderr and makes main() return 1. */
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        cudaError_t status_ = (call);                                    \
        if (status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,     \
                    cudaGetErrorString(status_));                        \
            return 1;                                                    \
        }                                                                \
    } while (0)

/*
 * Squares the numbers 0..N-1 on the GPU and prints index and result.
 * The host array a_h is filled, copied to a_d, squared in place by
 * square_array, and copied back.
 *
 * Returns 0 on success, 1 if the host allocation or any CUDA call fails
 * (memory still held at that point is reclaimed when the process exits).
 */
int main(void)
{
    const int N = 10;
    const size_t size = N * sizeof(float);
    float *a_h = (float *)malloc(size);  /* host copy */
    float *a_d = NULL;                   /* device copy */

    if (a_h == NULL)
    {
        fprintf(stderr, "host allocation of %zu bytes failed\n", size);
        return 1;
    }
    CUDA_TRY(cudaMalloc((void **)&a_d, size));

    for (int i = 0; i < N; i++)
        a_h[i] = (float)i;
    CUDA_TRY(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice));

    const int block_size = 4;
    /* Ceiling division: enough blocks to cover N when it is not a multiple of block_size. */
    const int n_blocks = (N + block_size - 1) / block_size;
    square_array <<< n_blocks, block_size >>> (a_d, N);
    /* A launch returns no status; fetch launch-configuration errors explicitly. */
    CUDA_TRY(cudaGetLastError());

    /* Blocking copy: also waits for the kernel and surfaces its execution errors. */
    CUDA_TRY(cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost));
    for (int i = 0; i < N; i++)
        printf("%d %f\n", i, a_h[i]);

    free(a_h);
    CUDA_TRY(cudaFree(a_d));
    return 0;
}
Output:
0 0.000000
1 1.000000
2 4.000000
3 9.000000
4 16.000000
5 25.000000
6 36.000000
7 49.000000
8 64.000000
9 81.000000
Program #14: GPU Timer
#include <stdio.h>
#include "gputimer.h"
#define N 10
/*
 * add - each block handles the element whose index equals its blockIdx.x:
 * c[b] = a[b] + b_in[b]. Blocks whose index is N or more do nothing.
 * Only blockIdx.x is used for indexing, so every thread of a block
 * targets the same element.
 */
__global__ void add(int *a, int *b, int *c)
{
    const int block = blockIdx.x;
    if (block >= N)
        return;
    c[block] = a[block] + b[block];
}
/* Evaluates a CUDA runtime call; on failure reports it on stderr and makes main() return 1. */
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        cudaError_t status_ = (call);                                    \
        if (status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,     \
                    cudaGetErrorString(status_));                        \
            return 1;                                                    \
        }                                                                \
    } while (0)

/*
 * Vector sum on the GPU (N blocks of one thread each) where only the kernel
 * launch is bracketed by GpuTimer::Start()/Stop(); the host<->device copies
 * are outside the timed region.
 * NOTE(review): GpuTimer comes from gputimer.h, which is not shown here;
 * Elapsed() is printed as milliseconds on the assumption that this is its
 * unit - confirm against the header.
 *
 * Returns 0 on success, 1 if any CUDA call fails (device memory still held
 * at that point is reclaimed when the process exits).
 */
int main(void)
{
    GpuTimer timer;
    int a[N], b[N], c[N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = N * sizeof(int);

    CUDA_TRY(cudaMalloc((void**)&dev_a, bytes));
    CUDA_TRY(cudaMalloc((void**)&dev_b, bytes));
    CUDA_TRY(cudaMalloc((void**)&dev_c, bytes));

    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i * i;
    }

    CUDA_TRY(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    timer.Start();
    add<<<N,1>>>(dev_a, dev_b, dev_c);
    timer.Stop();
    /* Checked after Stop() so the error query stays outside the timed region. */
    CUDA_TRY(cudaGetLastError());

    /* Blocking copy: also waits for the kernel and surfaces its execution errors. */
    CUDA_TRY(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));
    printf("Time elapsed = %g ms\n", timer.Elapsed());
    for (int i = 0; i < N; i++)
    {
        printf("%d+%d=%d\n", a[i], b[i], c[i]);
    }

    CUDA_TRY(cudaFree(dev_a));
    CUDA_TRY(cudaFree(dev_b));
    CUDA_TRY(cudaFree(dev_c));
    return 0;
}
******** This page is under construction ********