Timing your CUDA C code

You can use a standard host timer such as `gettimeofday()`, or the Read Time Stamp Counter (RDTSC), which counts the number of processor cycles (you'll have to convert it to seconds depending on your processor's clock rate):

__asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))

Timing your PyCuda code

The following is a crude way to time your kernel in PyCuda using python's timeit module:

The Python way:

import timeit
start = timeit.default_timer()
# execute the code you want to time here
elapsed = timeit.default_timer() - start
print("Execution took %f seconds" % elapsed)

NOTE: You may want to convert from seconds to milli/micro/etc.

The PyCuda way (for kernel launches):

mod = drv.SourceModule(""" <your kernel code here> """)
func = mod.get_function("<name of your kernel function here>")
elapsed = func(<your parameters>, time_kernel=True)

More info in the PyCUDA documentation.

NOTE: You can also use streams

CUDA 3D Texture Example (Gerald Dalley, MIT)

 * Shows a few ways of upscaling an RGB image.

#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <iostream>

#include <cuda_runtime.h>
#include <cutil.h>
#include <cutil_inline.h>

#include "imageio.h"
#include "cudaWrapper.h"
#include "timer.h"

using namespace std;

/* Init CUDA */
#if __DEVICE_EMULATION__
// Device-emulation builds have no real device to probe.
bool InitCUDA(void) { return true; }
#else
// Probes for a CUDA-capable device (compute capability >= 1.0) and
// selects the first one found.  Prints a diagnostic and returns false
// when no usable device exists.
// NOTE(review): the original extraction lost the `#if` condition, the
// cudaGetDeviceCount call, and most braces; reconstructed from the
// standard CUDA-wizard init pattern the visible fragments match.
bool InitCUDA(void)
{
    int count = 0;
    int i = 0;

    cudaGetDeviceCount(&count);
    if (count == 0) {
        cerr << "There is no device." << endl;
        return false;
    }

    // Find the first device with compute capability >= 1.0.
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        cerr << "There is no device supporting CUDA." << endl;
        return false;
    }
    cudaSetDevice(i);

    cout << "CUDA initialized." << endl;
    return true;
}
#endif

//////////////////////////////////////////////////////////////////////////
// Nearest-neighbor resampling
//////////////////////////////////////////////////////////////////////////

// Nearest-neighbor resize of a packed 3-channel (RGB) image.
// One thread per output pixel; the host launcher covers (wO, hO) exactly.
// NOTE(review): there is no bounds check on (xO, yO) — this assumes wO/hO
// are exact multiples of the block dimensions, as cudaResize() arranges.
__global__ static void CudaResize(unsigned char *dO, int wO, int hO,
                                  unsigned char const *dI, int wI, int hI)
{
    int const xO = blockIdx.x*blockDim.x + threadIdx.x;
    int const yO = blockIdx.y*blockDim.y + threadIdx.y;
    // Nearest source pixel for this output pixel.
    int const xI = (int)(xO * (float)wI / wO);
    int const yI = (int)(yO * (float)hI / hO);
    dO[xO*3 + yO*3*wO    ] = dI[xI*3 + yI*3*wI    ];
    dO[xO*3 + yO*3*wO + 1] = dI[xI*3 + yI*3*wI + 1];
    dO[xO*3 + yO*3*wO + 2] = dI[xI*3 + yI*3*wI + 2];
}

// Host wrapper for CudaResize: nearest-neighbor resize of a packed RGB image.
//   hOut: host output buffer, wO*hO*3 bytes (written)
//   hIn:  host input buffer,  wI*hI*3 bytes (read)
// Copies the input to the device, launches the kernel, copies the result
// back, and prints the elapsed time in milliseconds.
// Precondition: wO and hO must be multiples of the block dims (16 and 2) —
// the kernel has no bounds check.
void cudaResize(unsigned char *hOut, int wO, int hO,
                unsigned char const *hIn, int wI, int hI)
{
    unsigned char *dIn = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dIn, wI*hI*3));
    CUDA_SAFE_CALL(cudaMemcpy(dIn, hIn, wI*hI*3, cudaMemcpyHostToDevice));

    float const start = cuGetTimer();
    unsigned char *dOut = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dOut, wO*hO*3));

    dim3 const block(16, 2, 1);
    dim3 const grid(wO/block.x, hO/block.y, 1);
    CudaResize<<<grid,block,0>>>(dOut,wO,hO, dIn,wI,hI);
    CUT_CHECK_ERROR("resample kernel execution failed\n");

    // The blocking memcpy also synchronizes with the kernel launch.
    CUDA_SAFE_CALL(cudaMemcpy(hOut, dOut, wO*hO*3, cudaMemcpyDeviceToHost));
    cout << "...inner call time: " << (cuGetTimer() - start) << " (ms)" << endl;

    // Release device buffers (the original leaked both allocations).
    CUDA_SAFE_CALL(cudaFree(dOut));
    CUDA_SAFE_CALL(cudaFree(dIn));
}

////////////////////////////////////////////////////////////////////////// // Bilinear interpolation for upscaling, using 3D textures ////////////////////////////////////////////////////////////////////////// #define NCHANNELS 3 // Currently, CUDA requires all textures to be file-scoped texture<float, NCHANNELS, cudaReadModeElementType> texIn;

// Bilinear interpolation using a CUDA texture.  Converts an input
// float texture to an output C-style packed multichannel image.
// Launch with blockDim.z == NCHANNELS so threadIdx.z selects the
// color channel.
__global__ static void CudaBilinearInterpTex(unsigned char *dO, int wO, int hO,
                                             int wI, int hI)
{
    // Output coordinates
    int const xO = blockIdx.x*blockDim.x + threadIdx.x;
    int const yO = blockIdx.y*blockDim.y + threadIdx.y;
    int const zO = threadIdx.z;
    if (xO < wO && yO < hO) {
        int idx = yO*wO*NCHANNELS + xO*NCHANNELS + zO;
        // The tex3D lookup does the trilinear interpolation for us
        // automatically (because we told it to do so via
        // texIn.filterMode).  The 0.5f offsets for x and y are partially
        // personal choice.  Read section D.2 in Appendix D of the CUDA
        // Programming Guide (version 2.0).  The 0.5f offset is required
        // for the z coordinate if we interpret z as the color channel (to
        // prevent interpolation between color channels).
        // NOTE(review): the tex3D argument list was truncated in the
        // recovered source; the coordinates below follow the comment's
        // description — confirm against the original example.
        dO[idx] = (uint8)(255 * tex3D(texIn,
                                      xO * (float)wI / wO + 0.5f,
                                      yO * (float)hI / hO + 0.5f,
                                      zO + 0.5f));
    }
}

// Uses bilinear interpolation to resize the input image.  We expect
// the input image to be laid out like a standard C-style packed
// multichannel image.  Because textures must be allocated at the file
// scope, this function only works with 3-channel images right now
// (e.g. RGB images).
//
// Note that a much easier (and faster?) implementation could be
// written with a texture<float4, 1, cudaReadModeElementType>, but it
// would waste memory since we only need 3 channels, not 4.
void cudaBilinearInterpTex(unsigned char *hOut, int wO, int hO,
                           unsigned char const *hIn, int wI, int hI)
{
    // Transfer the input image to a device texture (yes, this is
    // ridiculously complicated).  See the simpleTexture3D example in the
    // CUDA SDK.  First, allocate the CUDA 3D array that will hold the
    // texture on the device.
    cudaArray *dIn = NULL;
    cudaChannelFormatDesc floatTex = cudaCreateChannelDesc<float>();
    int const pwI = wI;
    int const phI = hI;
    int const pdI = NCHANNELS;
    cudaExtent const ext = {pwI, phI, pdI};
    CUDA_SAFE_CALL(cudaMalloc3DArray(&dIn, &floatTex, ext));

    // Copy the image data into the CUDA 3D array.  Two issues:
    //
    // First, we permute the dimensions so threadIdx.x/y/z map to the
    // texture x, y, and z coordinates, where z indexes the color
    // channel — this works well with the severe limit on blockDim.z.
    //
    // Second, the input is an unsigned char image, but the texture unit
    // requires float textures when doing linear interpolation, so we
    // convert on the CPU (doing the permutation on the device would
    // likely hit coalescing conflicts, and textures are read-only on
    // the device side anyway).
    float const fillValueForDebugging = 0.25f;
    vector<float> hInV(pwI*phI*pdI, fillValueForDebugging);
    // Value scale: unsigned chars are [0...255]; float textures must
    // be in [0,1).
    float const s = 1/255.f;
    for (int y=0; y<hI; y++) {
        for (int x=0; x<wI; x++) {
            for (int c=0; c<NCHANNELS; c++) {
                // Note the different indexing: this permutes the
                // dimensions so that we map well to the grid/block
                // architecture of current CUDA devices.
                hInV[c*pwI*hI + y*pwI + x] =
                    hIn[y*wI*NCHANNELS + x*NCHANNELS + c] * s;
            }
        }
    }
    cudaMemcpy3DParms copyParams = {0};
    copyParams.extent = make_cudaExtent(pwI, phI, pdI);
    copyParams.kind = cudaMemcpyHostToDevice;
    copyParams.dstArray = dIn;
    // The pitched pointer is tricky: give the pitch of a row, then the
    // number of elements in a row, then the height; omit the 3rd dim.
    copyParams.srcPtr = make_cudaPitchedPtr((void*)&hInV[0],
                                            ext.width*sizeof(float),
                                            ext.width, ext.height);
    // NOTE(review): the recovered source set up copyParams but the
    // actual copy call was lost in extraction; without it the texture
    // is never filled.
    CUDA_SAFE_CALL(cudaMemcpy3D(&copyParams));

    // Customize the texture *before* binding (as in simpleTexture3D):
    // trilinear interpolation (use cudaFilterModePoint for nearest
    // neighbor) and clamped addressing on all three axes.
    texIn.filterMode = cudaFilterModeLinear;
    for (int dim=0; dim<3; dim++) {
        texIn.addressMode[dim] = cudaAddressModeClamp;
    }
    // Now turn the CUDA 3D array into a texture.
    CUDA_SAFE_CALL(cudaBindTextureToArray(texIn, dIn, floatTex));

    // Setup and execute the interpolation kernel.
    float const start = cuGetTimer();
    // Create device space for the interpolated image.
    unsigned char *dOut = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dOut, wO*hO*NCHANNELS));
    dim3 const block(16, 2, NCHANNELS);
    // BUG FIX: the original computed float(hO/block.y) — integer
    // division before the float conversion, losing the remainder and
    // dropping the last partial row of blocks.
    dim3 const grid(int(ceil(float(wO)/block.x)),
                    int(ceil(float(hO)/block.y)), 1);
    CudaBilinearInterpTex<<<grid,block>>>(dOut,wO,hO, wI,hI);
    CUT_CHECK_ERROR("bilinear interpolation kernel execution failed\n");

    // Get the data out and clean up (the blocking memcpy synchronizes
    // with the kernel launch).
    CUDA_SAFE_CALL(cudaMemcpy(hOut, dOut, wO*hO*NCHANNELS,
                              cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(dOut));
    cout << "...inner call time: " << (cuGetTimer() - start) << " (ms)" << endl;

    // Release the texture binding and the 3D array (leaked originally).
    CUDA_SAFE_CALL(cudaUnbindTexture(texIn));
    CUDA_SAFE_CALL(cudaFreeArray(dIn));
}

Nicolas Pinto,
Jan 17, 2009, 9:18 PM
Nicolas Pinto,
Jan 17, 2009, 9:18 PM