Timing your CUDA C code You can use the following files:
cuTimer.cu cuTimer.h or the Read Time Stamped Counter (RDTSC), which counts number of processor cycles (you'll have to convert it to seconds depending on your processor's clock):
__asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
Timing your PyCuda code The following is a crude way to time your kernel in PyCuda using python's timeit module:
The Python way:
import timeit t1=timeit.time.time() # execute the code you want to time here t2=timeit.time.time() elapsed=t2-t1 print "Execution took %f seconds" % elapsed
NOTE: You may want to convert from seconds to milli/micro/etc.
The PyCuda way (for kernel launches):
mod = drv.SourceModule(""" <your kernel code here> """) func = mod.get_function("<name of your kernel function here>") elapsed = func(<your parameters>, time_kernel=True)
more info in the doc
NOTE: You can also use streams
CUDA 3D Texture Example (Gerald Dalley, MIT)
/********************************************************************
* cudaWrapper.cu
* Shows a few ways of upscaling an RGB image.
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <iostream>
#include <cuda_runtime.h>
#include <cutil.h>
#include <cutil_inline.h>
#include "imageio.h"
#include "cudaWrapper.h"
#include "timer.h"
using namespace std;
/************************************************************************/
/* Init CUDA */
/************************************************************************/
#if __DEVICE_EMULATION__
bool InitCUDA(void){return true;} #else
bool InitCUDA(void) { int count = 0; int i = 0;
cudaGetDeviceCount(&count); if(count == 0) { cerr << "There is no device." << endl; return false; }
for(i = 0; i < count; i++) { cudaDeviceProp prop; if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) { if(prop.major >= 1) { break; } } } if(i == count) { cerr << "There is no device supporting CUDA." << endl; return false; } cudaSetDevice(i);
cout << "CUDA initialized." << endl; return true; } #endif
//////////////////////////////////////////////////////////////////////////
// Nearest-neighbor resampling
//////////////////////////////////////////////////////////////////////////
__global__ static void CudaResize(unsigned char *dO, int wO, int hO, unsigned char const *dI, int wI, int hI) { int const xO = blockIdx.x*blockDim.x + threadIdx.x; int const yO = blockIdx.y*blockDim.y + threadIdx.y; int const xI = (int)(xO * (float)wI / wO); int const yI = (int)(yO * (float)hI / hO); dO[xO*3 + yO*3*wO ] = dI[xI*3 + yI*3*wI ]; dO[xO*3 + yO*3*wO + 1] = dI[xI*3 + yI*3*wI + 1]; dO[xO*3 + yO*3*wO + 2] = dI[xI*3 + yI*3*wI + 2]; }
void cudaResize(unsigned char *hOut, int wO, int hO, unsigned char const *hIn, int wI, int hI) { unsigned char *dIn = NULL; CUDA_SAFE_CALL(cudaMalloc((void**)&dIn, wI*hI*3)); CUDA_SAFE_CALL(cudaMemcpy(dIn, hIn, wI*hI*3, cudaMemcpyHostToDevice));
float const start = cuGetTimer(); unsigned char *dOut = NULL; CUDA_SAFE_CALL(cudaMalloc((void**)&dOut, wO*hO*3));
dim3 const block(16, 2, 1); dim3 const grid(wO/block.x, hO/block.y, 1); CudaResize<<<grid,block,0>>>(dOut,wO,hO, dIn,wI,hI); CUT_CHECK_ERROR("resample kernel execution failed\n");
CUDA_SAFE_CALL(cudaMemcpy(hOut, dOut, wO*hO*3, cudaMemcpyDeviceToHost)); cout << "...inner call time: " << (cuGetTimer() - start) << " (ms)" << endl; CUDA_SAFE_CALL(cudaFree(dOut)); CUDA_SAFE_CALL(cudaFree(dIn)); }
//////////////////////////////////////////////////////////////////////////
// Bilinear interpolation for upscaling, using 3D textures
//////////////////////////////////////////////////////////////////////////
#define NCHANNELS 3
// Currently, CUDA requires all textures to be file-scoped
texture<float, NCHANNELS, cudaReadModeElementType> texIn;
// Bilinear interpolation using a CUDA texture. Converts an input
// float texture to an output C-style packed multichannel image.
__global__ static void CudaBilinearInterpTex(unsigned char *dO, int wO, int hO, int wI, int hI) { // Output coordinates
int const xO = blockIdx.x*blockDim.x + threadIdx.x; int const yO = blockIdx.y*blockDim.y + threadIdx.y; int const zO = threadIdx.z; if (xO < wO && yO < hO) { int idx = yO*wO*NCHANNELS + xO*NCHANNELS + zO; // The tex3D lookup does the trilinear interpolation for us
// automatically (because we told it to do so via
// texIn.filterMode). The 0.5f offsets for x and y are partially
// personal choice. Read section D.2 in Appendix D of the CUDA
// Programming Guide (version 2.0). The 0.5f offset is required
// for the z coordinate if we interpret z as the color channel (to
// prevent interpolation between color channels).
dO[idx] = (uint8)(255 * tex3D(texIn, float(xO)*wI/wO+0.5f, float(yO)*hI/hO+0.5f, zO+0.5f)); } }
// Uses bilinear interpolation to resize the input image. We expect
// the input image to be laid out like a stanard C-style packed
// multichannel image. Because textures must be allocated at the file
// scope, this function only works with 3-channel images right now
// (e.g. RGB images).
//
// Note that a much easier (and faster?) implementation could be
// written with a texture<float4, 1, cudaReadModeElementType>, but it
// would waste memory since we only need 3 channels, not 4.
void cudaBilinearInterpTex(unsigned char *hOut, int wO, int hO, unsigned char const *hIn, int wI, int hI) { // Transfer the input image to a device texture (yes, this is
// ridiculously complicated). Sources:
// * http://forums.nvidia.com/index.php?act=Print&client=printer&f=71&t=74321
// * The simpleTexture3D example in the SDK
// First, we must allocate the CUDA 3D array that will hold the
// texture on the device.
cudaArray *dIn = NULL; cudaChannelFormatDesc floatTex = cudaCreateChannelDesc<float>(); int const pwI = wI; int const phI = hI; int const pdI = NCHANNELS; cudaExtent const ext = {pwI, phI, pdI}; CUDA_SAFE_CALL(cudaMalloc3DArray(&dIn, &floatTex, ext));
// We need to copy the image data to the CUDA 3D array on the
// device. There are two important issues here.
//
// First, we want to permute some of the array dimensions so that we
// can have threadIdx.x, threadIdx.y and threadIdx.z correspond to
// the texture x, y, and z coordinates, where z indexes the color
// channel. We do this to preserve our sanity and to also work well
// with the severe limits on blockDim.z's maximum value.
//
// Second, our input data is an unsigned char image, but the texture
// unit requires float textures when doing linear interpolation.
// So, we must do a type conversion too.
//
// One might consider copying the original unsigned array to the
// device and then let the device do the permutation and/or type
// conversion. Doing the permutation on the device would probably
// run into a lot of coalescing conflicts. A Core i7 Extreme 965
// has a 73ns random access memory latency. An 576MHz GTX 260 with
// 300-cycle latency suffers a 521ns latency (300/.576). Given
// these numbers, we suspect it's faster to do the permutation on
// the CPU (but this is untested). Secondly, textures are read-only
// on the device side. It might be possible to have one kernel fill
// the CUDA array, then have the CPU bind the array to the texture,
// then switch back to the current interpolation kernel. We have
// not tested that approach.
float const fillValueForDebugging = 0.25f; vector<float> hInV(pwI*phI*pdI, fillValueForDebugging); { // value scale: unsigned chars are [0...255]. float textures must
// be in [0,1)
float const s = 1/255.f; for (int y=0; y<hI; y++) { for (int x=0; x<wI; x++) { for (int c=0; c<NCHANNELS; c++) { // Note the different indexing: this permutes the dimensions
// so that we can map well to the grid and block
// architecture of current CUDA devices.
hInV[c*pwI*hI + y*pwI + x] =
hIn[y*wI*NCHANNELS + x*NCHANNELS + c] * s;
}
}
}
}
cudaMemcpy3DParms copyParams = {0}; copyParams.extent = make_cudaExtent(pwI, phI, pdI); copyParams.kind = cudaMemcpyHostToDevice; copyParams.dstArray = dIn; // The pitched pointer is really tricky to get right. We give the
// pitch of a row, then the number of elements in a row, then the
// height, and we omit the 3rd dimension.
copyParams.srcPtr = make_cudaPitchedPtr(
(void*)&hInV[0], ext.width*sizeof(float), ext.width, ext.height); CUDA_SAFE_CALL(cudaMemcpy3D(©Params));
// Now, we're ready to turn the CUDA 3D array into a texture.
CUDA_SAFE_CALL(cudaBindTextureToArray(texIn, dIn, floatTex));
// Customize the texture to do trilinear interpolation. Change this
// to cudaFilterModePoint for nearest neighbor sampling.
texIn.filterMode = cudaFilterModeLinear;
for (int dim=0; dim<3; dim++) { texIn.addressMode[dim] = cudaAddressModeClamp; }
// Setup and execute the interpolation kernel
float const start = cuGetTimer(); unsigned char *dOut = NULL; // Create device space for the interpolated image
CUDA_SAFE_CALL(cudaMalloc((void**)&dOut, wO*hO*NCHANNELS)); // Now call the kernel
dim3 const block(16, 2, NCHANNELS); dim3 const grid(int(ceil(float(wO)/block.x)), int(ceil(float(hO/block.y))), 1); CudaBilinearInterpTex<<<grid,block>>>(dOut,wO,hO, wI,hI); CUT_CHECK_ERROR("bilinear interpolation kernel execution failed\n");
// Get the data out and clean up
CUDA_SAFE_CALL(cudaMemcpy(hOut, dOut, wO*hO*NCHANNELS,
cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(dOut));
cout << "...inner call time: " << (cuGetTimer() - start) << " (ms)" << endl; CUDA_SAFE_CALL(cudaUnbindTexture(texIn)); CUDA_SAFE_CALL(cudaFreeArray(dIn)); }
|
Attachments (2)
-
cuTimer.cu - on Jan 17, 2009 9:18 PM by Nicolas Pinto (version 1)
1k
Download
-
cuTimer.h - on Jan 17, 2009 9:18 PM by Nicolas Pinto (version 1)
1k
Download
|