手持ちの計算機の速度向上のために,とうとうCUDAに手を出してしまいました.
アーキテクチャ固有(多数の core と小さなローカルメモリー)のプログラムを書くのに抵抗があったのですが, 良く考えたら昔も FACOM VP-200 用や Intel PARAGON 用プログラムとかを書いてました. イマドキの超並列に挑戦.
machine spec.
bandwidthTest:
Device 0: GeForce GT 240 Quick Mode Host to Device Bandwidth, 1 Device(s), Paged memory Transfer Size (Bytes) Bandwidth(MB/s) 33554432 1744.5 Device to Host Bandwidth, 1 Device(s), Paged memory Transfer Size (Bytes) Bandwidth(MB/s) 33554432 1210.4 Device to Device Bandwidth, 1 Device(s) Transfer Size (Bytes) Bandwidth(MB/s) 33554432 8242.4// CUDA#ifdef USE_CUDA_CONSTANT_MEMORY__constant__ float d_u1[IMAX*JMAX];__global__ void wave2dKernel( float* d_u, int ixmax, int iymax)#else__global__ void wave2dKernel( float* d_u, float* d_u1, int ixmax, int iymax)#endif /* USE_CUDA_CONSTANT_MEMORY */{ const unsigned int tid = threadIdx.x; const unsigned int bid = blockIdx.x; const unsigned int bdim = blockDim.x; const unsigned int gdim = gridDim.x; int step=bdim*gdim; int num=ixmax*iymax; int id_x, id_xm, id_xp, id_y, id_ym, id_yp; const float dx=(XMAX-XMIN)/(float)IMAX; const float dy=(YMAX-YMIN)/(float)JMAX; const float cvx = V00 * V00 * DT * DT/dx/dx; const float cvy = V00 * V00 * DT * DT/dy/dy; for (int id=bid * bdim + tid;id<num;id+=step) { id_x = id % ixmax; id_y = id/ixmax; if(id_x == 0) id_xm = id-1+ixmax; else id_xm = id - 1; if(id_x == ixmax-1) id_xp = id - ixmax; else id_xp = id + 1; if(id_y == 0) id_ym = id - ixmax + iymax*ixmax; else id_ym = id - ixmax; if(id_y == iymax-1) id_yp = id + ixmax - iymax*ixmax; else id_yp = id + ixmax;#if BND_MODE==1 /* Dirichlet */ if(id_x==0 || id_x==ixmax-1 || id_y==0 || id_y==iymax-1) d_u[id] = 0.0; else#endif /* BND_MODE */ d_u[id] = 2.0 * d_u1[id] - d_u[id] + cvx * (d_u1[id_xm] - 2.0 * d_u1[id] + d_u1[id_xp]) + cvy * (d_u1[id_ym] - 2.0 * d_u1[id] + d_u1[id_yp]); }}難点:
updates: