CUDA TEST01 (wave2d)

手持ちの計算機の速度向上のために，とうとうCUDAに手を出してしまいました．

アーキテクチャ固有(多数の core と小さなローカルメモリー)のプログラムを書くのに抵抗があったのですが，良く考えたら昔も FACOM VP-200 用や Intel PARAGON 用プログラムとかを書いてました．イマドキの超並列に挑戦．

machine spec.

bandwidthTest:

Device 0: GeForce GT 240

 Quick Mode

 Host to Device Bandwidth, 1 Device(s), Paged memory

   Transfer Size (Bytes)    Bandwidth(MB/s)

   33554432            1744.5

 Device to Host Bandwidth, 1 Device(s), Paged memory

   Transfer Size (Bytes)    Bandwidth(MB/s)

   33554432            1210.4

 Device to Device Bandwidth, 1 Device(s)

   Transfer Size (Bytes)    Bandwidth(MB/s)

   33554432            8242.4

2次元波動方程式 (explicit, LeapFrog) + OpenGLコードで速度をテスト
1. もともとは OpenCampus のデモ用に MPICH で並列計算+並列描画プログラムとして作成しました．
2. results:
  1. ＊理論上は，速度的には constant > global >> No CUDA のはずなのですが？たいしてチューンナップしなくても3倍になったから良しと。。。
3. GPUでの部分は定石どおり

// CUDA

/*
 * wave2dKernel -- one explicit leapfrog time step of the 2-D wave equation.
 *
 * The field is stored row-major as a flat array: cell (x, y) lives at index
 * y*ixmax + x.
 *
 *   d_u   : in/out. On entry it is read as the field two steps back; each
 *           element is overwritten with the newly computed value.
 *   d_u1  : read-only field of the current step. It is a kernel argument in
 *           the default build and a __constant__ array of IMAX*JMAX floats
 *           when USE_CUDA_CONSTANT_MEMORY is defined.
 *   ixmax : number of cells per row.
 *   iymax : number of rows.
 *
 * Launch layout: only the .x components of threadIdx/blockIdx/blockDim/gridDim
 * are used. Each thread walks the flat index range in a grid-stride loop, so
 * any 1-D grid/block size covers all ixmax*iymax cells.
 *
 * Boundaries: with BND_MODE==1 every edge cell is forced to zero (Dirichlet);
 * otherwise the neighbour indices wrap around, i.e. the domain is periodic.
 *
 * The grid spacings are derived from the compile-time IMAX/JMAX, not from the
 * ixmax/iymax arguments.
 */
#ifdef USE_CUDA_CONSTANT_MEMORY
__constant__ float d_u1[IMAX*JMAX];

__global__ void wave2dKernel( float* d_u, int ixmax, int iymax)
#else
__global__ void wave2dKernel( float* d_u, float* d_u1, int ixmax, int iymax)
#endif /* USE_CUDA_CONSTANT_MEMORY */
{
    const unsigned int tid = threadIdx.x;
    const unsigned int bid = blockIdx.x;
    const unsigned int bdim = blockDim.x;
    const unsigned int gdim = gridDim.x;

    /* Total number of threads in the grid = stride of the grid-stride loop. */
    const int step = bdim * gdim;
    const int num = ixmax * iymax;

    const float dx = (XMAX-XMIN)/(float)IMAX;
    const float dy = (YMAX-YMIN)/(float)JMAX;

    /* Squared Courant-type coefficients (V00*DT/dx)^2 and (V00*DT/dy)^2. */
    const float cvx = V00 * V00 * DT * DT/dx/dx;
    const float cvy = V00 * V00 * DT * DT/dy/dy;

    for (int id = bid * bdim + tid; id < num; id += step)
    {
        const int id_x = id % ixmax;
        const int id_y = id / ixmax;

#if BND_MODE==1
        /* Dirichlet: edge cells are pinned to zero and skip the stencil. */
        if (id_x == 0 || id_x == ixmax-1 || id_y == 0 || id_y == iymax-1)
        {
            d_u[id] = 0.0f;
            continue;
        }
#endif /* BND_MODE */

        /* Periodic neighbour indices. Wrapping in x must stay inside the
           same row: the right neighbour of the last column is the first
           column of that row, i.e. id - (ixmax - 1). */
        const int id_xm = (id_x == 0)       ? id - 1 + ixmax : id - 1;
        const int id_xp = (id_x == ixmax-1) ? id + 1 - ixmax : id + 1;
        const int id_ym = (id_y == 0)       ? id - ixmax + iymax*ixmax : id - ixmax;
        const int id_yp = (id_y == iymax-1) ? id + ixmax - iymax*ixmax : id + ixmax;

        /* Leapfrog update; float literals keep the arithmetic in single
           precision. */
        d_u[id] = 2.0f * d_u1[id] - d_u[id]
                + cvx * (d_u1[id_xm] - 2.0f * d_u1[id] + d_u1[id_xp])
                + cvy * (d_u1[id_ym] - 2.0f * d_u1[id] + d_u1[id_yp]);
    }
}

難点:

updates:

Google Sites

Report abuse