gcd.cu:
#include "gcd.cpp"
extern "C" {
__device__
para::para(short x, short y) : x(x), y(y) {}
__device__
void euklid(short const& A, short const& B, para * result_ptr){
short ax[] = {1, 0}, ay[] = {0, 1}, aC[] = {A, B}, i = 0, div;
while(aC[i^1]!=0){
div = aC[i]/aC[i^1];
ax[i] = ax[i] - ax[i^1]*div;
ay[i] = ay[i] - ay[i^1]*div;
aC[i] = aC[i] - aC[i^1]*div;
i ^= 1;
}
*result_ptr = para(ax[i], ay[i]);
}
__global__
void compute_euklid(int n, int m, para * result){
int idx = blockIdx.x*PAR*PAR + threadIdx.x*PAR + threadIdx.y;
if(idx<n*m){
euklid(idx/m + 1, idx%m + 1, result + idx);
}
}
}
gcd.cpp:
#include<cstdio>
#include<algorithm>
using namespace std;
#include "cuda.h"
#define PAR 32
#define MAX_N 20000
struct para {
para() {}
para(short x, short y);
short x, y;
};
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction cuFunction;
CUdeviceptr d_result;
para * h_result;
CUresult res;
void init(){
cuInit(0);
cuDeviceGet(&cuDevice, 0);
cuCtxCreate(&cuContext, 0, cuDevice);
cuModuleLoad(&cuModule, "gcd.ptx");
cuModuleGetFunction(&cuFunction, cuModule, "compute_euklid");
}
void aloc(int n, int m){
cuMemAlloc(&d_result, n*m*sizeof(para));
cuMemAllocHost((void **)&h_result, n*m*sizeof(para));
}
void bye(){
cuCtxDestroy(cuContext);
}
void * args[3];
int main(){
init();
int n, m;
scanf("%d%d", &n, &m);
aloc(n, m);
args[0] = &n;
args[1] = &m;
args[2] = &d_result;
res = cuLaunchKernel(cuFunction, (n*m + PAR*PAR - 1)/PAR/PAR, 1, 1, PAR, PAR, 1, 0, 0, args, 0);
cuMemcpyDtoH(h_result, d_result, n*m*sizeof(para));
for(int i=0; i<n; ++i)
for(int j=0; j<m; ++j){
int idx = i*m + j;
printf("%d %d %d %d %d\n",
i + 1,
j + 1,
h_result[idx].x,
h_result[idx].y,
(i+1) * h_result[idx].x + (j+1) * h_result[idx].y);
}
bye();
return 0;
}