reverse.h:
// Returns a pointer to a buffer of `len` ints holding the elements of `arr`
// in reverse order (element i of the result is arr[len - 1 - i]).
// NOTE(review): the declaration does not say how the returned buffer is
// allocated; confirm against the implementation before releasing it.
int * reverse(int * arr, int len);
reverse.cpp:
#include "reverse.h"
#include "cuda.h"
// File-level CUDA Driver API handles shared by init(), bye() and reverse().
CUdevice cuDevice;      // device handle filled in by cuDeviceGet(…, 0) in init()
CUcontext cuContext;    // context created in init() and destroyed in bye()
CUmodule cuModule;      // module loaded from "reverse.ptx" in init()
CUfunction cuFunction;  // "reverse" kernel looked up in init(), launched by reverse()
CUresult res;           // file-level CUresult (driver status code) variable
void init(){
cuInit(0);
cuDeviceGet(&cuDevice, 0);
cuCtxCreate(&cuContext, 0, cuDevice);
cuModuleLoad(&cuModule, "reverse.ptx");
cuModuleGetFunction(&cuFunction, cuModule, "reverse");
}
void bye(){
cuCtxDestroy(cuContext);
}
int * reverse(int * arr, int len) {
CUdeviceptr d_arr;
init();
if (len < 1024 * 1024) {
int * result = new int[len];
for (int i = 0; i < len; ++i) {
result[i] = arr[len - 1 - i];
}
return result;
}
void * result;
cuMemHostRegister(arr, len * sizeof(int), CU_MEMHOSTREGISTER_DEVICEMAP);
cuMemAllocHost(&result, len * sizeof(int));
cuMemHostGetDevicePointer(&d_arr, arr, 0);
void * args[3] = {&d_arr, &len, &result};
cuLaunchKernel(cuFunction, 1024, 1, 1, 1024, 1, 1, 0, 0, args, 0);
cuCtxSynchronize();
bye();
return (int*) result;
}
reverse.cu:
#include "reverse.h"
#include "cuda.h"
#include <algorithm>
#include <cstdio>
extern "C"{
// Kernel: stores arr reversed into result, i.e. result[len - 1 - i] = arr[i]
// for every i in [0, len).
//
// Expects a 1-D launch. Each thread starts at its global index and advances
// by the total number of launched threads (gridDim.x * blockDim.x), so the
// whole array is covered for any 1-D grid/block shape; with the 1024 x 1024
// launch used by the host code this is the same stride of 1024 * 1024 as
// before. The index is 64-bit so that adding the stride cannot overflow when
// len is close to INT_MAX.
// Threads read arr and write result without synchronization, so the two
// buffers must not overlap.
__global__
void reverse(int * arr, int len, int * result) {
    const long long stride = (long long) gridDim.x * blockDim.x;
    long long id = (long long) blockIdx.x * blockDim.x + threadIdx.x;
    while (id < len) {
        result[len - 1 - id] = arr[id];
        id += stride;
    }
}
}