# Numba CUDA examples: device arrays and a vector-add kernel

import numpy as np
from numba import cuda

# Move a NumPy array to the GPU ahead of time, so that later kernel launches
# can be handed the device array instead of a host array.
gpu_array = cuda.to_device(np.random.random(30_000_000))

# Copy the device array back into a new NumPy array on the CPU.
cpu_array = gpu_array.copy_to_host()


from numba import cuda


# CUDA kernel: element-wise vector addition.
@cuda.jit
def vector_add(r, x, y):
    """Write the element-wise sum ``x + y`` into ``r``.

    Each thread starts at its own absolute index in the 1-D grid and then
    advances by the total number of threads in the grid, so every index
    below ``len(r)`` is handled no matter how many threads were launched.

    Args:
        r: Output array; its length bounds the loop.
        x: First operand, indexed for every ``i < len(r)``.
        y: Second operand, indexed for every ``i < len(r)``.
    """
    thread_index = cuda.grid(1)
    grid_stride = cuda.gridsize(1)
    for idx in range(thread_index, len(r), grid_stride):
        r[idx] = x[idx] + y[idx]


# Build the inputs on the host, copy them to the device, and reserve a
# device-side output array modelled on x.
# NOTE(review): N evaluates to 20 elements; if a power of two such as
# 2 ** 10 was intended, confirm against the original example.
N = 2 * 10
x = cuda.to_device(np.arange(N))
y = cuda.to_device(2 * np.arange(N))
r = cuda.device_array_like(x)


# Launch configuration: 256 threads per block and enough blocks to span x.
# The "+ 1" rounds up (and guarantees at least one block); any surplus
# threads simply find no index below len(r) inside the kernel loop.
block_dim = 256
grid_dim = len(x) // block_dim + 1
vector_add[grid_dim, block_dim](r, x, y)


# Bring the computed sums back from the device into a host-side array.
result = r.copy_to_host()