// CudaErrorFunctions.cu
/*
libcudann
Copyright (C) 2011 Luca Donati (lucadonati85@gmail.com)
*/
/*
* CudaErrorFunctions.cu
*
* Created on: Jan 10, 2011
* Author: donati
*/
#include "CudaErrorFunctions.cuh"
#include <stdlib.h>
#include <stdio.h>
#include <cublas.h>
#define BLOCKSIZE 512
#define WARP_SIZE 32
// Per-element error kernel: for each of (number) elements, scales the
// difference a[i]-b[i] by the activation-function span and applies the
// selected error function, storing the result in c[i].
// Expected launch: 1D grid of BLOCKSIZE-thread blocks covering (number) elements.
__global__ void error(float * c, const float * a, const float * b, const int number, const int actFunc, const int errorFunc){
    // flat global index of this thread
    const int idx = blockIdx.x * BLOCKSIZE + threadIdx.x;
    if (idx >= number)
        return;  // tail-block guard: grid may overshoot (number)
    // raw difference scaled by the activation span
    const float diff = (a[idx] - b[idx]) / spanS(actFunc);
    c[idx] = calcErr(diff, errorFunc);
}
//computes the error function for (number) elements of (a)-(b) and stores the results in (c)
//(a), (b), (c) are device pointers; the launch is asynchronous on the default stream
void computeError(float * c, const float * a, const float * b, const int number, const int actFunc, const int errorFunc){
    // ceil-div grid size: the previous number/BLOCKSIZE+1 launched a whole
    // extra (fully idle) block whenever number was an exact multiple of BLOCKSIZE
    const int numBlocks = (number + BLOCKSIZE - 1) / BLOCKSIZE;
    if (numBlocks > 0)  // a zero-block launch is an invalid configuration
        error<<<numBlocks, BLOCKSIZE>>>(c, a, b, number, actFunc, errorFunc);
}
//computes the total mse for (number) elements of (desired)-(neurons)
//NOTE(review): destructive — the (neurons) buffer is overwritten in place with the
//per-element errors by the kernel launch below before the dot product is taken
//uses the legacy cuBLAS API (cublas.h): cublasSdot returns the scalar result directly,
//so it blocks; default-stream ordering makes it run after the error kernel completes
float mseError(const float * desired, float * neurons, const int number, const int actFunc){
//one extra (idle) block is launched when number is an exact multiple of BLOCKSIZE;
//harmless because the kernel bounds-checks every thread
int numBlocks = number/BLOCKSIZE+1;
//store (desired-neurons)/spanS(actFunc) into neurons itself
//NOTE(review): assumes calcErr with ERROR_LINEAR is the identity — confirm in CudaErrorFunctions.cuh
error<<<numBlocks, BLOCKSIZE>>>(neurons,desired,neurons,number,actFunc,ERROR_LINEAR);
//does the product of each member then sums them all and divides for number
//(dot of the error vector with itself = sum of squared errors, hence MSE after the division)
return cublasSdot(number, neurons, 1, neurons, 1)/(float)number;
}
//argmax kernel: one warp per instance (row); each warp finds the index of the
//largest of the (nOfOut) values of its row of (neurons) and writes it to indexes[row]
//requires dynamic shared memory of 2*BLOCKSIZE floats: the first BLOCKSIZE hold the
//running max values, the second BLOCKSIZE hold the matching indexes (stored as float,
//so exact only while nOfOut fits in a float's integer range — presumably fine here)
//NOTE(review): the intra-warp reduction has no volatile qualifier and no __syncwarp;
//it relies on pre-Volta implicit warp-lockstep execution — verify before building for SM70+
__global__ void maxes(const int nOfInst, const int nOfOut, const float * neurons, int * indexes){
extern __shared__ float sdata[];
//second half of the dynamic shared buffer doubles as the index array
float *sidx = sdata + BLOCKSIZE;
const int tid = threadIdx.x; // thread index
const int thread_lane = tid & (WARP_SIZE-1); // thread index within the warp
const int g_tid = BLOCKSIZE * blockIdx.x + tid; // global thread index
const int g_warp_id = g_tid / WARP_SIZE; // global warp index
//start of this warp's row in the flattened (nOfInst x nOfOut) neurons array
const int offset=g_warp_id*nOfOut;
//loading in shared data of values
//every thread (even those of out-of-range warps) writes, so the parallel
//reduction below can safely read neighbours; out-of-range slots get 0.0f
sdata[tid]=((offset+thread_lane<nOfInst*nOfOut)&&(thread_lane<nOfOut))?neurons[offset+thread_lane]:0.0f;
//loading in shared data of indexes
sidx[tid]=thread_lane;
if(g_warp_id<nOfInst){
//some sequential reduction (suggested to maximize the throughput)
//each lane strides over the row in WARP_SIZE steps, keeping its running max and its index
for(unsigned int i = thread_lane+WARP_SIZE ; i < nOfOut ; i += WARP_SIZE){
float aux=neurons[offset+i];
if(sdata[tid] < aux){sdata[tid]=aux;sidx[tid]=i;}
}
//parallel reduction of both the value and the index
//tree reduction over the 32 lanes; offsets stay within this warp's sdata span
if (thread_lane < 16){
if(sdata[tid] < sdata[tid+16]){sdata[tid]=sdata[tid+16];sidx[tid]=sidx[tid+16];}
if(sdata[tid] < sdata[tid+8]){sdata[tid]=sdata[tid+8];sidx[tid]=sidx[tid+8];}
if(sdata[tid] < sdata[tid+4]){sdata[tid]=sdata[tid+4];sidx[tid]=sidx[tid+4];}
if(sdata[tid] < sdata[tid+2]){sdata[tid]=sdata[tid+2];sidx[tid]=sidx[tid+2];}
if(sdata[tid] < sdata[tid+1]){sdata[tid]=sdata[tid+1];sidx[tid]=sidx[tid+1];}
}
//return the best neuron index
//lane 0 holds the warp-wide winner; the float index converts back to int here
if (thread_lane == 0){
indexes[g_warp_id]=sidx[tid];
}
}
}
//find the (indexes) of the max values of each row of a set of (neurons),
//divided in rows(nOfOut) and columns(nOfInst)
//(neurons) and (indexes) are device pointers; one warp processes one instance
void computeMaxes(const int nOfInst, const int nOfOut, const float * neurons, int * indexes){
    const int warpsPerBlock = BLOCKSIZE / WARP_SIZE;
    // ceil-div: the previous nOfInst/warpsPerBlock+1 launched a whole extra
    // (fully idle) block whenever nOfInst was an exact multiple of warpsPerBlock
    const int numBlocks = (nOfInst + warpsPerBlock - 1) / warpsPerBlock;
    // dynamic shared memory: BLOCKSIZE floats for values + BLOCKSIZE for indexes
    const int smemSize = 2 * BLOCKSIZE * sizeof(float);
    if (numBlocks > 0)  // a zero-block launch is an invalid configuration
        maxes<<<numBlocks, BLOCKSIZE, smemSize>>>(nOfInst, nOfOut, neurons, indexes);
}
// Momentum-update kernel: weights[i] += momentum*(weights[i]-oldWeights[i]),
// then oldWeights[i] is refreshed with the pre-update weight.
// Expected launch: 1D grid of BLOCKSIZE-thread blocks covering (number) elements.
__global__ void addMom(float * weights, float * oldWeights,const int number, const float momentum){
    //global thread index
    const int g_tid = BLOCKSIZE * blockIdx.x + threadIdx.x;
    if (g_tid < number) {
        // BUG FIX: the original loaded weights[g_tid] BEFORE the bounds check,
        // performing an out-of-bounds global read in the tail block; the load
        // now happens only for valid indices
        const float weight = weights[g_tid];
        weights[g_tid] += momentum * (weight - oldWeights[g_tid]);
        // keep the pre-update value so the next call sees the previous delta
        oldWeights[g_tid] = weight;
    }
}
//adds momentum * (weights - oldWeights) to (number) elements of (weights)
//and refreshes (oldWeights) with the pre-update values
//(weights) and (oldWeights) are device pointers; asynchronous default-stream launch
void addMomentum(float * weights, float * oldWeights,const int number, const float momentum){
    const int gridSize = number / BLOCKSIZE + 1;  // one spare block covers the tail
    addMom<<<gridSize, BLOCKSIZE>>>(weights, oldWeights, number, momentum);
}
// Transpose kernel: element g_tid of the x-wide input matrix is written to
// its transposed position in the y-wide output matrix.
// Expected launch: 1D grid of BLOCKSIZE-thread blocks covering x*y elements.
__global__ void trMatrix(const int x, const int y, const float * in, float * out){
    // flat global index of this thread
    const int idx = blockIdx.x * BLOCKSIZE + threadIdx.x;
    if (idx >= x * y)
        return;  // tail-block guard
    const int col = idx % x;  // column within the input row
    const int row = idx / x;  // input row
    // transposed layout: (col,row) in a y-wide matrix
    out[col * y + row] = in[idx];
}
//translate a matrix x-y (rows large (x) and columns high (y)) to one y-x
//(in) and (out) are distinct device pointers; asynchronous default-stream launch
void translateMatrix(const int x, const int y, const float * in, float * out){
    const int total = x * y;
    const int gridSize = total / BLOCKSIZE + 1;  // one spare block covers the tail
    trMatrix<<<gridSize, BLOCKSIZE>>>(x, y, in, out);
}