/*
libcudann
Copyright (C) 2011 Luca Donati (lucadonati85@gmail.com)
*/
/*
* FeedForwardNNTrainer.cpp
*
* Created on: 19/nov/2010
* Author: donati
*/
#include "FeedForwardNNTrainer.h"
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <float.h>
#include <signal.h>
#include <cublas.h>
#include <cutil_inline.h>
//SIGINT handler: quit is polled by the training loops, so it must be volatile
volatile bool quit=false;
void terminate(int)
{
quit=true;
}
FeedForwardNNTrainer::FeedForwardNNTrainer() {
struct timeval tv;
gettimeofday(&tv, NULL);
srand(tv.tv_usec);
trainingSet = NULL;
testSet = NULL;
net = NULL;
bestMSETestNet = NULL;
bestMSETrainTestNet = NULL;
bestClassTestNet = NULL;
}
FeedForwardNNTrainer::~FeedForwardNNTrainer() {
}
//choose a net to operate on and save after the training
void FeedForwardNNTrainer::selectNet(FeedForwardNN & n)
{
net = &n;
}
//choose the training set
void FeedForwardNNTrainer::selectTrainingSet(LearningSet & s)
{
trainingSet = &s;
}
//choose the test set. if this is set, the error rate is computed on the test set instead of the training set
void FeedForwardNNTrainer::selectTestSet(LearningSet & s)
{
testSet = &s;
}
//choose a net to save the best network trained so far after each epoch. mse on test set is the criterion
void FeedForwardNNTrainer::selectBestMSETestNet(FeedForwardNN & n)
{
bestMSETestNet = &n;
}
//choose a net to save the best network trained so far after each epoch. mse on train set + mse on test set is the criterion
void FeedForwardNNTrainer::selectBestMSETrainTestNet(FeedForwardNN & n)
{
bestMSETrainTestNet = &n;
}
//choose a net to save the best network trained so far after each epoch. percentage as classifier is the criterion
void FeedForwardNNTrainer::selectBestClassTestNet(FeedForwardNN & n)
{
bestClassTestNet = &n;
}
//starts the training using params. n is the number of parameters
//the first 2 elements of params select where the training is executed (TRAIN_CPU,TRAIN_GPU)
//and the training algorithm (ALG_BP,ALG_BATCH...). the other parameters are algorithm dependent
//returns the best MSE on the test set (or on the train set if no test set is specified)
//printtype sets the verbosity of the execution (PRINT_ALL,PRINT_MIN,PRINT_OFF)
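//usage sketch (hypothetical setup; the FeedForwardNN/LearningSet constructor
//arguments are assumptions, see their headers for the real signatures):
//	LearningSet trainSet("train.set"), testSet("test.set");
//	FeedForwardNN net(/*layers matching the set sizes*/);
//	FeedForwardNNTrainer trainer;
//	trainer.selectNet(net);
//	trainer.selectTrainingSet(trainSet);
//	trainer.selectTestSet(testSet);
//	float params[]={TRAIN_CPU,ALG_BP,0.001,1000,100,0.7,0.3,SHUFFLE_ON,ERROR_TANH};
//	float mse=trainer.train(9,params,PRINT_ALL);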
float FeedForwardNNTrainer::train(const int n, const float * params, const int printtype){
//checks CTRL-C to interrupt training manually
quit=false;
signal(SIGINT,terminate);
setvbuf(stdout,(char*)NULL,_IONBF,0);
// checks for network and training set correct initialization
if(net==NULL){printf("NEURAL NETWORK NOT SELECTED\n");exit(1);}
if(trainingSet==NULL){printf("TRAINING SET NOT SELECTED\n");exit(1);}
if((trainingSet->getNumOfInputsPerInstance()!=net->getLayersSize()[0])
||(trainingSet->getNumOfOutputsPerInstance()!=net->getLayersSize()[net->getNumOfLayers()-1])){
printf("NETWORK AND TRAINING SET OF DIFFERENT SIZE\n");exit(1);
}
if(testSet!=NULL&&
(trainingSet->getNumOfInputsPerInstance()!=testSet->getNumOfInputsPerInstance()
||trainingSet->getNumOfOutputsPerInstance()!=testSet->getNumOfOutputsPerInstance())){
printf("TEST SET OF DIFFERENT SIZE\n");exit(1);}
if(n<2){printf("TOO FEW PARAMETERS SELECTED FOR TRAINING\n");exit(1);}
if(printtype!=PRINT_OFF){
printf("Network:\t\t");
printf("%d",net->getLayersSize()[0]);
for(int i=1;i<net->getNumOfLayers();i++)
printf("x%d",net->getLayersSize()[i]);
printf("\n");
printf("Activation functions:\t");
for(int i=0;i<net->getNumOfLayers();i++)
printf("%d ",net->getActFuncts()[i]);
printf("\n");
}
//select the right algorithm to execute the training
float result=0.0f;
switch((int)params[0]){
case TRAIN_CPU:
switch((int)params[1]){
case ALG_BP: result=trainCpuBp(n-2,params+2,printtype); break;
case ALG_BATCH: result=trainCpuBatch(n-2,params+2,printtype); break;
default: printf("TRAINING NOT IMPLEMENTED YET\n");exit(1); break;
}
break;
case TRAIN_GPU:
switch((int)params[1]){
case ALG_BP: printf("TRAINING NOT IMPLEMENTED YET\n");exit(1); break;
case ALG_BATCH: result=trainGPUBatch(n-2,params+2,printtype); break;
default: printf("TRAINING NOT IMPLEMENTED YET\n");exit(1); break;
}
break;
default: printf("TRAINING NOT IMPLEMENTED YET\n");exit(1); break;
}
//stops checking CTRL-C
signal(SIGINT,SIG_DFL);
return result;
}
//backpropagation training on host
//n is the number of parameters. parameters are (float array):
//desired error, max_epochs, epochs_between_reports, learning_rate, momentum (training is roughly 20% slower when momentum is used), shuffle (SHUFFLE_ON or SHUFFLE_OFF), error function (ERROR_TANH or ERROR_LINEAR)
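//e.g. the algorithm tail passed down from train() (a hypothetical sketch):
//	{0.001, 1000, 100, 0.7, 0.3, SHUFFLE_ON, ERROR_TANH}
//only the first two values are required; the rest default as parsed below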
float FeedForwardNNTrainer::trainCpuBp(const int n, const float * params, const int printtype){
//parameters parsing
float desired_error;
int max_epochs;
int epochs_between_reports;
float learningRate;
float momentum;
int shuff;
int errorFunc;
if(n<2){printf("TOO FEW PARAMETERS SELECTED FOR TRAINING\n");exit(1);}
desired_error=params[0];
max_epochs=params[1];
if(n>=3)
epochs_between_reports=params[2];
else
epochs_between_reports=max_epochs/10;
if(n>=4)
learningRate=params[3];
else
learningRate=0.7;
if(n>=5)
momentum=params[4];
else
momentum=0;
if(n>=6)
shuff=params[5];
else
shuff=SHUFFLE_ON;
if(n>=7)
errorFunc=params[6];
else
errorFunc=ERROR_TANH;
if(printtype!=PRINT_OFF){
printf("Training on:\t\tCPU\n");
printf("Algorithm:\t\tBackpropagation\n");
printf("Desired Error:\t\t%f\n",desired_error);
printf("Max epochs:\t\t%d\n",max_epochs);
printf("Epochs between reports:\t%d\n",epochs_between_reports);
printf("Learning rate:\t\t%f\n",learningRate);
printf("Momentum:\t\t%f\n",momentum);
if(shuff==SHUFFLE_ON)
printf("Shuffle:\t\tON\n");
else
printf("Shuffle:\t\tOFF\n");
if(errorFunc==ERROR_TANH)
printf("Error function:\t\tTANH\n");
else
printf("Error function:\t\tLINEAR\n");
printf("\n");
}
float mseTrain=FLT_MAX,mseTest=FLT_MAX;
//declare some error values for evaluating the trained network and storing best results
//best net MSE on test, best net MSE on train+test, best net as classifier on test
float bestMSETest=FLT_MAX,bestMSETrainTest=FLT_MAX,bestClassTest=0;
//declare some network values
int numOfLayers=net->getNumOfLayers();
int numOfWeights=net->getNumOfWeights();
int numOfNeurons=0;
for(int i=0;i<net->getNumOfLayers();i++){
numOfNeurons+=net->getLayersSize()[i]+1;
}
//declare an array of neurons to represent the neuron values
float * values = new float[numOfNeurons];
//declare an array of deltas to represent the gradients for the weight updates
float * deltas = new float[numOfNeurons];
//declare an array of weights to use for momentum
float * oldWeights = new float[numOfWeights];
//declare an array of temporary weights to use for batch and similar methods
float * tmpWeights = new float[numOfWeights];
//declare a pointer to the net weights
float * weights;
weights=net->getWeights();
//declare a pointer to the net activation functions
int * actFuncts;
actFuncts=net->getActFuncts();
//declare a pointer to the net layers size
int * layersSize;
layersSize=net->getLayersSize();
//declare some offsets to manage array indexes of each layer 'i'
int offsetWeights[numOfLayers];
int offsetIns[numOfLayers];
int offsetOuts[numOfLayers];
int offsetDeltas[numOfLayers];
for(int i=0;i<numOfLayers;i++){
//calculates the offsets of the arrays
offsetWeights[i] = 0;
offsetDeltas[i] = layersSize[0]+1;
offsetIns[i] = 0;
offsetOuts[i] = layersSize[0]+1;
for(int j=0;j<i;j++){
offsetWeights[i]+=(layersSize[j]+1)*layersSize[j+1];
offsetIns[i]+=layersSize[j]+1;
offsetOuts[i]+=layersSize[j+1]+1;
offsetDeltas[i]+=layersSize[j+1]+1;
}
}
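//worked example (hypothetical 2-3-1 net, layersSize={2,3,1}):
//	offsetIns={0,3,7}, offsetOuts={3,7,9}, offsetDeltas={3,7,9}, offsetWeights={0,9,13}
//	i.e. values is [in0,in1,bias | h0,h1,h2,bias | out0,bias], and the outputs of
//	layer i start exactly where the inputs of layer i+1 do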
//save previous weights to use in momentum calculation
for(int w=0;w<numOfWeights;w++)
oldWeights[w]=weights[w];
//declare some training set values
int numOfInstances=trainingSet->getNumOfInstances();
int numOfInputsPerInstance=trainingSet->getNumOfInputsPerInstance();
int numOfOutputsPerInstance=trainingSet->getNumOfOutputsPerInstance();
//declare a pointer to the training set inputs
float * trainingSetInputs;
trainingSetInputs=trainingSet->getInputs();
//declare a pointer to the training set outputs
float * trainingSetOutputs;
trainingSetOutputs=trainingSet->getOutputs();
//vector to shuffle training set
int order[numOfInstances];
for(int i=0;i<numOfInstances;i++)
order[i]=i;
if(printtype==PRINT_ALL){
//compute starting error rates
printf("Starting:\tError on train set %.10f",net->computeMSE(*trainingSet));
if(testSet!=NULL){
printf("\t\tError on test set %.10f",net->computeMSE(*testSet));
}
printf("\n");
}
//epochs training
for(int epoch=1;epoch<=max_epochs&&quit==false;epoch++){
//shuffle instances
int ind=0,aux=0;
if(shuff==SHUFFLE_ON)
for(int i=0;i<numOfInstances;i++){
ind=(rand()%(numOfInstances-i))+i;
aux=order[ind];
order[ind]=order[i];
order[i]=aux;
}
//instances training
for(int instance=0;instance<numOfInstances;instance++){
//computes a single instance forward of the backpropagation training
stepForward(values,weights,actFuncts,numOfLayers,layersSize,numOfInputsPerInstance,trainingSetInputs,offsetIns,offsetWeights,offsetOuts,order,instance);
//computes a single instance backward of the backpropagation training
stepBack(values,weights,deltas,actFuncts,numOfLayers,layersSize,numOfOutputsPerInstance,trainingSetOutputs,offsetWeights,offsetDeltas,offsetOuts,order,instance,errorFunc);
//update the weights using the deltas
weightsUpdate(values,weights,weights,deltas,numOfLayers,layersSize,offsetIns,offsetWeights,offsetDeltas,momentum,oldWeights,learningRate);
}
if(epochs_between_reports>0&&epoch%epochs_between_reports==0){
mseTrain=net->computeMSE(*trainingSet);
if(printtype==PRINT_ALL)
printf("Epoch\t%d\tError on train set %.10f",epoch,mseTrain);
if(testSet!=NULL){
mseTest=net->computeMSE(*testSet);
if(mseTest<bestMSETest){
bestMSETest=mseTest;
if(bestMSETestNet!=NULL){
*bestMSETestNet=*net;
}
}
if((mseTrain+mseTest)<bestMSETrainTest&&bestMSETrainTestNet!=NULL){
*bestMSETrainTestNet=*net;
bestMSETrainTest=mseTrain+mseTest;
}
if(printtype==PRINT_ALL)
printf("\t\tError on test set %.10f",mseTest);
if(bestClassTestNet!=NULL){
float per=net->classificatePerc(*testSet);
if(printtype==PRINT_ALL)
printf("\t\tClassification percentage on test set: %.1f%%",per*100);
if(per>bestClassTest){
*bestClassTestNet=*net;
bestClassTest=per;
if(printtype==PRINT_ALL)
printf(" ***");
}
}
if(mseTest<=desired_error){
if(printtype==PRINT_ALL)
printf("\nDesired error reached on test set.\n");
break;
}
}
if(printtype==PRINT_ALL)
printf("\n");
if(mseTrain<=desired_error&&testSet==NULL){
if(printtype==PRINT_ALL)
printf("Desired error reached on training set.\n");
break;
}
}
}
delete [] values;
delete [] deltas;
delete [] oldWeights;
delete [] tmpWeights;
if(printtype==PRINT_ALL)
printf("Training complete.\n");
//if no report was ever produced, measure the final errors now
if(mseTrain==FLT_MAX){
mseTrain=net->computeMSE(*trainingSet);
if(testSet!=NULL)
bestMSETest=net->computeMSE(*testSet);
}
if(testSet!=NULL){
return bestMSETest;
}
else return mseTrain;
}
//batch training on host
//n is the number of parameters. parameters are (float array):
//desired error, max_epochs, epochs_between_reports, learning_rate, momentum (training is roughly 20% slower when momentum is used), shuffle (SHUFFLE_ON or SHUFFLE_OFF), error function (ERROR_TANH or ERROR_LINEAR)
float FeedForwardNNTrainer::trainCpuBatch(const int n, const float * params, const int printtype){
//parameters parsing
float desired_error;
int max_epochs;
int epochs_between_reports;
float learningRate;
float momentum;
int shuff;
int errorFunc;
if(n<2){printf("TOO FEW PARAMETERS SELECTED FOR TRAINING\n");exit(1);}
desired_error=params[0];
max_epochs=params[1];
if(n>=3)
epochs_between_reports=params[2];
else
epochs_between_reports=max_epochs/10;
if(n>=4)
learningRate=params[3];
else
learningRate=0.7;
if(n>=5)
momentum=params[4];
else
momentum=0;
if(n>=6)
shuff=params[5];
else
shuff=SHUFFLE_ON;
if(n>=7)
errorFunc=params[6];
else
errorFunc=ERROR_TANH;
if(printtype!=PRINT_OFF){
printf("Training on:\t\tCPU\n");
printf("Algorithm:\t\tBatch\n");
printf("Desired Error:\t\t%f\n",desired_error);
printf("Max epochs:\t\t%d\n",max_epochs);
printf("Epochs between reports:\t%d\n",epochs_between_reports);
printf("Learning rate:\t\t%f\n",learningRate);
printf("Momentum:\t\t%f\n",momentum);
if(shuff==SHUFFLE_ON)
printf("Shuffle:\t\tON\n");
else
printf("Shuffle:\t\tOFF\n");
if(errorFunc==ERROR_TANH)
printf("Error function:\t\tTANH\n");
else
printf("Error function:\t\tLINEAR\n");
printf("\n");
}
float mseTrain=FLT_MAX,mseTest=FLT_MAX;
//declare some error values for evaluating the trained network and storing best results
//best net MSE on test, best net MSE on train+test, best net as classifier on test
float bestMSETest=FLT_MAX,bestMSETrainTest=FLT_MAX,bestClassTest=0;
//declare some network values
int numOfLayers=net->getNumOfLayers();
int numOfWeights=net->getNumOfWeights();
int numOfNeurons=0;
for(int i=0;i<net->getNumOfLayers();i++){
numOfNeurons+=net->getLayersSize()[i]+1;
}
//declare an array of neurons to represent the neuron values
float * values = new float[numOfNeurons];
//declare an array of deltas to represent the gradients for the weight updates
float * deltas = new float[numOfNeurons];
//declare an array of weights to use for momentum
float * oldWeights = new float[numOfWeights];
//declare an array of temporary weights to use for batch and similar methods
float * tmpWeights = new float[numOfWeights];
//declare a pointer to the net weights
float * weights;
weights=net->getWeights();
//declare a pointer to the net activation functions
int * actFuncts;
actFuncts=net->getActFuncts();
//declare a pointer to the net layers size
int * layersSize;
layersSize=net->getLayersSize();
//declare some offsets to manage array indexes of each layer 'i'
int offsetWeights[numOfLayers];
int offsetIns[numOfLayers];
int offsetOuts[numOfLayers];
int offsetDeltas[numOfLayers];
for(int i=0;i<numOfLayers;i++){
//calculates the offsets of the arrays
offsetWeights[i] = 0;
offsetDeltas[i] = layersSize[0]+1;
offsetIns[i] = 0;
offsetOuts[i] = layersSize[0]+1;
for(int j=0;j<i;j++){
offsetWeights[i]+=(layersSize[j]+1)*layersSize[j+1];
offsetIns[i]+=layersSize[j]+1;
offsetOuts[i]+=layersSize[j+1]+1;
offsetDeltas[i]+=layersSize[j+1]+1;
}
}
//save previous weights to use in momentum calculation
for(int w=0;w<numOfWeights;w++)
oldWeights[w]=weights[w];
//resets temporary weights for batch
for(int w=0;w<numOfWeights;w++)
tmpWeights[w]=0;
//declare some training set values
int numOfInstances=trainingSet->getNumOfInstances();
int numOfInputsPerInstance=trainingSet->getNumOfInputsPerInstance();
int numOfOutputsPerInstance=trainingSet->getNumOfOutputsPerInstance();
//declare a pointer to the training set inputs
float * trainingSetInputs;
trainingSetInputs=trainingSet->getInputs();
//declare a pointer to the training set outputs
float * trainingSetOutputs;
trainingSetOutputs=trainingSet->getOutputs();
//vector to shuffle training set
int order[numOfInstances];
for(int i=0;i<numOfInstances;i++)
order[i]=i;
if(printtype==PRINT_ALL){
//compute starting error rates
printf("Starting:\tError on train set %.10f",net->computeMSE(*trainingSet));
if(testSet!=NULL){
printf("\t\tError on test set %.10f",net->computeMSE(*testSet));
}
printf("\n");
}
//epochs training
for(int epoch=1;epoch<=max_epochs&&quit==false;epoch++){
//shuffle instances
int ind=0,aux=0;
if(shuff==SHUFFLE_ON)
for(int i=0;i<numOfInstances;i++){
ind=(rand()%(numOfInstances-i))+i;
aux=order[ind];
order[ind]=order[i];
order[i]=aux;
}
//instances training
for(int instance=0;instance<numOfInstances;instance++){
//computes a single instance forward of the backpropagation training
stepForward(values,weights,actFuncts,numOfLayers,layersSize,numOfInputsPerInstance,trainingSetInputs,offsetIns,offsetWeights,offsetOuts,order,instance);
//computes a single instance backward of the backpropagation training
stepBack(values,weights,deltas,actFuncts,numOfLayers,layersSize,numOfOutputsPerInstance,trainingSetOutputs,offsetWeights,offsetDeltas,offsetOuts,order,instance,errorFunc);
//update the weights using the deltas
//no momentum is used, it will be added after all the instances
weightsUpdate(values,weights,tmpWeights,deltas,numOfLayers,layersSize,offsetIns,offsetWeights,offsetDeltas,0,oldWeights,learningRate);
}
//add the accumulated batch changes to the real weights; the total is divided by the
//number of instances so the same learning rate as standard BP can be used; momentum
//is also applied here: w += dw/numOfInstances + momentum*(w - wOld)
for(int w=0;w<numOfWeights;w++){
float auxWeight=weights[w];
weights[w]+=(tmpWeights[w]/numOfInstances)+momentum*(auxWeight-oldWeights[w]);
tmpWeights[w]=0;
oldWeights[w]=auxWeight;
}
if(epochs_between_reports>0&&epoch%epochs_between_reports==0){
mseTrain=net->computeMSE(*trainingSet);
if(printtype==PRINT_ALL)
printf("Epoch\t%d\tError on train set %.10f",epoch,mseTrain);
if(testSet!=NULL){
mseTest=net->computeMSE(*testSet);
if(mseTest<bestMSETest){
bestMSETest=mseTest;
if(bestMSETestNet!=NULL){
*bestMSETestNet=*net;
}
}
if((mseTrain+mseTest)<bestMSETrainTest&&bestMSETrainTestNet!=NULL){
*bestMSETrainTestNet=*net;
bestMSETrainTest=mseTrain+mseTest;
}
if(printtype==PRINT_ALL)
printf("\t\tError on test set %.10f",mseTest);
if(bestClassTestNet!=NULL){
float per=net->classificatePerc(*testSet);
if(printtype==PRINT_ALL)
printf("\t\tClassification percentage on test set: %.1f%%",per*100);
if(per>bestClassTest){
*bestClassTestNet=*net;
bestClassTest=per;
if(printtype==PRINT_ALL)
printf(" ***");
}
}
if(mseTest<=desired_error){
if(printtype==PRINT_ALL)
printf("\nDesired error reached on test set.\n");
break;
}
}
if(printtype==PRINT_ALL)
printf("\n");
if(mseTrain<=desired_error&&testSet==NULL){
if(printtype==PRINT_ALL)
printf("Desired error reached on training set.\n");
break;
}
}
}
delete [] values;
delete [] deltas;
delete [] oldWeights;
delete [] tmpWeights;
if(printtype==PRINT_ALL)
printf("Training complete.\n");
//if no report was ever produced, measure the final errors now
if(mseTrain==FLT_MAX){
mseTrain=net->computeMSE(*trainingSet);
if(testSet!=NULL)
bestMSETest=net->computeMSE(*testSet);
}
if(testSet!=NULL){
return bestMSETest;
}
else return mseTrain;
}
//batch training on device
//n is the number of parameters. parameters are (float array):
//desired error, max_epochs, epochs_between_reports, learning_rate, momentum (training is roughly 20% slower when momentum is used), shuffle (SHUFFLE_ON or SHUFFLE_OFF), error function (ERROR_TANH or ERROR_LINEAR)
float FeedForwardNNTrainer::trainGPUBatch(const int n, const float * params, const int printtype){
//parameters parsing
float desired_error;
int max_epochs;
int epochs_between_reports;
float learningRate;
float momentum;
int shuff;
int errorFunc;
if(n<2){printf("TOO FEW PARAMETERS SELECTED FOR TRAINING\n");exit(1);}
desired_error=params[0];
max_epochs=params[1];
if(n>=3)
epochs_between_reports=params[2];
else
epochs_between_reports=max_epochs/10;
if(n>=4)
learningRate=params[3];
else
learningRate=0.7;
if(n>=5)
momentum=params[4];
else
momentum=0;
if(n>=6)
shuff=params[5];
else
shuff=SHUFFLE_ON;
if(n>=7)
errorFunc=params[6];
else
errorFunc=ERROR_TANH;
if(printtype!=PRINT_OFF){
printf("Training on:\t\tGPU\n");
printf("Algorithm:\t\tBatch\n");
printf("Desired Error:\t\t%f\n",desired_error);
printf("Max epochs:\t\t%d\n",max_epochs);
printf("Epochs between reports:\t%d\n",epochs_between_reports);
printf("Learning rate:\t\t%f\n",learningRate);
printf("Momentum:\t\t%f\n",momentum);
if(shuff==SHUFFLE_ON)
printf("Shuffle:\t\tON\n");
else
printf("Shuffle:\t\tOFF\n");
if(errorFunc==ERROR_TANH)
printf("Error function:\t\tTANH\n");
else
printf("Error function:\t\tLINEAR\n");
printf("\n");
}
float mseTrain=FLT_MAX,mseTest=FLT_MAX;
//declare some error values for evaluating the trained network and storing best results
//best net MSE on test, best net MSE on train+test, best net as classifier on test
float bestMSETest=FLT_MAX,bestMSETrainTest=FLT_MAX,bestClassTest=0;
//declare some network values
int numOfLayers=net->getNumOfLayers();
int numOfWeights=net->getNumOfWeights();
int numOfNeurons=0;
for(int i=0;i<net->getNumOfLayers();i++){
numOfNeurons+=net->getLayersSize()[i]+1;
}
//declare some training set values
int numOfInstances=trainingSet->getNumOfInstances();
int numOfInputsPerInstance=trainingSet->getNumOfInputsPerInstance();
int numOfOutputsPerInstance=trainingSet->getNumOfOutputsPerInstance();
int numOfTestInstances=0;
if(testSet!=NULL){
numOfTestInstances=testSet->getNumOfInstances();
}
//declare an array of neurons to represent the neuron values
float * values = new float[numOfNeurons*numOfInstances];
//declare an array of neurons to represent the neuron values of the test set
float * testValues = new float[numOfNeurons*numOfTestInstances];
//declare an array of deltas to represent the gradients for the weight updates
float * deltas = new float[numOfNeurons*numOfInstances];
//training and test set to be uploaded in device memory
float * columnTrainingSetInputs = new float[numOfInstances*numOfInputsPerInstance];
float * columnTrainingSetOutputs = new float[numOfInstances*numOfOutputsPerInstance];
float * columnTestSetInputs = new float[numOfTestInstances*numOfInputsPerInstance];
float * columnTestSetOutputs = new float[numOfTestInstances*numOfOutputsPerInstance];
//declare an array of weights to use for momentum
float * oldWeights = new float[numOfWeights];
//declare a pointer to the net weights
float * weights;
weights=net->getWeights();
//declare a pointer to the net activation functions
int * actFuncts;
actFuncts=net->getActFuncts();
//declare a pointer to the net layers size
int * layersSize;
layersSize=net->getLayersSize();
//declare a pointer to the training set inputs
float * trainingSetInputs;
//declare a pointer to the training set outputs
float * trainingSetOutputs;
trainingSetInputs=trainingSet->getInputs();
trainingSetOutputs=trainingSet->getOutputs();
//declare a pointer to the test set inputs
float * testSetInputs=NULL;
//declare a pointer to the test set outputs
float * testSetOutputs=NULL;
if(testSet!=NULL){
testSetInputs=testSet->getInputs();
testSetOutputs=testSet->getOutputs();
}
//declare some offsets to manage array indexes of each layer 'i'
int offsetWeights[numOfLayers];
int offsetIns[numOfLayers];
int offsetOuts[numOfLayers];
int offsetDeltas[numOfLayers];
int offsetTestIns[numOfLayers];
int offsetTestOuts[numOfLayers];
for(int i=0;i<numOfLayers;i++){
//calculates the offsets of the arrays
offsetWeights[i] = 0;
offsetDeltas[i] = (layersSize[0]+1)*numOfInstances;
offsetIns[i] = 0;
offsetOuts[i] = (layersSize[0]+1)*numOfInstances;
offsetTestIns[i] = 0;
offsetTestOuts[i] = (layersSize[0]+1)*numOfTestInstances;
for(int j=0;j<i;j++){
offsetWeights[i]+=(layersSize[j]+1)*layersSize[j+1];
offsetIns[i]+=(layersSize[j]+1)*numOfInstances;
offsetOuts[i]+=(layersSize[j+1]+1)*numOfInstances;
offsetDeltas[i]+=(layersSize[j+1]+1)*numOfInstances;
offsetTestIns[i]+=(layersSize[j]+1)*numOfTestInstances;
offsetTestOuts[i]+=(layersSize[j+1]+1)*numOfTestInstances;
}
}
//resets values and deltas
for(int i=0;i<numOfNeurons*numOfInstances;i++)values[i]=0.0f;
for(int i=0;i<numOfNeurons*numOfTestInstances;i++)testValues[i]=0.0f;
for(int i=0;i<numOfNeurons*numOfInstances;i++)deltas[i]=0.0f;
//convert the sets from row-major to column-major indexing for cuBLAS
for(int i=0;i<numOfInstances;i++){
for(int j=0;j<numOfInputsPerInstance;j++)
columnTrainingSetInputs[j*numOfInstances+i]=trainingSetInputs[i*numOfInputsPerInstance+j];
for(int j=0;j<numOfOutputsPerInstance;j++)
columnTrainingSetOutputs[j*numOfInstances+i]=trainingSetOutputs[i*numOfOutputsPerInstance+j];
}
for(int i=0;i<numOfTestInstances;i++){
for(int j=0;j<numOfInputsPerInstance;j++)
columnTestSetInputs[j*numOfTestInstances+i]=testSetInputs[i*numOfInputsPerInstance+j];
for(int j=0;j<numOfOutputsPerInstance;j++)
columnTestSetOutputs[j*numOfTestInstances+i]=testSetOutputs[i*numOfOutputsPerInstance+j];
}
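//e.g. with 2 instances and 2 inputs, the row-major buffer {a0,a1,b0,b1} becomes
//the column-major buffer {a0,b0,a1,b1}: each feature is a contiguous column,
//which is the layout the cublasSgemm calls below expect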
//copy the training set into the input neurons values
for(int i=0;i<numOfInstances*numOfInputsPerInstance;i++)
values[i]=columnTrainingSetInputs[i];
//copy the test set into the input neurons values
for(int i=0;i<numOfTestInstances*numOfInputsPerInstance;i++)
testValues[i]=columnTestSetInputs[i];
//BIAS initializations
for(int i=0;i<numOfLayers;i++){
for(int j=offsetIns[i]+(layersSize[i])*numOfInstances;j<offsetOuts[i];j++)
values[j]=1.0f;
}
if(testSet!=NULL)
for(int i=0;i<numOfLayers;i++){
for(int j=offsetTestIns[i]+(layersSize[i])*numOfTestInstances;j<offsetTestOuts[i];j++)
testValues[j]=1.0f;
}
//vector to shuffle training set
int order[numOfInstances];
for(int i=0;i<numOfInstances;i++)
order[i]=i;
//cublas initializations
cublasStatus stat;
cublasInit();
float * devValues=NULL;
float * devTestValues=NULL;
float * devDeltas=NULL;
float * devWeights=NULL;
float * devOldWeights=NULL;
float * devTrainingSetInputs=NULL;
float * devTrainingSetOutputs=NULL;
float * devTestSetInputs=NULL;
float * devTestSetOutputs=NULL;
//allocates the vectors on the device
stat = cublasAlloc (numOfNeurons*numOfInstances, sizeof(*values), (void**)&devValues);
if(stat != CUBLAS_STATUS_SUCCESS){printf ("device memory allocation failed\n");exit(1);}
if(testSet!=NULL){
stat = cublasAlloc (numOfNeurons*numOfTestInstances, sizeof(*testValues), (void**)&devTestValues);
if(stat != CUBLAS_STATUS_SUCCESS){printf ("device memory allocation failed\n");exit(1);}
}
stat = cublasAlloc (numOfNeurons*numOfInstances, sizeof(*deltas), (void**)&devDeltas);
if(stat != CUBLAS_STATUS_SUCCESS){printf ("device memory allocation failed\n");exit(1);}
stat = cublasAlloc (numOfWeights, sizeof(*weights), (void**)&devWeights);
if(stat != CUBLAS_STATUS_SUCCESS){printf ("device memory allocation failed\n");exit(1);}
stat = cublasAlloc (numOfWeights, sizeof(*oldWeights), (void**)&devOldWeights);
if(stat != CUBLAS_STATUS_SUCCESS){printf ("device memory allocation failed\n");exit(1);}
stat = cublasAlloc (numOfInstances*numOfInputsPerInstance, sizeof(*devTrainingSetInputs), (void**)&devTrainingSetInputs);
if(stat != CUBLAS_STATUS_SUCCESS){printf ("device memory allocation failed\n");exit(1);}
stat = cublasAlloc (numOfInstances*numOfOutputsPerInstance, sizeof(*devTrainingSetOutputs), (void**)&devTrainingSetOutputs);
if(stat != CUBLAS_STATUS_SUCCESS){printf ("device memory allocation failed\n");exit(1);}
if(testSet!=NULL){
stat = cublasAlloc (numOfTestInstances*numOfInputsPerInstance, sizeof(*devTestSetInputs), (void**)&devTestSetInputs);
if(stat != CUBLAS_STATUS_SUCCESS){printf ("device memory allocation failed\n");exit(1);}
stat = cublasAlloc (numOfTestInstances*numOfOutputsPerInstance, sizeof(*devTestSetOutputs), (void**)&devTestSetOutputs);
if(stat != CUBLAS_STATUS_SUCCESS){printf ("device memory allocation failed\n");exit(1);}
}
//copies the training set inputs and outputs on the device
cudaMemcpy(devTrainingSetInputs, columnTrainingSetInputs, numOfInstances*numOfInputsPerInstance*sizeof(*columnTrainingSetInputs), cudaMemcpyHostToDevice);
cudaMemcpy(devTrainingSetOutputs, columnTrainingSetOutputs, numOfInstances*numOfOutputsPerInstance*sizeof(*columnTrainingSetOutputs), cudaMemcpyHostToDevice);
if(testSet!=NULL){
//copies the test set inputs and outputs on the device
cudaMemcpy(devTestSetInputs, columnTestSetInputs, numOfTestInstances*numOfInputsPerInstance*sizeof(*columnTestSetInputs), cudaMemcpyHostToDevice);
cudaMemcpy(devTestSetOutputs, columnTestSetOutputs, numOfTestInstances*numOfOutputsPerInstance*sizeof(*columnTestSetOutputs), cudaMemcpyHostToDevice);
}
//copies the training set inputs (with the biases) to the device
cudaMemcpy(devValues, values, numOfNeurons*numOfInstances*sizeof(*values), cudaMemcpyHostToDevice);
if(testSet!=NULL){
//copies the test set inputs (with the biases) to the device
cudaMemcpy(devTestValues, testValues, numOfNeurons*numOfTestInstances*sizeof(*testValues), cudaMemcpyHostToDevice);
}
cudaMemcpy(devDeltas, deltas, numOfNeurons*numOfInstances*sizeof(*deltas), cudaMemcpyHostToDevice);
//weights are allocated row-major
cudaMemcpy(devWeights, weights, numOfWeights*sizeof(*weights), cudaMemcpyHostToDevice);
cudaMemcpy(devOldWeights, weights, numOfWeights*sizeof(*weights), cudaMemcpyHostToDevice);
if(printtype==PRINT_ALL){
//compute starting error rates (GPU)
printf("Starting:\tError on train set %.10f",GPUComputeMSE(devValues,devWeights,actFuncts,numOfLayers,layersSize,numOfInstances,numOfOutputsPerInstance,devTrainingSetOutputs,offsetIns,offsetWeights,offsetOuts));
if(testSet!=NULL){
printf("\t\tError on test set %.10f",GPUComputeMSE(devTestValues,devWeights,actFuncts,numOfLayers,layersSize,numOfTestInstances,numOfOutputsPerInstance,devTestSetOutputs,offsetTestIns,offsetWeights,offsetTestOuts));
}
printf("\n");
}
//epochs training
for(int epoch=1;epoch<=max_epochs&&quit==false;epoch++){
//shuffle instances (kept for parity with the CPU trainers; the full-batch GPU update below does not use the order vector)
int ind=0,aux=0;
if(shuff==SHUFFLE_ON)
for(int i=0;i<numOfInstances;i++){
ind=(rand()%(numOfInstances-i))+i;
aux=order[ind];
order[ind]=order[i];
order[i]=aux;
}
//training
//computes all the instances forward of the backpropagation training
GPUForward(devValues,devWeights,actFuncts,numOfLayers,layersSize,numOfInstances,offsetIns,offsetWeights,offsetOuts);
//computes all the instances backward of the backpropagation training
GPUBack(devValues,devWeights,devDeltas,actFuncts,numOfLayers,layersSize,numOfInstances,numOfOutputsPerInstance,devTrainingSetOutputs,offsetWeights,offsetDeltas,offsetOuts,errorFunc);
//update the weights using the deltas
GPUUpdate(devValues,devWeights,devDeltas,numOfLayers,layersSize,numOfInstances,offsetIns,offsetWeights,offsetDeltas,momentum,devOldWeights,learningRate);
if(epochs_between_reports>0&&epoch%epochs_between_reports==0){
cudaMemcpy(weights,devWeights,numOfWeights*sizeof(float),cudaMemcpyDeviceToHost);
//float mseTrain=net->computeMSE(*trainingSet);
mseTrain=GPUComputeMSE(devValues,devWeights,actFuncts,numOfLayers,layersSize,numOfInstances,numOfOutputsPerInstance,devTrainingSetOutputs,offsetIns,offsetWeights,offsetOuts);
if(printtype==PRINT_ALL)
printf("Epoch\t%d\tError on train set %.10f",epoch,mseTrain);
if(testSet!=NULL){
//float mseTest=net->computeMSE(*testSet);
mseTest=GPUComputeMSE(devTestValues,devWeights,actFuncts,numOfLayers,layersSize,numOfTestInstances,numOfOutputsPerInstance,devTestSetOutputs,offsetTestIns,offsetWeights,offsetTestOuts);
if(mseTest<bestMSETest){
bestMSETest=mseTest;
if(bestMSETestNet!=NULL){
*bestMSETestNet=*net;
}
}
if((mseTrain+mseTest)<bestMSETrainTest&&bestMSETrainTestNet!=NULL){
*bestMSETrainTestNet=*net;
bestMSETrainTest=mseTrain+mseTest;
}
if(printtype==PRINT_ALL)
printf("\t\tError on test set %.10f",mseTest);
if(bestClassTestNet!=NULL){
//float per=net->classificatePerc(*testSet);
float per=GPUclassificatePerc(devTestValues,devWeights,actFuncts,numOfLayers,layersSize,numOfTestInstances,numOfOutputsPerInstance,devTestSetOutputs,offsetTestIns,offsetWeights,offsetTestOuts);
if(printtype==PRINT_ALL)
printf("\t\tClassification percentage on test set: %.1f%%",per*100);
if(per>bestClassTest){
*bestClassTestNet=*net;
bestClassTest=per;
if(printtype==PRINT_ALL)
printf(" ***");
}
}
if(mseTest<=desired_error){
if(printtype==PRINT_ALL)
printf("\nDesired error reached on test set.\n");
break;
}
}
if(printtype==PRINT_ALL)
printf("\n");
if(mseTrain<=desired_error&&testSet==NULL){
if(printtype==PRINT_ALL)
printf("Desired error reached on training set.\n");
break;
}
}
}
cudaMemcpy(weights,devWeights,numOfWeights*sizeof(float),cudaMemcpyDeviceToHost);
//cublas deallocations
cublasFree (devValues);
cublasFree (devTestValues);
cublasFree (devDeltas);
cublasFree (devWeights);
cublasFree (devOldWeights);
cublasFree (devTrainingSetInputs);
cublasFree (devTrainingSetOutputs);
cublasFree (devTestSetInputs);
cublasFree (devTestSetOutputs);
cublasShutdown();
delete [] values;
delete [] testValues;
delete [] deltas;
delete [] oldWeights;
delete [] columnTrainingSetInputs;
delete [] columnTrainingSetOutputs;
delete [] columnTestSetInputs;
delete [] columnTestSetOutputs;
if(printtype==PRINT_ALL)
printf("Training complete.\n");
//if no report was ever produced, measure the final errors now (the weights were copied back above)
if(mseTrain==FLT_MAX){
mseTrain=net->computeMSE(*trainingSet);
if(testSet!=NULL)
bestMSETest=net->computeMSE(*testSet);
}
if(testSet!=NULL){
return bestMSETest;
}
else return mseTrain;
}
//computes a single instance forward of the backpropagation training
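//values holds one instance laid out as [layer-0 inputs, bias, layer-1 outputs, bias, ...],
//addressed through the offsetIns/offsetOuts arrays computed in the trainers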
void FeedForwardNNTrainer::stepForward(float * values, const float * weights, const int * actFuncts, const int numOfLayers, const int * layersSize, const int numOfInputsPerInstance, const float * trainingSetInputs, const int * offsetIns, const int * offsetWeights, const int * offsetOuts, const int * order, const int instance){
//load an array of inputs
for(int i=0;i<numOfInputsPerInstance;i++)
values[i]=trainingSetInputs[order[instance]*numOfInputsPerInstance+i];
//loops the layers
for(int i=0;i<numOfLayers-1;i++){
//bias neuron
values[offsetIns[i]+layersSize[i]]=1.0;
float tot=0;
//loops the outputs
for(int j=0;j<layersSize[i+1];j++){
//partially unrolled sum: the switch handles the remainder, then the loop adds four terms at a time (faster, and it limits floating point error accumulation)
tot=0;
int k=(layersSize[i]+1)%4;
switch (k){
case 3:tot += weights[2+j*(layersSize[i]+1)+offsetWeights[i]]*values[2+offsetIns[i]];
case 2:tot += weights[1+j*(layersSize[i]+1)+offsetWeights[i]]*values[1+offsetIns[i]];
case 1:tot += weights[j*(layersSize[i]+1)+offsetWeights[i]]*values[offsetIns[i]];
case 0:break;
}
for(;k<layersSize[i]+1;k+=4){
tot+= weights[k+j*(layersSize[i]+1)+offsetWeights[i]]*values[k+offsetIns[i]]+
weights[k+1+j*(layersSize[i]+1)+offsetWeights[i]]*values[k+1+offsetIns[i]]+
weights[k+2+j*(layersSize[i]+1)+offsetWeights[i]]*values[k+2+offsetIns[i]]+
weights[k+3+j*(layersSize[i]+1)+offsetWeights[i]]*values[k+3+offsetIns[i]];
}
//write the outputs of the layer
values[j+offsetOuts[i]]=actFunction(actFuncts[i+1],tot);
}
}
}
//computes a single instance backward of the backpropagation training
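//delta rules applied below (f'=actDerivation, e=errorFunction):
//	output layer:  delta_j = f'(o_j) * e((t_j - o_j)/spanSize)
//	hidden layers: delta_j = f'(o_j) * sum_k delta_k * w_kj   (k runs over the next layer)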
void FeedForwardNNTrainer::stepBack(const float * values, const float * weights, float * deltas, const int * actFuncts, const int numOfLayers, const int * layersSize, const int numOfOutputsPerInstance, const float * trainingSetOutputs, const int * offsetWeights, const int * offsetDeltas, const int * offsetOuts, const int * order, const int instance, const int errorFunc){
//loop layers backwards (from last hidden to inputs)
for(int i=numOfLayers-2;i>=0;i--){
//output layer (different delta rule; no bias neuron is included in this layer's size)
if(i==numOfLayers-2){
for(int j=0;j<layersSize[i+1];j++){
float error=(trainingSetOutputs[j+order[instance]*numOfOutputsPerInstance]-values[j+offsetOuts[i]])/spanSize(actFuncts[i+1]);
deltas[j+offsetDeltas[i]]=actDerivation(actFuncts[i+1],values[j+offsetOuts[i]])*errorFunction(error,errorFunc);
}
}
//normal hidden layer
else{
//efficient accumulation: the next layer's weights are walked row by row
float tmpErrors[layersSize[i+1]+1];
for(int j=0;j<layersSize[i+1]+1;j++)
tmpErrors[j]=0;
for(int k=0;k<layersSize[i+2];k++){
float precalc=deltas[k+offsetDeltas[i+1]];
for(int j=0;j<layersSize[i+1]+1;j++){
//next layer's delta and weights are used
tmpErrors[j]+=precalc*weights[j+k*(layersSize[i+1]+1)+offsetWeights[i+1]];
}
}
for(int j=0;j<layersSize[i+1]+1;j++){
deltas[j+offsetDeltas[i]]=actDerivation(actFuncts[i+1],values[j+offsetOuts[i]])*tmpErrors[j];
}
}
}
}
//update the weights using the deltas
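//update rule applied below:
//	w_ij += learningRate*delta_j*o_i (+ momentum*(w_ij - oldW_ij) when momentum>0)
//weightsToUpdate may alias weights (online BP) or be a separate accumulator (batch)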
void FeedForwardNNTrainer::weightsUpdate(const float * values, const float * weights, float * weightsToUpdate, const float * deltas, const int numOfLayers, const int * layersSize, const int * offsetIns, const int * offsetWeights, const int * offsetDeltas, const float momentum, float * oldWeights, float learningRate){
//loops the layers
if(momentum>0)
for(int i=0;i<numOfLayers-1;i++){
for(int k=0;k<layersSize[i+1];k++){
//efficient code to speed up the backpropagation
float tempLxD=learningRate*deltas[k+offsetDeltas[i]];
int wOffset=k*(layersSize[i]+1)+offsetWeights[i];
int vOffset=offsetIns[i];
for(int j=0;j<layersSize[i]+1;j++){
float auxWeight=weights[j+wOffset];
weightsToUpdate[j+wOffset]+=tempLxD*values[j+vOffset]+momentum*(auxWeight-oldWeights[j+wOffset]);
oldWeights[j+wOffset]=auxWeight;
}
}
}
else
for(int i=0;i<numOfLayers-1;i++){
for(int k=0;k<layersSize[i+1];k++){
//efficient code to speed up the backpropagation
float tempLxD=learningRate*deltas[k+offsetDeltas[i]];
int wOffset=k*(layersSize[i]+1)+offsetWeights[i];
int vOffset=offsetIns[i];
for(int j=0;j<layersSize[i]+1;j++){
weightsToUpdate[j+wOffset]+=tempLxD*values[j+vOffset];
}
}
}
}
//GPU computes all the instances forward of the backpropagation training
void FeedForwardNNTrainer::GPUForward(float * devValues, const float * devWeights, const int * actFuncts, const int numOfLayers, const int * layersSize, const int numOfInstances, const int * offsetIns, const int * offsetWeights, const int * offsetOuts){
//loops the layers
for(int i=0;i<numOfLayers-1;i++){
int ninput = numOfInstances;
int naux = layersSize[i]+1;
int noutput = layersSize[i+1];
const float * devPtrA;
const float * devPtrB;
float * devPtrC;
devPtrA=devValues+offsetIns[i];
devPtrB=devWeights+offsetWeights[i];
devPtrC=devValues+offsetOuts[i];
//does the product of the neurons matrix and the weights matrix
//the weights matrix is row-major so no transposition is necessary
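//shapes (column-major): A=values block (numOfInstances x layersSize[i]+1),
//B=weights block, whose row-major (out x in+1) storage reads column-major as the
//(in+1 x out) matrix needed here, C=A*B (numOfInstances x layersSize[i+1])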
cublasSgemm ('n','n',
ninput, noutput, naux,
1, devPtrA, ninput,
devPtrB, naux,
0, devPtrC, ninput
);
computeActFunct(devPtrC,ninput*noutput,actFuncts[i+1]);
}
}
//GPU computes all the instances backward of the backpropagation training
void FeedForwardNNTrainer::GPUBack(const float * devValues,const float * devWeights,float * devDeltas,const int * actFuncts,const int numOfLayers,const int *layersSize,const int numOfInstances,const int numOfOutputsPerInstance,const float * devTrainingSetOutputs,const int *offsetWeights,const int *offsetDeltas,const int * offsetOuts, const int errorFunc){
//loop layers backwards (from last hidden to inputs)
for(int i=numOfLayers-2;i>=0;i--){
//output layer (different delta rule; no bias neuron is included in this layer's size)
if(i==numOfLayers-2){
computeError(devDeltas+offsetDeltas[i],devTrainingSetOutputs,devValues+offsetOuts[i],numOfInstances*numOfOutputsPerInstance,actFuncts[i+1],errorFunc);
}
//normal hidden layer
else{
int ninput = numOfInstances;
int naux = layersSize[i+2];
int noutput = layersSize[i+1]+1;
const float * devPtrA;
const float * devPtrB;
float * devPtrC;
devPtrA=devDeltas+offsetDeltas[i+1];
devPtrB=devWeights+offsetWeights[i+1];
devPtrC=devDeltas+offsetDeltas[i];
//does the product of the deltas matrix and the weights matrix
//the weights matrix is row-major so it must be transposed to multiply; its leading dimension is noutput
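//shapes (column-major): A=next-layer deltas (numOfInstances x layersSize[i+2]);
//the weights block reads column-major as W^T, and the 't' flag restores W, so
//C = A*W (numOfInstances x layersSize[i+1]+1)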
cublasSgemm ('n','t',
ninput, noutput, naux,
1, devPtrA, ninput,
devPtrB, noutput,
0, devPtrC, ninput
);
}
computeDerivFunct(devDeltas+offsetDeltas[i],devValues+offsetOuts[i],numOfInstances*layersSize[i+1],actFuncts[i+1]);
}
}
//GPU updates the weights for all the instances
void FeedForwardNNTrainer::GPUUpdate(const float * devValues,float * devWeights,const float *devDeltas, const int numOfLayers, const int * layersSize, const int numOfInstances, const int * offsetIns,const int * offsetWeights,const int * offsetDeltas,const float momentum,float * devOldWeights,const float learningRate){
//loops the layers
for(int i=0;i<numOfLayers-1;i++){
int ninput = layersSize[i]+1;
int naux = numOfInstances;
int noutput = layersSize[i+1];
const float * devPtrA;
const float * devPtrB;
float * devPtrC;
devPtrA=devValues+offsetIns[i];
devPtrB=devDeltas+offsetDeltas[i];
devPtrC=devWeights+offsetWeights[i];
if(momentum>0){
//if there's a momentum it updates the weights with a portion of the difference with the old weights
addMomentum(devWeights+offsetWeights[i],devOldWeights+offsetWeights[i],layersSize[i+1]*(layersSize[i]+1),momentum);
}
//does the product of the neurons matrix and the deltas matrix and adds it to the weights matrix
//(scaled by the learning rate and divided by numOfInstances); the neurons matrix is transposed to multiply
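//shapes (column-major): op(A)=values transposed ((layersSize[i]+1) x numOfInstances),
//B=deltas (numOfInstances x layersSize[i+1]); with beta=1 this accumulates
//W += learningRate/numOfInstances * values^T * deltas into the weights block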
cublasSgemm ('t','n',
ninput, noutput, naux,
learningRate/(float)numOfInstances, devPtrA, naux,
devPtrB, naux,
1, devPtrC, ninput
);
}
}
//GPU computes the MSE on a set
float FeedForwardNNTrainer::GPUComputeMSE(float * devValues, const float * devWeights, const int * actFuncts, const int numOfLayers, const int * layersSize, const int numOfInstances, const int numOfOutputsPerInstance,const float * devSetOutputs,const int * offsetIns, const int * offsetWeights, const int * offsetOuts){
//loops the layers
for(int i=0;i<numOfLayers-1;i++){
int ninput = numOfInstances;
int naux = layersSize[i]+1;
int noutput = layersSize[i+1];
const float * devPtrA;
const float * devPtrB;
float * devPtrC;
devPtrA=devValues+offsetIns[i];
devPtrB=devWeights+offsetWeights[i];
devPtrC=devValues+offsetOuts[i];
//does the product of the neurons matrix and the weights matrix
//the weights matrix is row-major so no transposition is necessary
cublasSgemm ('n','n',
ninput, noutput, naux,
1, devPtrA, ninput,
devPtrB, naux,
0, devPtrC, ninput
);
computeActFunct(devPtrC,ninput*noutput,actFuncts[i+1]);
}
return mseError(devSetOutputs,devValues+offsetOuts[numOfLayers-2],numOfInstances*numOfOutputsPerInstance,actFuncts[numOfLayers-1]);
}
//GPU computes the classification percentage on a set
float FeedForwardNNTrainer::GPUclassificatePerc(float * devValues, const float * devWeights, const int * actFuncts, const int numOfLayers, const int * layersSize, const int numOfInstances, const int numOfOutputsPerInstance,float * devSetOutputs,const int * offsetIns, const int * offsetWeights, const int * offsetOuts){
//loops the layers
for(int i=0;i<numOfLayers-1;i++){
int ninput = numOfInstances;
int naux = layersSize[i]+1;
int noutput = layersSize[i+1];
const float * devPtrA;
const float * devPtrB;
float * devPtrC;
devPtrA=devValues+offsetIns[i];
devPtrB=devWeights+offsetWeights[i];
devPtrC=devValues+offsetOuts[i];
//does the product of the neurons matrix and the weights matrix
//the weights matrix is row-major so no transposition is necessary
cublasSgemm ('n','n',
ninput, noutput, naux,
1, devPtrA, ninput,
devPtrB, naux,
0, devPtrC, ninput
);
computeActFunct(devPtrC,ninput*noutput,actFuncts[i+1]);
}
int valuesIndexes[numOfInstances];
int outputIndexes[numOfInstances];
int * devValuesIndexes;
int * devOutputIndexes;
cudaMalloc((void **)&devValuesIndexes,numOfInstances*sizeof(int));
cudaMalloc((void **)&devOutputIndexes,numOfInstances*sizeof(int));
float * tmpTranslate;
cudaMalloc((void **)&tmpTranslate,numOfInstances*numOfOutputsPerInstance*sizeof(float));
//transpose the output neurons matrix from column-major to row-major
translateMatrix(numOfInstances,numOfOutputsPerInstance,devValues+offsetOuts[numOfLayers-2],tmpTranslate);
//and evaluates the max of each row for classification
computeMaxes(numOfInstances,numOfOutputsPerInstance,tmpTranslate,devValuesIndexes);
//transpose the desired outputs matrix from column-major to row-major
translateMatrix(numOfInstances,numOfOutputsPerInstance,devSetOutputs,tmpTranslate);
//and evaluates the max of each row for classification
computeMaxes(numOfInstances,numOfOutputsPerInstance,tmpTranslate,devOutputIndexes);
cudaFree(tmpTranslate);
cudaMemcpy(valuesIndexes,devValuesIndexes,numOfInstances*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(outputIndexes,devOutputIndexes,numOfInstances*sizeof(int),cudaMemcpyDeviceToHost);
//compute the classification rate by comparing the desired class with the net's prediction
int cont=0;
for(int i=0;i<numOfInstances;i++){
if(valuesIndexes[i]==outputIndexes[i])cont++;
}
cudaFree(devValuesIndexes);
cudaFree(devOutputIndexes);
return (float)cont/(float)numOfInstances;
}