Neural network dropout

import random

import numpy as np

import matplotlib.pyplot as plt

import sklearn.metrics as met

import sklearn.preprocessing as pre

import csv

class CA(object):

def __init__(self, sizes=None, drop_rates=None):

self.sizes = sizes

self.num_layers = len(self.sizes)

#the rate of dropping neurons for every layer

self.drop_rates = drop_rates #last layer (output) has no drop, first layer (input) can have drops, e.g. [0.5,0.5,0.5,0]

self.biases = [np.random.randn(i, 1) for i in self.sizes[1:]]

self.weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in zip(self.sizes[:-1], self.sizes[1:])]

def feed_forward(self, x):

''' wx + b calculation, use relu for all layers except the last'''

for b, w in zip(self.biases[:-1], self.weights[:-1]):

x = self.relu(np.dot(w, x)+b)

'''use softmax in the last layer'''

x = self.softmax(np.dot(self.weights[-1], x)+self.biases[-1])

return x

def back_prop(self, x, y, weight_filters, layer_filters):

#layer filters contains #layers -1 elements, going from layer 0 to layer -2

#weight filters contains #layers -1 elements, going from layer 1 to layer -1

#initialize the gradient w.r.t weights and biases for the ordinary layers

nabla_b = [np.zeros(b.shape) for b in self.biases]

nabla_w = [np.zeros(w.shape) for w in self.weights]

zs = [] #the array to keep track of the original wx+b (z values) for each layer after the input layer

activation = x * layer_filters[0] #the input layer's activation value, i.e. the input values

activations = [activation] #the array to keep track of the activation value of each layer.

'''feed forward, ordinary layers '''

#run feed forward first to get the z values and the activation values for all layers except the last oner

for b, w, lf, wf in zip(self.biases[:-1], self.weights[:-1], layer_filters[1:], weight_filters[:-1]):

z = (np.dot(w * wf, activation) + b) * lf #get to the next layer by calculating wx + b.

zs.append(z)

activation = self.relu(z) * lf #activate the neuron, no need to * lf again, but leave it there in case the activation function is changed.

activations.append(activation) #keep track of the output of each layer

'''feed forward, the last layer'''

z = np.dot(self.weights[-1], activation) + self.biases[-1] #no drop out for the last layer so no need to times the weight filter

zs.append(z)

activation = self.softmax(z) #activate the neuron

activations.append(activation)

'''back propagation, last layer'''

#calculate the Error (delta) of the last layer

#refer to negative log loss and softmax

delta = activations[-1] - y

#the gradient w.r.t bias equals delta

nabla_b[-1] = delta

#the gradient w.r.t. weight in layer L = the Error * activation from layer L-1

#delta is a column vector, activation is also a column vector

#delta <dot> activation.transpose yields a matrix that keeps the weights from every neuron in L-1 to every neuron in L.

#the activations[-2] has the dropped neurons zeroed out already

nabla_w[-1] = np.dot(delta, (activations[-2]).transpose())

'''back propagation, other layers till the first'''

#use back propagation to calculate the Error for the previous layers

#Also the biases and weights for the previous layers back to the first layer

#delta_L = (w_L+1)^T <dot> delta_L+1 <elementwise multiplication> activation'(z)

#note the loop accesses array element through negative index

for i in range(2, self.num_layers):

layer = - i #negative index

#exlude the weights of dropped neurons in layer L+1, and zero out the deltas of layer L by timing layer_filter

#layer filters doesn't have a filter for the last layer, so use "layer+1"

delta = np.dot( (self.weights[layer + 1] * weight_filters[layer + 1]).transpose(), delta) * self.relu_derivative(zs[layer]) * layer_filters[layer+1]

nabla_b[layer] = delta

nabla_w[layer] = np.dot(delta, activations[layer-1].transpose())

return (nabla_b, nabla_w)

'''

update the weights w and bias b using a mini batch of training sample

calcualte gradient for each sample (x,y) separately and then average the sum to approximate the actual gradient

'''

def update_mini_batch(self, mini_batch, eta, n, lmbda):

#initialize the gradient of Cost w.r.t. biases and weight

sum_nabla_b = [np.zeros(b.shape) for b in self.biases]

sum_nabla_w = [np.zeros(w.shape) for w in self.weights]

#iterate through every training sample in the mini batch

#calculate the gradient for each of the samples and sum up

for x, y in mini_batch:

#apply drop rate to get a weight filter

#ignore the first input layer so drops_rates starts from 1.

weight_filters =[] #filter on matrix columns. first layer has no weight

layer_filters =[] #indicate which neurons are dropped per layer, last layer has no drop out

#apply random drop out per training sample, not per mini batch

for dr, dr_next, size, size_next in zip(self.drop_rates[:-1], self.drop_rates[1:], self.sizes[:-1], self.sizes[1:]):

weight_col_filter = np.random.rand(size) #a list of random numbers between 0 and 1

weight_col_filter = np.where(weight_col_filter >= dr, 1, 0) #drop dr%, the surving element assigned 1, else 0

weight_row_filter = np.random.rand(size_next)

weight_row_filter = np.where(weight_row_filter >= dr_next, 1, 0) #drop dr%, the surving element assigned 1, else 0

#keep the filters for each layer, except the last layer

layer_filters.append(np.reshape(weight_col_filter, (len(weight_col_filter), 1)))

weight_col_filter = np.reshape(weight_col_filter, (1, len(weight_col_filter))) #reshape as a row

weight_col_filter = np.repeat(weight_col_filter, size_next, axis=0) #repeat the row

weight_row_filter = np.reshape(weight_row_filter, (len(weight_row_filter),1)) #reshape as a column

weight_row_filter = np.repeat(weight_row_filter, size, axis=1) #repeat the column vector

#keep the weight filters (mask)

weight_filters.append(weight_col_filter * weight_row_filter)

#back propagation to calculate the gradient for sample x, y

delta_nabla_b, delta_nabla_w = self.back_prop(x,y, weight_filters, layer_filters)

#sum up the gradient calculated from each training sample and take the average (later) outside of the loop

sum_nabla_b = [nb + dnb for nb, dnb in zip(sum_nabla_b, delta_nabla_b)]

sum_nabla_w = [nw + dnw for nw, dnw in zip(sum_nabla_w, delta_nabla_w)]

#take the average of the gradients of all samples, some people omit averaging it and it's usually ok.

#the average here is considered as an estimate of the actual gradient

avg_nabla_b = [dnb/len(mini_batch) for dnb in sum_nabla_b]

avg_nabla_w = [dnw/len(mini_batch) for dnw in sum_nabla_w]

#when applying drop out, each mini batch is training an independent subnet, but the weights and biases from different subnets will be added up

#so rescale the weights and biases proportationally to a smaller. e.g. drop out = 50% then the weights * 50%

#adjust the weights and bias by moving a delta in the opposite direction of gradient.

#lmbda/n * w is the derivative of the L2 penalty term

#self.weights = [ w - eta * nw - eta * lmbda/n * w for w, nw, dr in zip(self.weights, avg_nabla_w)]

#self.weights = [ w - eta * nw for w, nw in zip(self.weights, avg_nabla_w)] #no penalty term

#rescale the weights due to dropout, note, times the outgoing weight with drop rate, not the incoming weights

self.weights = [ w - eta * nw * (1-dr) - eta * lmbda/n * w *(1-dr) for w, nw, dr in zip(self.weights, avg_nabla_w, self.drop_rates[:-1])]

self.biases = [ b - eta * nb for b, nb in zip(self.biases, avg_nabla_b)]

#rescale the biases, which seems not necessary, note to use drop rate of the current layer

#self.biases = [ (b - eta * nb) * (1-dr) for b, nb, dr in zip(self.biases, avg_nabla_b, self.drop_rates[1:])]

'''

stochastic gradient descent sgd

every round of training uses a random mini batch (subset) of training data.

Pre-divide the training data into mini batches and use them one after another.

once all mini batches have been used, it is said an epoch is done

then shuffle the training data randomly again and start another epoch

the function stops when the specified number of epochs of training have been done.

the test data (if available) is for tracking the performance of each epoch

the testing and printing result slows down the process a lot so comment it out if not needed.

'''

def sgd(self, training_data, epochs, mini_batch_size, eta, lmbda, test_data=None, early_stop_rounds=0):

n = len(training_data)

evals = []

models = []

for i in range(epochs):

#re-shuffle the training data every epoch

random.shuffle(training_data)

#divide the training data set into mini batches as per the mini batch size

mini_batches = [training_data[j:j+mini_batch_size] for j in range(0, len(training_data), mini_batch_size)]

#adjust weights and biases every mini batch

for mini_batch in mini_batches:

#update the weights w and biases b by moving one step 'eta' using a mini batch of data

self.update_mini_batch(mini_batch, eta, n, lmbda)

#print out the current performance if test data available

if test_data:

accuracy = self.evaluate_precision(test_data)

auc = self.evaluate_auc(test_data)

print("Epoch {0}: precision: {1} auc: {2}".format(i, accuracy, auc))

if early_stop_rounds > 0:

evals.append(auc)

models.append((self.weights, self.biases)) #store the model

if len(evals) > early_stop_rounds:

max_eval = max(evals[-early_stop_rounds:]) #the max of the last x rounds

if max_eval <= evals[-early_stop_rounds-1]: #the max is no better than before

best_epoch = i - early_stop_rounds

print('Early stop. Restore to Epoch {0}'.format(best_epoch))

self.weights, self.biases = models[best_epoch]

break

else:

print("Epoch {0} done".format(i))

def relu(self, z):

#return np.where(z > 0, z, 0)

return np.maximum(0, z)

def relu_derivative(self, z):

#return np.where(z > 0, 1, 0)

return np.greater(z, 0).astype(int)

'''

the activation for the last layer

The C is for avoiding overflow

'''

def softmax(self, z):

C = np.max(z)

expz = np.exp(z - C)

return expz / np.sum(expz)

'''

test the current neural network with test_data and returns the % of correct predictions

'''

def evaluate_precision(self, test_data):

#return the index of the largest output value for each test sample

#the index is from 0 to 9 so index represents the actual recognised number

test_results = [(np.argmax(self.feed_forward(x)), np.argmax(y)) for (x,y) in test_data]

#return the % of correct predictions

return sum(int(x==y) for (x,y) in test_results) / float(len(test_data))

def evaluate_auc(self, test_data):

test_scores = [self.feed_forward(x)[1,0] for (x,y) in test_data] #the score of being 1

y = [np.argmax(y) for (x,y) in test_data]

return met.roc_auc_score(y, test_scores, average='macro')

def vectorize(y, length):

vector = np.zeros((length,1))

vector[y] = 1.0

return vector

print('========== start running main method ==========')

#load the training data from csv file

Train_Data_Path = r'C:\Python\training data backup\training data.csv'

f = open(Train_Data_Path, 'r')

reader = csv.reader(f)

data = list(reader)

tra = data[1:]

tra_array = np.array(tra)

f.close()

input_neurons = tra_array.shape[1] - 2 #the first two columns are id and label

#split the data from the label

y = [vectorize(y, 2) for y in tra_array[:,1].astype(int)]

X = tra_array[:, 2:].astype(float)

'''standard scaler, used later for test data as well'''

scaler = pre.StandardScaler().fit(X)

X = scaler.transform(X) #standarization

X = [x.reshape((input_neurons,1)) for x in X] #transform to np array

train = list(zip(X, y))

#load the validation data from csv file

Test_Data_Path = r'C:\Python\training data backup\validation data.csv'

f = open(Test_Data_Path, 'r')

reader = csv.reader(f)

data = list(reader)

val = data[1:]

val_array = np.array(val)

columns = data[0][2:] #column names, the 1st and 2nd are id and label

columns_array = np.array(columns).astype(str)

f.close()

#split the data from the label

f = [vectorize(f, 2) for f in val_array[:,1].astype(int)]

Z = scaler.transform(val_array[:, 2:].astype(float)) #standarization

Z = [z.reshape((input_neurons,1)) for z in Z]

test = list(zip(Z, f))

print('========== finish loading data ==========')

print('========== start training ==========')

ca = CA(sizes = [size_first_layer, 1000, 300, 10, 2], drop_rates=[0.8, 0.3, 0.3, 0.1, 0])

ca.sgd(train, epochs=100, mini_batch_size=20, eta=0.02, lmbda = 0.1, test_data = test, early_stop_rounds=10)

print('========== training is done ==========')

Page updated

Google Sites

Report abuse