import random
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as met
import sklearn.preprocessing as pre
import csv
class CA(object):
def __init__(self, sizes=None, drop_rates=None):
self.sizes = sizes
self.num_layers = len(self.sizes)
#the rate of dropping neurons for every layer
self.drop_rates = drop_rates #last layer (output) has no drop, first layer (input) can have drops, e.g. [0.5,0.5,0.5,0]
self.biases = [np.random.randn(i, 1) for i in self.sizes[1:]]
self.weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in zip(self.sizes[:-1], self.sizes[1:])]
def feed_forward(self, x):
''' wx + b calculation, use relu for all layers except the last'''
for b, w in zip(self.biases[:-1], self.weights[:-1]):
x = self.relu(np.dot(w, x)+b)
'''use softmax in the last layer'''
x = self.softmax(np.dot(self.weights[-1], x)+self.biases[-1])
return x
def back_prop(self, x, y, weight_filters, layer_filters):
#layer filters contains #layers -1 elements, going from layer 0 to layer -2
#weight filters contains #layers -1 elements, going from layer 1 to layer -1
#initialize the gradient w.r.t weights and biases for the ordinary layers
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
zs = [] #the array to keep track of the original wx+b (z values) for each layer after the input layer
activation = x * layer_filters[0] #the input layer's activation value, i.e. the input values
activations = [activation] #the array to keep track of the activation value of each layer.
'''feed forward, ordinary layers '''
#run feed forward first to get the z values and the activation values for all layers except the last oner
for b, w, lf, wf in zip(self.biases[:-1], self.weights[:-1], layer_filters[1:], weight_filters[:-1]):
z = (np.dot(w * wf, activation) + b) * lf #get to the next layer by calculating wx + b.
zs.append(z)
activation = self.relu(z) * lf #activate the neuron, no need to * lf again, but leave it there in case the activation function is changed.
activations.append(activation) #keep track of the output of each layer
'''feed forward, the last layer'''
z = np.dot(self.weights[-1], activation) + self.biases[-1] #no drop out for the last layer so no need to times the weight filter
zs.append(z)
activation = self.softmax(z) #activate the neuron
activations.append(activation)
'''back propagation, last layer'''
#calculate the Error (delta) of the last layer
#refer to negative log loss and softmax
delta = activations[-1] - y
#the gradient w.r.t bias equals delta
nabla_b[-1] = delta
#the gradient w.r.t. weight in layer L = the Error * activation from layer L-1
#delta is a column vector, activation is also a column vector
#delta <dot> activation.transpose yields a matrix that keeps the weights from every neuron in L-1 to every neuron in L.
#the activations[-2] has the dropped neurons zeroed out already
nabla_w[-1] = np.dot(delta, (activations[-2]).transpose())
'''back propagation, other layers till the first'''
#use back propagation to calculate the Error for the previous layers
#Also the biases and weights for the previous layers back to the first layer
#delta_L = (w_L+1)^T <dot> delta_L+1 <elementwise multiplication> activation'(z)
#note the loop accesses array element through negative index
for i in range(2, self.num_layers):
layer = - i #negative index
#exlude the weights of dropped neurons in layer L+1, and zero out the deltas of layer L by timing layer_filter
#layer filters doesn't have a filter for the last layer, so use "layer+1"
delta = np.dot( (self.weights[layer + 1] * weight_filters[layer + 1]).transpose(), delta) * self.relu_derivative(zs[layer]) * layer_filters[layer+1]
nabla_b[layer] = delta
nabla_w[layer] = np.dot(delta, activations[layer-1].transpose())
return (nabla_b, nabla_w)
'''
update the weights w and bias b using a mini batch of training sample
calcualte gradient for each sample (x,y) separately and then average the sum to approximate the actual gradient
'''
def update_mini_batch(self, mini_batch, eta, n, lmbda):
#initialize the gradient of Cost w.r.t. biases and weight
sum_nabla_b = [np.zeros(b.shape) for b in self.biases]
sum_nabla_w = [np.zeros(w.shape) for w in self.weights]
#iterate through every training sample in the mini batch
#calculate the gradient for each of the samples and sum up
for x, y in mini_batch:
#apply drop rate to get a weight filter
#ignore the first input layer so drops_rates starts from 1.
weight_filters =[] #filter on matrix columns. first layer has no weight
layer_filters =[] #indicate which neurons are dropped per layer, last layer has no drop out
#apply random drop out per training sample, not per mini batch
for dr, dr_next, size, size_next in zip(self.drop_rates[:-1], self.drop_rates[1:], self.sizes[:-1], self.sizes[1:]):
weight_col_filter = np.random.rand(size) #a list of random numbers between 0 and 1
weight_col_filter = np.where(weight_col_filter >= dr, 1, 0) #drop dr%, the surving element assigned 1, else 0
weight_row_filter = np.random.rand(size_next)
weight_row_filter = np.where(weight_row_filter >= dr_next, 1, 0) #drop dr%, the surving element assigned 1, else 0
#keep the filters for each layer, except the last layer
layer_filters.append(np.reshape(weight_col_filter, (len(weight_col_filter), 1)))
weight_col_filter = np.reshape(weight_col_filter, (1, len(weight_col_filter))) #reshape as a row
weight_col_filter = np.repeat(weight_col_filter, size_next, axis=0) #repeat the row
weight_row_filter = np.reshape(weight_row_filter, (len(weight_row_filter),1)) #reshape as a column
weight_row_filter = np.repeat(weight_row_filter, size, axis=1) #repeat the column vector
#keep the weight filters (mask)
weight_filters.append(weight_col_filter * weight_row_filter)
#back propagation to calculate the gradient for sample x, y
delta_nabla_b, delta_nabla_w = self.back_prop(x,y, weight_filters, layer_filters)
#sum up the gradient calculated from each training sample and take the average (later) outside of the loop
sum_nabla_b = [nb + dnb for nb, dnb in zip(sum_nabla_b, delta_nabla_b)]
sum_nabla_w = [nw + dnw for nw, dnw in zip(sum_nabla_w, delta_nabla_w)]
#take the average of the gradients of all samples, some people omit averaging it and it's usually ok.
#the average here is considered as an estimate of the actual gradient
avg_nabla_b = [dnb/len(mini_batch) for dnb in sum_nabla_b]
avg_nabla_w = [dnw/len(mini_batch) for dnw in sum_nabla_w]
#when applying drop out, each mini batch is training an independent subnet, but the weights and biases from different subnets will be added up
#so rescale the weights and biases proportationally to a smaller. e.g. drop out = 50% then the weights * 50%
#adjust the weights and bias by moving a delta in the opposite direction of gradient.
#lmbda/n * w is the derivative of the L2 penalty term
#self.weights = [ w - eta * nw - eta * lmbda/n * w for w, nw, dr in zip(self.weights, avg_nabla_w)]
#self.weights = [ w - eta * nw for w, nw in zip(self.weights, avg_nabla_w)] #no penalty term
#rescale the weights due to dropout, note, times the outgoing weight with drop rate, not the incoming weights
self.weights = [ w - eta * nw * (1-dr) - eta * lmbda/n * w *(1-dr) for w, nw, dr in zip(self.weights, avg_nabla_w, self.drop_rates[:-1])]
self.biases = [ b - eta * nb for b, nb in zip(self.biases, avg_nabla_b)]
#rescale the biases, which seems not necessary, note to use drop rate of the current layer
#self.biases = [ (b - eta * nb) * (1-dr) for b, nb, dr in zip(self.biases, avg_nabla_b, self.drop_rates[1:])]
'''
stochastic gradient descent sgd
every round of training uses a random mini batch (subset) of training data.
Pre-divide the training data into mini batches and use them one after another.
once all mini batches have been used, it is said an epoch is done
then shuffle the training data randomly again and start another epoch
the function stops when the specified number of epochs of training have been done.
the test data (if available) is for tracking the performance of each epoch
the testing and printing result slows down the process a lot so comment it out if not needed.
'''
def sgd(self, training_data, epochs, mini_batch_size, eta, lmbda, test_data=None, early_stop_rounds=0):
n = len(training_data)
evals = []
models = []
for i in range(epochs):
#re-shuffle the training data every epoch
random.shuffle(training_data)
#divide the training data set into mini batches as per the mini batch size
mini_batches = [training_data[j:j+mini_batch_size] for j in range(0, len(training_data), mini_batch_size)]
#adjust weights and biases every mini batch
for mini_batch in mini_batches:
#update the weights w and biases b by moving one step 'eta' using a mini batch of data
self.update_mini_batch(mini_batch, eta, n, lmbda)
#print out the current performance if test data available
if test_data:
accuracy = self.evaluate_precision(test_data)
auc = self.evaluate_auc(test_data)
print("Epoch {0}: precision: {1} auc: {2}".format(i, accuracy, auc))
if early_stop_rounds > 0:
evals.append(auc)
models.append((self.weights, self.biases)) #store the model
if len(evals) > early_stop_rounds:
max_eval = max(evals[-early_stop_rounds:]) #the max of the last x rounds
if max_eval <= evals[-early_stop_rounds-1]: #the max is no better than before
best_epoch = i - early_stop_rounds
print('Early stop. Restore to Epoch {0}'.format(best_epoch))
self.weights, self.biases = models[best_epoch]
break
else:
print("Epoch {0} done".format(i))
def relu(self, z):
#return np.where(z > 0, z, 0)
return np.maximum(0, z)
def relu_derivative(self, z):
#return np.where(z > 0, 1, 0)
return np.greater(z, 0).astype(int)
'''
the activation for the last layer
The C is for avoiding overflow
'''
def softmax(self, z):
C = np.max(z)
expz = np.exp(z - C)
return expz / np.sum(expz)
'''
test the current neural network with test_data and returns the % of correct predictions
'''
def evaluate_precision(self, test_data):
#return the index of the largest output value for each test sample
#the index is from 0 to 9 so index represents the actual recognised number
test_results = [(np.argmax(self.feed_forward(x)), np.argmax(y)) for (x,y) in test_data]
#return the % of correct predictions
return sum(int(x==y) for (x,y) in test_results) / float(len(test_data))
def evaluate_auc(self, test_data):
test_scores = [self.feed_forward(x)[1,0] for (x,y) in test_data] #the score of being 1
y = [np.argmax(y) for (x,y) in test_data]
return met.roc_auc_score(y, test_scores, average='macro')
def vectorize(y, length):
vector = np.zeros((length,1))
vector[y] = 1.0
return vector
print('========== start running main method ==========')
#load the training data from csv file
Train_Data_Path = r'C:\Python\training data backup\training data.csv'
f = open(Train_Data_Path, 'r')
reader = csv.reader(f)
data = list(reader)
tra = data[1:]
tra_array = np.array(tra)
f.close()
input_neurons = tra_array.shape[1] - 2 #the first two columns are id and label
#split the data from the label
y = [vectorize(y, 2) for y in tra_array[:,1].astype(int)]
X = tra_array[:, 2:].astype(float)
'''standard scaler, used later for test data as well'''
scaler = pre.StandardScaler().fit(X)
X = scaler.transform(X) #standarization
X = [x.reshape((input_neurons,1)) for x in X] #transform to np array
train = list(zip(X, y))
#load the validation data from csv file
Test_Data_Path = r'C:\Python\training data backup\validation data.csv'
f = open(Test_Data_Path, 'r')
reader = csv.reader(f)
data = list(reader)
val = data[1:]
val_array = np.array(val)
columns = data[0][2:] #column names, the 1st and 2nd are id and label
columns_array = np.array(columns).astype(str)
f.close()
#split the data from the label
f = [vectorize(f, 2) for f in val_array[:,1].astype(int)]
Z = scaler.transform(val_array[:, 2:].astype(float)) #standarization
Z = [z.reshape((input_neurons,1)) for z in Z]
test = list(zip(Z, f))
print('========== finish loading data ==========')
print('========== start training ==========')
ca = CA(sizes = [size_first_layer, 1000, 300, 10, 2], drop_rates=[0.8, 0.3, 0.3, 0.1, 0])
ca.sgd(train, epochs=100, mini_batch_size=20, eta=0.02, lmbda = 0.1, test_data = test, early_stop_rounds=10)
print('========== training is done ==========')