# CNN with dropout

"""

Created on 4/5/2019

@author: Ben C

This script trains a convolutional neural network for handwritten digit recognition.

The NumPy and SciPy libraries are used for the matrix calculations.

The layers are implemented from scratch. No neural network library is used.

The training dataset is MNIST (https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz)

The network uses one convolution layer, one max pooling layer, one hidden layer and an output layer.

Dropout and L2 regularization are applied to reduce overfitting.

Dropout is simulated by applying a filter (mask) to the weight matrix.

The dropped neurons' rows and columns in the matrix are zeroed.
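
For illustration only, a minimal sketch with assumed toy shapes (not code from this script):

    import numpy as np
    mask = np.array([[1], [0], [1]])   # the neuron at index 1 is dropped
    w_in = np.ones((3, 4))             # weights into the 3-neuron layer
    w_out = np.ones((5, 3))            # weights out of the 3-neuron layer
    w_in * mask                        # broadcasting zeroes row 1 (the dropped neuron's incoming weights)
    w_out * mask.T                     # broadcasting zeroes column 1 (the dropped neuron's outgoing weights)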

The architecture: a convolution layer with 5 x 5 kernels,

followed by a 2 x 2 max pooling layer,

with rectified linear unit (ReLU) activation applied to the max pooling output,

then a hidden layer that also uses the ReLU activation function,

and a softmax layer as the output.

Cost function = cross entropy (i.e. negative log likelihood)

Weights are initialized from N(0, 1/n), where n is the number of inputs to the layer.

Hyperparameters are hardcoded in the script.

Each training epoch takes about 15 minutes on an i7 CPU laptop.

Only a few epochs are needed to reach an accuracy above 95%.

"""

import random

import numpy as np

from scipy.ndimage import convolve

from skimage.measure import block_reduce

import pickle

import gzip

#import matplotlib.pyplot as plt

#import sys

import os

from datetime import datetime

class MNISTLearner(object):

    def __init__(self):

        

        '''

        the input is 28 x 28 = 784 pixels (fed directly into the convolution layer)

        the output from the convolution layer is 24 x 24 = 576 per kernel

        

        the 576 input feed directly into a 2x2 max pooling layer 

        the output from max pooling are 12 x 12 = 144

        Apply Rectified Linear Unit (Relu) on the max pooling output

        This may differ from other implementations, which apply ReLU to the convolution output

        

        the 12x12 neurons from the max pooling output feed straight into the next layer of 30 neurons

        Consider it as a dummy layer to hold the 144 neurons (logical)

        

        A hidden layer of 30 neurons

        

        A soft max layer of 10 neurons (representing 0-9)

        

        So layers are:

         28x28 x kernels -> 24x24 x kernels | 24x24 -> 12x12 (a logical layer of 12x12) | 30 | 10

        '''

        

        '''the first layer - convolutional layer'''

        #kernel shape

        self.kernel_shape = (5, 5)

        self.maxpool_shape = (2, 2)

        #the number of feature maps, i.e. the number of kernels

        self.num_kernels = 10 #20

        #the weights of the 5 x 5 kernels

        self.kernel_weights = [np.random.randn(self.kernel_shape[0], self.kernel_shape[1]) for i in range(self.num_kernels)]

        #one bias per kernel, shared across its feature map

        self.kernel_bias = [np.random.randn() for i in range(self.num_kernels)]

        

        '''the second layer - 2x2 max pooling layer'''

        #the max pooling layer simply receives 24x24=576 values from the convolution output

        #and map the 576 neurons to an output of 12x12=144 

        #apply relu on the 144 outputs

        #this is only logical, no coding

               

        '''the layers after the max pooling layer'''

        #the 144 neurons output from max pooling fully connect to the next layer

        #Consider a traditional NN starts from the 144 neurons

        #The calculation of the convolutional and max pooling layers' errors & derivatives differs from that of traditional networks

        self.sizes = [144 * self.num_kernels, 30, 10] 

        self.num_ordinary_layers = len(self.sizes) #call these the ordinary layers (informal term)

        self.drop_rates = [0, 0.5, 0] #dropout rate for the layers, no dropout for the input and output

        

        #bias for the hidden and output layers

        #randn generates random numbers from normal distribution with the shape (i, 1), i.e. column vector    

        #this doesn't include the convolutional and maxpooling layers

        self.biases = [np.random.randn(i, 1) for i in self.sizes[1:]]               

          

        #a list of matrices for the 2nd layer and after; this doesn't include the convolutional and maxpooling layers

        #weights = [weights to layer2, weights layer3, ... weights to last layer]

        #the first layer has no weights as it's only an input

        self.weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in zip(self.sizes[:-1], self.sizes[1:])]

        #more explanation:

        #each weight matrix has a shape (k, j)

        #where j is the number of neurons in layer L-1, and k is the number of neurons in layer L (current layer)

        #matrix[k][j] is the weight of link from neuron j in the previous layer to neuron k in the current layer

        #in this way, given input data X as a column vector

        #then matrix <dot> X + bias is the output of the current layer, where X is the output of the previous layer

        # i.e. y = weights * x + b, this mathematically looks nice. 
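        #e.g. with sizes = [1440, 30, 10] (144 * num_kernels = 1440 for num_kernels = 10),
        #the weight matrices have shapes (30, 1440) and (10, 30)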

        #initial weights:

        #initialize weights as N(0,1/n) where n is number of neurons in the previous layer

        #this initializes the weights so that the hidden layers do not saturate

        #the variance is 1/n; since var = E[(x - E(x))^2] and E(x) = 0, the standard deviation (the typical weight scale) is sqrt(1/n)
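        #e.g. for the first weight matrix here, n = 144 * num_kernels = 1440, so each weight is drawn with std sqrt(1/1440) ≈ 0.026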

   

    '''

    Feed forward: input an image and pass it through the convolutional layer, max pooling and the subsequent matrix multiplications

    to get the activation value of the last layer. Loop through all layers.

    image is a 28 x 28 np array

    The image, convolutional layer and max pooling layer use matrices, but the ordinary layers use column vectors.

    '''

    def feed_forward(self, image):

        '''the first layer, convolutional layer, which takes in the image directly'''

        #the convolve function conducts the convolution operation

        #for areas that do not completely overlap with the kernel, convolve() still yields a value

        #keep only the overlapped area in the output

        outs = [convolve(image, w) + b for w, b in zip(self.kernel_weights, self.kernel_bias)]

        overlap = np.array(image.shape) - np.array(self.kernel_shape) + 1

        outs = [out[:overlap[0], :overlap[1]] for out in outs] # trim the area that is not completely overlapped with the kernel
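        #e.g. a 28x28 image with a 5x5 kernel gives overlap = 28 - 5 + 1 = 24, so each trimmed output is 24x24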

        #!no activation operation here, leave it to the maxpooling layer!

        

        '''the second layer, 2x2 max pooling'''

        outs = np.array([block_reduce(o, (2,2), np.max) for o in outs]) #pick the max value out of every 2x2 square
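        #e.g. block_reduce(np.array([[1, 2], [3, 4]]), (2, 2), np.max) returns array([[4]]), so a 24x24 map becomes 12x12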

        x = outs.reshape((outs.size,1)) #reshaped as a column vector, the output is passed onto the next ordinary layer 

        x = self.relu(x)  #apply the activation function to the max pooling's output

        

        '''continue wx + b calculation, use relu for all ordinary layers except the last'''

        for b, w in zip(self.biases[:-1], self.weights[:-1]):

            x = self.relu(np.dot(w, x)+b)

            

        '''use softmax in the last layer, so result resembles probability'''

        x = self.softmax(np.dot(self.weights[-1], x)+self.biases[-1])

        

        return x

       

    '''

    back propagation

    calculate the gradient with respect to weights and biases for every training sample X, y

    the current weights, biases and training samples define the error for each neuron

    the error further defines the gradients

    the weight_filter and layer_filter are for excluding neurons, weights, biases that are randomly dropped out.

    e.g. weight = [1,2,3,4,5]^T and weight_filter = [0,1,1,0,0]^T, then weight * weight_filter means the 1st, 4th and 5th elements are dropped out

    weight filter applies on the second layer to the last layer

    layer_filter applies on the second layer to the last layer 

    '''   

    def back_prop(self, image, y, weight_filter, layer_filter):

        #temp variables for keeping z values and activation values

        zs = []   

        activations = []

        

        #initialize the gradient w.r.t weights and biases for the ordinary layers

        nabla_b = [np.zeros(b.shape) for b in self.biases]

        nabla_w = [np.zeros(w.shape) for w in self.weights] 

           

        '''feed forward, convolutional layer and max pooling layer'''      

        #input layer & also the convolutional layer

        #apply convolution on the original image         

        outs = [convolve(image, w) + b for w, b in zip(self.kernel_weights, self.kernel_bias)]

        overlap = np.array(image.shape) - np.array(self.kernel_shape) + 1

        conv_outs = [out[:overlap[0], :overlap[1]] for out in outs] #trim the area that is not completely overlapped with the kernel

                      

        #max pooling layer

        maxpool_outs = np.array([block_reduce(out, (2,2), np.max) for out in conv_outs]) #pick the max value out of every 2x2 square

        z = maxpool_outs.reshape((maxpool_outs.size,1)) #reshaped as a column vector

        zs.append(z) #keep the reduced matrix

        activation = self.relu(z)  #apply the activation function to the max pooling's output

        activations.append(activation)        

        

        '''feed forward, ordinary layers '''      

        #run feed forward first to get the z values and the activation values for all layers except the last one

        for b,w, lf, wf in zip(self.biases[:-1], self.weights[:-1], layer_filter[:-1], weight_filter[:-1]):

            z = (np.dot(w * wf, activation) + b) * lf #get to the next layer by calculating wx + b.

            zs.append(z)           

            activation = self.relu(z) * lf   #activate the neuron

            activations.append(activation) #keep track of the output of each layer

            

        '''feed forward, the last layer'''

        z = np.dot(self.weights[-1], activation) + self.biases[-1] #no drop out for the last layer

        zs.append(z)

        activation = self.softmax(z)   #activate the neuron

        activations.append(activation)

        

      

        '''back propagation, last layer'''         

        #calculate the Error (delta) of the last layer    

        #refer to negative log loss and softmax formulas

        delta = activations[-1] - y     

                

        #the gradient w.r.t bias equals delta

        nabla_b[-1] = delta

        

        #the gradient w.r.t. weight in layer L = the Error * activation from layer L-1

        #delta is a column vector, activation is also a column vector

        #delta <dot> activation.transpose yields a matrix holding the gradient for the weight of every link from a neuron in L-1 to a neuron in L.

        #the activations[-2] has the dropped neurons zeroed out already, so no need to apply filters

        nabla_w[-1] = np.dot(delta, (activations[-2]).transpose())  

            

        '''back propagation, ordinary layers before the last'''  

        #use back propagation to calculate the Error for the previous layers

        #also the biases and weights for the previous layers back to the first ordinary layer (after maxpooling)

        #delta_L = (w_{L+1})^T <dot> delta_{L+1} <elementwise product> activation'(z_L)

        #!note the loop accesses array element through negative index!

        for i in range(2, self.num_ordinary_layers):

            layer = - i #negative index

            #exclude the weights of dropped neurons in layer L+1, and zero out the deltas of layer L by multiplying by layer_filter

            delta = np.dot( (self.weights[layer + 1] * weight_filter[layer + 1]).transpose(), delta) * self.relu_prime(zs[layer]) * layer_filter[layer]

            nabla_b[layer] = delta

            nabla_w[layer] = np.dot(delta, activations[layer-1].transpose())            

        #!!here we have the derivatives w.r.t. weights of all links going from the 144 neurons (output from maxp) to the 30 neurons (next ordinary layer)       

        

        

        '''back propagation, max pooling layer'''

        #Propagate the error (delta) back to the max pooling layer

        #the links into max pooling are different from the links into an ordinary layer

        #so no need to calculate nabla_b or nabla_w for max pooling here; leave it to the convolutional layer

        delta = np.dot(self.weights[0].transpose(), delta) * self.relu_prime(zs[0])

        

        #reshape the max pooling layer's delta vector to the pooling output shape (num_kernels, 12, 12)

        #note these are for the output neurons of max pooling

        delta = delta.reshape(maxpool_outs.shape) 

        

        #calculate the error of maxpool's input neurons, 24 x 24      

        delta_maxp = np.array([np.zeros(co.shape) for co in conv_outs]) #the output from convolution, i.e. the input to max pooling, initialized as zeros

        for k in range(delta.shape[0]):

            for i in range(delta.shape[1]):

                for j in range(delta.shape[2]):

                    #in maxp layer, the 4 neurons 2i+p, 2j+q (p, q in 0,1) map to one output neuron i, j

                    #get the index p,q of the max neuron in a 2x2 area

                    neurons2x2 = conv_outs[k][2*i:2*i+2,2*j:2*j+2] #get the corresponding 2x2 region in maxpool input, i.e. convolution output

                    max_pq = np.unravel_index(np.argmax(neurons2x2), neurons2x2.shape) #get the position (p, q) of the max value in the 2x2 region                
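                    #illustrative example: for neurons2x2 = [[0.1, 0.9], [0.3, 0.2]], argmax is 1 and unravel_index gives (p, q) = (0, 1)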

                    delta_maxp[k, 2*i + max_pq[0], 2*j + max_pq[1]] = delta[k, i, j]  #only the max neuron is passed on with the previous error, other neurons remain zero

        delta_maxp = [dm for dm in delta_maxp] #convert to a list

                

        ''' use the max pooling layer's Error (delta) to calculate the gradients of the kernel weights and biases for the convolutional layer '''

        #derive the kernel_nabla_b 

        kernel_nabla_b = [np.sum(dm) for dm in delta_maxp]

        

        #derive the kernel_nabla_w

        #!!this is the tricky part, refer to formulas for the convolutional layer !!       

        kernel_nabla_w = [convolve(image, dm) for dm in delta_maxp]

        overlap = np.array(image.shape) - np.array(delta_maxp[0].shape) + 1

        kernel_nabla_w = [kw[:overlap[0], :overlap[1]] for kw in kernel_nabla_w]
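        #shape check with the defaults above: image 28x28 and delta_maxp 24x24 give overlap = 28 - 24 + 1 = 5,
        #so each kernel gradient is 5x5, matching kernel_shape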

                          

        return (nabla_b, nabla_w, kernel_nabla_b, kernel_nabla_w)

    

    '''

    update the weights w and bias b using a mini batch of training samples

    calculate gradient for each sample (x,y) separately and then average the sum to approximate the actual gradient

    '''

    def update_mini_batch(self, mini_batch, eta, n, lmbda):

        #initialize the gradient of Cost w.r.t. biases and weight

        sum_nabla_b = [np.zeros(b.shape) for b in self.biases] 

        sum_nabla_w = [np.zeros(w.shape) for w in self.weights]

        sum_kernel_nabla_b =[0 for b in self.kernel_bias]

        sum_kernel_nabla_w =[np.zeros(w.shape) for w in self.kernel_weights]

        

        #iterate through every training sample in the mini batch

        #calculate the gradient for each of the samples and sum up

        for x, y in mini_batch: 

            #apply drop rate to get a weight filter

            #ignore the input layer, so drop_rates starts from index 1.

            weight_filter = [] #masks applied to the weight matrices; the first layer has no weights

            layer_filter =[] #indicate which neurons are dropped per layer

            for dr, w, b in zip(self.drop_rates[1:], self.weights, self.biases):

                filter_rows = np.random.rand(w.shape[0]) #a list of random numbers between 0 and 1                                           

                filter_rows = np.where(filter_rows >= dr, 1, 0) #drop a fraction dr of the rows; a surviving element is assigned 1, else 0

                filter_rows = filter_rows.reshape((len(filter_rows),1)) #convert to a column vector               

                layer_filter.append(filter_rows)

                weight_row_filter = np.repeat(filter_rows, w.shape[1], axis=1) #repeat the column vector so only the surviving row's columns have 1, else 0
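                #illustrative example: filter_rows = [[1], [0], [1]] and w.shape[1] = 2 give
                #weight_row_filter = [[1, 1], [0, 0], [1, 1]], i.e. the dropped neuron's row is all zeros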

                if len(layer_filter)>1 : #if there is a previous filter

                    #the previous weight's row filter is the current weight's column filter, 

                    #as a dropped neuron affects the incoming and outgoing weights at rows and columns respectively

                    filter_cols = layer_filter[-2].reshape((1,len(layer_filter[-2]))) #convert back to a matrix with one row [[1,2,3...]]; note this is not a flattened 1-D array [1,2,3...]

                    weight_col_filter = np.repeat(filter_cols, w.shape[0], axis=0) #repeat the row 

                    weight_filter.append(weight_row_filter * weight_col_filter) #combine the two filters to form a matrix mask  

                else:

                    weight_filter.append(weight_row_filter) #there is no previous filter               

            #back propagation to calculate the gradient for sample x, y

            delta_nabla_b, delta_nabla_w, delta_kernel_nabla_b, delta_kernel_nabla_w = self.back_prop(x,y, weight_filter, layer_filter)

            

            #sum up the gradient calculated from each training sample and take the average (later) outside of the loop

            sum_nabla_b = [nb + dnb for nb, dnb in zip(sum_nabla_b, delta_nabla_b)] 

            sum_nabla_w = [nw + dnw for nw, dnw in zip(sum_nabla_w, delta_nabla_w)] 

            sum_kernel_nabla_b = [nb + dnb for nb, dnb in zip(sum_kernel_nabla_b, delta_kernel_nabla_b)]

            sum_kernel_nabla_w = [nw + dnw for nw, dnw in zip(sum_kernel_nabla_w, delta_kernel_nabla_w)]      

                         

        #take the average of the gradients of all samples.

        #the average here is considered as an estimate of the actual gradient

        avg_nabla_b = [dnb/len(mini_batch) for dnb in sum_nabla_b]

        avg_nabla_w = [dnw/len(mini_batch) for dnw in sum_nabla_w]      

        avg_kernel_nabla_b = [dnb/len(mini_batch) for dnb in sum_kernel_nabla_b]

        avg_kernel_nabla_w = [dnw/len(mini_batch) for dnw in sum_kernel_nabla_w]

        

        #adjust the weights and bias by moving a delta in the opposite direction of gradient.

        #lmbda/n * w is the derivative of the L2 penalty term

        #rescale the weights due to dropout; note that the outgoing weights are scaled by (1 - drop rate), not the incoming weights

        self.weights = [ w - eta * nw * (1-dr) - eta * lmbda/n * w *(1-dr)   for w, nw, dr in zip(self.weights, avg_nabla_w, self.drop_rates[:-1])]        

        self.biases = [ b - eta * nb for b, nb in zip(self.biases, avg_nabla_b)]       

        #!rescaling the biases does not seem necessary!

        

        #adjust kernel weights separately

        self.kernel_weights = [w - eta * nw - eta * lmbda/n * w for w, nw in zip(self.kernel_weights,avg_kernel_nabla_w)]

        self.kernel_bias = [b - eta * nb for b, nb in zip(self.kernel_bias, avg_kernel_nabla_b)]

        

        #print(self.kernel_weights)

    

    '''

    stochastic gradient descent sgd

    every round of training uses a random mini batch (subset) of training data.

    pre-divide the training data into mini batches and use them all.

    once all mini batches have been used, an epoch is done

    then shuffle the training data randomly and start another epoch

    the function stops when the specified number of training epochs has been reached, or when performance stops improving (todo).

    the test data is for tracking the performance of each epoch

    '''

    def sgd(self, training_data, epochs, mini_batch_size, eta, lmbda, test_data=None):      

        n = len(training_data)

           

        for i in range(epochs):

            #re-shuffle the training data every epoch

            random.shuffle(training_data) 

            #divide the training data set into mini batches as per the mini batch size

            mini_batches = [training_data[j:j+mini_batch_size] for j in range(0, len(training_data), mini_batch_size)]       
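            #e.g. the 50,000-sample MNIST training set with mini_batch_size = 20 yields 2,500 mini batches per epoch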

          

            #adjust weights and biases every mini batch

            for mini_batch in mini_batches:

                #update the weights w and biases b by moving one step 'eta' using a mini batch of data

                self.update_mini_batch(mini_batch, eta, n, lmbda) 

                

            #print out the current performance if test data available              

            if test_data:

                accuracy = self.evaluate(test_data)

                print("Epoch {0}: {1}     {2}".format(i, accuracy, datetime.now()))               

            else:

                print("Epoch {0} done".format(i))

                       

    

    def relu(self, z):

        return np.where(z > 0, z, 0)

    

    def relu_prime(self, z):

        return np.where(z > 0, 1, 0)

    

    '''

    the activation for the last layer  

    Subtracting C (the maximum of z) avoids overflow in exp()

    '''
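    #illustrative example (not used by the network): for z = [[1000.0], [1001.0]], np.exp(z) overflows,
    #but np.exp(z - np.max(z)) = [[exp(-1)], [1.0]] and the resulting softmax values are mathematically unchanged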

    def softmax(self, z):

        C = np.max(z)

        expz = np.exp(z - C)

        return expz / np.sum(expz)   

       

    '''

    test the current neural network with test_data and return the fraction of correct predictions

    '''

    def evaluate(self, test_data):

        #return the index of the largest output value for each test sample

        #the index is from 0 to 9 so index represents the actual recognised number

        test_results = [(np.argmax(self.feed_forward(x)), np.argmax(y)) for (x,y) in test_data]

        #return the fraction of correct predictions

        return sum(int(x==y) for (x,y) in test_results) / float(len(test_data))

        

def main():

    print('========== start loading data ==========')

    #load data from zip file

    file_path = os.path.dirname(os.path.realpath(__file__)) + '/mnist.pkl.gz'

    with gzip.open(file_path, 'rb') as f:

        training_data, validation_data, test_data = pickle.load(f, encoding='latin1')

    

    #reading data in

    training_inputs = [np.reshape(x, (28,28)) for x in training_data[0]]

    training_results = []
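    #one-hot encode each label, e.g. y = 3 becomes the column vector [0,0,0,1,0,0,0,0,0,0]^T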

    for y in training_data[1]:

        vector = np.zeros((10,1))

        vector[y] = 1.0

        training_results.append(vector) #vectorize y, so it can be used in back propagation for matrix multiplication.

    tr_data = list(zip(training_inputs, training_results)) #in Python 3, zip returns an iterator, so convert it to a list

    

    test_inputs = [np.reshape(x, (28,28)) for x in test_data[0]]

    test_results = []

    for y in test_data[1]:

        vector = np.zeros((10,1))

        vector[y] = 1.0

        test_results.append(vector)

    te_data = list(zip(test_inputs, test_results))

    print('========== finish loading data ==========')

    print('========== start training {0} =========='.format(datetime.now()))

    

    #nn = MNISTLearner(sizes = [784, 100, 10]) 

    nn = MNISTLearner()       

    nn.sgd(tr_data, epochs=20, mini_batch_size=20, eta = 0.1, lmbda = 0.1, test_data = te_data)

    

    

    print('========== training is done ==========')

    #pickle.dump(nn, open('c:\\temp\\mnist_learner.pkl', 'wb')) #save to a file

    

if __name__ == '__main__':

    main()