- Numeric: Regression (Linear, Lasso, Ridge), ARIMA
- from sklearn.linear_model import (LinearRegression, Lasso, Ridge)
- from statsmodels.tsa.arima.model import ARIMA #statsmodels >= 0.12 import path; usage sketch below
- from sklearn.preprocessing import PolynomialFeatures
- poly = PolynomialFeatures(degree = 2)
- X2_train = poly.fit_transform(X_train) #Apply polynomial transformations
- model = LinearRegression()
- model.fit(X2_train, y_train)
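- ARIMA is imported above but not shown in use; a minimal forecasting sketch, assuming y is a pandas Series of evenly spaced observations:
- model = ARIMA(y, order = (1, 1, 1)) #(p, d, q): AR order, differencing order, MA order
- results = model.fit()
- forecast = results.forecast(steps = 5) #Predict the next 5 periods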
- Categorical: Classification (Logistic regression, Bayesian classification, Decision Tree, Random Forest, Gradient Boosted Trees)
- from sklearn.linear_model import LogisticRegression
- from sklearn.naive_bayes import GaussianNB
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import RandomForestClassifier
- model = RandomForestClassifier(n_estimators = 100, max_depth = 5, verbose = 1) #Example values; verbose = 1 to show progress
- from sklearn.svm import SVC #Support Vector Machine; sketch below
- model = LogisticRegression(random_state = 1) #random_state = 1 for reproducibility
- model.fit(X_train, y_train)
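- SVC and GaussianNB are imported above but not shown in use; a minimal sketch, assuming X_train, y_train, X_test, y_test already exist:
- svc = SVC(kernel = 'rbf', C = 1.0, random_state = 1) #Unlike trees, SVC benefits from feature scaling
- svc.fit(X_train, y_train)
- nb = GaussianNB() #No hyperparameters needed for a baseline
- nb.fit(X_train, y_train)
- print(svc.score(X_test, y_test), nb.score(X_test, y_test)) #Mean accuracy on the test set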
- Decision trees: a sequence of if-else questions about individual features; they capture non-linear relationships between features and labels and need no feature scaling (standardization)
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import accuracy_score
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 1)
- model = DecisionTreeClassifier(max_depth = 2, criterion = 'entropy', random_state = 1) #random_state = 1 for reproducibility; gini or entropy - gini is faster and usually gives the same results
- model.fit(X = X_train, y = y_train) #Train the model
- y_pred = model.predict(X = X_test) #Generate predictions
- acc = accuracy_score(y_true = y_test, y_pred = y_pred) #Test the model
- print("Test set accuracy: {:.2f}".format(acc))
- from sklearn.tree import DecisionTreeRegressor
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error as MSE
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 3)
- model = DecisionTreeRegressor(max_depth = 4, min_samples_leaf = 0.1, random_state = 3) #min_samples_leaf = 0.1: at least 10% of training data in each leaf
- model.fit(X_train, y_train)
- y_pred = model.predict(X_test)
- mse = MSE(y_test, y_pred)
- rmse = mse**(1/2)
- print("Test set RMSE of dt: {:.2f}".format(rmse))
- Overfitting: Model fits the training set noise (high variance)
- Underfitting: Not flexible enough to approximate the actual function (high bias - less accurate)
- Generalization error (how well the model generalizes to unseen data) = bias^2 (less accurate: how much the model differs from the true function) + variance (less precise: how much the model varies over different training sets) + irreducible error (the contribution of noise) #Lowest generalization error is at the minimum turning point of the complexity-error curve
- Model complexity increases with increasing max tree depth: variance increases (less precise), bias decreases (more accurate); see the validation-curve sketch below
- High variance: cross-validation error > training set error means overfitting has occurred. Therefore, decrease model complexity (e.g. decrease max tree depth, increase min samples per leaf, increase sample size)
- High bias: cross-validation error ~ training set error >> desired error means underfitting has occurred. Therefore, increase model complexity (e.g. increase max tree depth, decrease min samples per leaf, gather more relevant features)
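- A sketch of visualizing this complexity-error trade-off with validation_curve (assuming X_train, y_train exist):
- from sklearn.model_selection import validation_curve
- from sklearn.tree import DecisionTreeRegressor
- import numpy as np
- depths = np.arange(1, 11)
- train_scores, cv_scores = validation_curve(DecisionTreeRegressor(random_state = 1), X_train, y_train, param_name = 'max_depth', param_range = depths, cv = 10, scoring = 'neg_mean_squared_error')
- train_mse = -train_scores.mean(axis = 1) #Training error keeps falling as depth grows
- cv_mse = -cv_scores.mean(axis = 1) #CV error falls, then rises once overfitting sets in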
- from sklearn.tree import DecisionTreeRegressor
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error as MSE
- from sklearn.model_selection import cross_val_score
- SEED = 123 #For reproducibility
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = SEED)
- model = DecisionTreeRegressor(max_depth = 4, min_samples_leaf = 0.1, random_state = SEED) #min_samples_leaf = 0.1: at least 10% of training data in each leaf
- mse_cv = -cross_val_score(model, X_train, y_train, cv = 10, scoring = 'neg_mean_squared_error', n_jobs = -1) #n_jobs = -1 to exploit all available CPUs
- model.fit(X_train, y_train)
- y_pred_train = model.predict(X_train)
- y_pred_test = model.predict(X_test)
- rmse_cv = (mse_cv.mean())**(1/2)
- print("CV MSE of dt: {:.2f}".format(mse_cv.mean()))
- print("Training set MSE of dt: {:.2f}".format(MSE(y_train, y_pred_train)))
- print("Test set MSE of dt: {:.2f}".format(MSE(y_test, y_pred_test)))
Classification and Regression Trees (CART)
- + Simple to understand, interpret, and use (see the export_text sketch below)
- + Flexibility: ability to describe non-linear dependencies
- + No need to standardize or normalize features
- - Can only produce orthogonal (axis-aligned) decision boundaries
- - Sensitive to small variations in the training set
- - High variance: unconstrained CARTs may overfit training set
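- Interpretability in practice: the learned if-else rules can be printed directly; a minimal sketch, assuming model is a fitted DecisionTreeClassifier and feature_names is a list of column names:
- from sklearn.tree import export_text
- print(export_text(model, feature_names = feature_names)) #Human-readable if-else rules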
Ensemble model (e.g. hard voting classifier)
- from sklearn.linear_model import LogisticRegression
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.metrics import accuracy_score
- SEED = 1 #Set seed for reproducibility
- lr = LogisticRegression(random_state = SEED)
- knn = KNeighborsClassifier(n_neighbors = 27)
- dt = DecisionTreeClassifier(min_samples_leaf = 0.13, random_state = SEED)
- classifiers = [('Logistic Regression', lr), ('K Nearest Neighbours', knn), ('Classification Tree', dt)]
- for clf_name, clf in classifiers:
- clf.fit(X_train, y_train)
- y_pred = clf.predict(X_test)
- accuracy = accuracy_score(y_test, y_pred)
- print('{:s} : {:.3f}'.format(clf_name, accuracy))
- from sklearn.ensemble import VotingClassifier #Same training set, different algorithms, majority voting
- vc = VotingClassifier(estimators = classifiers)
- vc.fit(X_train, y_train)
- y_pred = vc.predict(X_test)
- accuracy = accuracy_score(y_test, y_pred)
- print('Voting Classifier: {:.3f}'.format(accuracy))
Bootstrap aggregation (bagging): train the same algorithm on different bootstrap samples of the training set (sampled with replacement) to reduce the variance of individual models
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import BaggingClassifier #majority voting
- model = DecisionTreeClassifier(random_state = 1)
- bc = BaggingClassifier(base_estimator = model, n_estimators = 300, n_jobs = -1) #300 trees
- bc.fit(X_train, y_train)
- y_pred = bc.predict(X_test)
- acc_test = accuracy_score(y_test, y_pred)
- print('Test set accuracy of bc: {:.2f}'.format(acc_test))
Out-of-bag (OOB) instances are not used in the bootstrap sample, hence they can be used to estimate the performance of the ensemble without cross-validation
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import BaggingClassifier
- model = DecisionTreeClassifier(min_samples_leaf = 8, random_state = 1)
- bc = BaggingClassifier(base_estimator = model, n_estimators = 50, oob_score = True, random_state = 1) #oob_score = True to evaluate the OOB score (accuracy for classifiers, R^2 for regressors) after training
- bc.fit(X_train, y_train)
- y_pred = bc.predict(X_test)
- acc_test = accuracy_score(y_test, y_pred)
- acc_oob = bc.oob_score_
- print('Test set accuracy: {:.3f}, OOB accuracy: {:.3f}'.format(acc_test, acc_oob))
- from sklearn.ensemble import BaggingRegressor #Predictions are averaged instead of majority-voted; sketch below
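- A minimal regression counterpart, assuming X_train, y_train exist:
- from sklearn.tree import DecisionTreeRegressor
- br = BaggingRegressor(base_estimator = DecisionTreeRegressor(random_state = 1), n_estimators = 300, oob_score = True, n_jobs = -1)
- br.fit(X_train, y_train)
- print('OOB R^2: {:.3f}'.format(br.oob_score_)) #oob_score_ is R^2 for regressors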
Random forests add further randomization: at each node, a subset of features is sampled without replacement and the node is split using the sampled feature that maximizes information gain; each tree is trained on a different bootstrap sample the same size as the training set, giving lower variance than individual trees
- from sklearn.ensemble import RandomForestRegressor
- from sklearn.model_selection import GridSearchCV
- from sklearn.metrics import mean_squared_error as MSE
- SEED = 1
- model = RandomForestRegressor(n_estimators = 25, random_state = SEED)
- model.get_params() #Inspect tunable hyperparameters if necessary
- params_rf = {'n_estimators': [100, 350, 500], 'min_samples_leaf': [2, 10, 30], 'max_features': ['log2', 'auto', 'sqrt']}
- grid_rf = GridSearchCV(estimator = model, param_grid = params_rf, cv = 3, scoring = 'neg_mean_squared_error', verbose = 1, n_jobs = -1)
- grid_rf.fit(X_train, y_train)
- best_hyperparams = grid_rf.best_params_
- print('Best hyperparameters:\n', best_hyperparams)
- best_model = grid_rf.best_estimator_
- y_pred = best_model.predict(X_test)
- rmse_test = MSE(y_test, y_pred)**(1/2)
- print('Test set RMSE of rf: {:.2f}'.format(rmse_test))
- import matplotlib.pyplot as plt
- import pandas as pd
- importances = pd.Series(data = best_model.feature_importances_, index = X_train.columns)
- importances_sorted = importances.sort_values()
- importances_sorted.plot(kind = 'barh', color = 'lightgreen')
- plt.title('Feature Importances')
- plt.show()
Boosting: an ensemble in which predictors are trained sequentially, each learning from the errors of its predecessor.
Many weak learners (e.g. trees with max_depth = 1) are combined to form a strong learner.
Adaptive boosting
- The prediction error of each predictor (e.g. predictor 1) determines its coefficient alpha, which is then used to re-weight the training instances for the next predictor (e.g. predictor 2)
- Incorrectly predicted instances are given higher weights
- The learning rate (between 0 and 1) shrinks the coefficient alpha
- A smaller learning rate should be compensated with a larger number of estimators
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import AdaBoostClassifier
- model = DecisionTreeClassifier(max_depth = 2, random_state = 1)
- ada = AdaBoostClassifier(base_estimator = model, n_estimators = 180, random_state = 1) #180 trees
- ada.fit(X_train, y_train)
- y_pred_proba = ada.predict_proba(X_test)[:, 1] #Probabilities of the positive class
- from sklearn.metrics import roc_auc_score
- ada_roc_auc = roc_auc_score(y_test, y_pred_proba)
- print('ROC AUC score: {:.2f}'.format(ada_roc_auc))
Gradient boosting
- The prediction error of each predictor is used to correct its predecessor's errors (the weights of training instances do not change)
- Each predictor is trained using the residual errors of its predecessor as labels: predictor 2 is trained on the features with residuals e1 as labels, and its predicted residuals e1-hat determine the residuals of residuals, i.e. e2 (see the manual sketch after the snippet below)
- Shrinkage: the prediction of each tree is multiplied by the learning rate
- A smaller learning rate should be compensated with a larger number of estimators
- from sklearn.ensemble import GradientBoostingRegressor
- model = GradientBoostingRegressor(max_depth = 4, n_estimators = 200, random_state = 2)
- model.fit(X_train, y_train)
- y_pred = model.predict(X_test)
- from sklearn.metrics import mean_squared_error as MSE
- mse = MSE(y_test, y_pred)
- rmse = mse**(1/2)
- print('Test set RMSE of gb: {:.3f}'.format(rmse))
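- The residual-fitting idea above, shown manually with two stages (a toy sketch, assuming X_train, y_train, X_test exist; sklearn's implementation also starts from a constant baseline prediction):
- from sklearn.tree import DecisionTreeRegressor
- eta = 0.1 #Learning rate (shrinkage)
- tree1 = DecisionTreeRegressor(max_depth = 1, random_state = 2).fit(X_train, y_train)
- r1 = y_train - tree1.predict(X_train) #Residuals e1 become the next labels
- tree2 = DecisionTreeRegressor(max_depth = 1, random_state = 2).fit(X_train, r1)
- y_pred = tree1.predict(X_test) + eta * tree2.predict(X_test) #Shrunken correction added to the base prediction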
Stochastic Gradient Boosting
- Sampling rows and features increases ensemble diversity, which adds further variance to the ensemble of trees
- Each tree is trained on a random subset of rows of the training data, sampled without replacement
- Not all features are used: features are sampled without replacement at each node to choose the best split-points
- Residual errors are multiplied by the learning rate and fed to the next tree
- from sklearn.ensemble import GradientBoostingRegressor
- sgbr = GradientBoostingRegressor(max_depth = 4, subsample = 0.9, max_features = 0.75, n_estimators = 200, random_state = 2) #subsample: fraction of rows per tree; max_features: fraction of features per split
- sgbr.fit(X_train, y_train)
- y_pred = sgbr.predict(X_test)
- from sklearn.metrics import mean_squared_error as MSE
- mse = MSE(y_test, y_pred)
- rmse = mse**(1/2)
- print('Test set RMSE of sgbr: {:.3f}'.format(rmse))
Hyperparameters
- Parameters: Learned from data through training (e.g. split-point, split-feature)
- Hyperparameters: Not learned from data; set prior to training (e.g. max_depth, min_samples_leaf)
- Grid Search
- Random Search (see the sketch after this list)
- Bayesian Optimization
- Genetic Algorithms
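- Random Search sketch with RandomizedSearchCV, which samples n_iter combinations instead of trying them all (example grid; compare with the Grid Search below):
- from sklearn.model_selection import RandomizedSearchCV
- from sklearn.tree import DecisionTreeClassifier
- params = {'max_depth': [2, 3, 4, 5], 'min_samples_leaf': [0.04, 0.08, 0.12, 0.16]}
- rand_dt = RandomizedSearchCV(estimator = DecisionTreeClassifier(random_state = 1), param_distributions = params, n_iter = 10, scoring = 'roc_auc', cv = 5, n_jobs = -1, random_state = 1)
- rand_dt.fit(X_train, y_train)
- print(rand_dt.best_params_, rand_dt.best_score_)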
Grid Search
- params_dt = {'max_depth': [2, 3, 4], 'min_samples_leaf': [0.12, 0.14, 0.16, 0.18]} #Define the hyperparameter grid
- from sklearn.model_selection import GridSearchCV
- grid_dt = GridSearchCV(estimator = dt, param_grid = params_dt, scoring = 'roc_auc', cv = 5, n_jobs = -1) #dt: a DecisionTreeClassifier as defined above
- grid_dt.fit(X_train, y_train)
- best_hyperparams = grid_dt.best_params_
- best_CV_score = grid_dt.best_score_
- best_model = grid_dt.best_estimator_
- test_acc = best_model.score(X_test, y_test)
- from sklearn.metrics import roc_auc_score
- y_pred_proba = best_model.predict_proba(X_test)[:, 1]
- test_roc_auc = roc_auc_score(y_test, y_pred_proba)
- print('Test set ROC AUC score: {:.3f}'.format(test_roc_auc))