- Numeric: Regression (Linear, Lasso, Ridge), ARIMA
- from sklearn.linear_model import (LinearRegression, Lasso, Ridge)
- from statsmodels.tsa.arima.model import ARIMA #statsmodels >= 0.12 import path; usage sketch below
- from sklearn.preprocessing import PolynomialFeatures
- poly = PolynomialFeatures(degree = 2)
- X2_train = poly.fit_transform(X_train) #Apply polynomial transformations
- model = LinearRegression()
- model.fit(X2_train, y_train)
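- ARIMA is imported above but not shown in use; a minimal forecasting sketch, assuming y is a pandas Series of evenly spaced observations:
- model = ARIMA(y, order = (1, 1, 1)) #(p, d, q): AR order, differencing order, MA order
- results = model.fit()
- forecast = results.forecast(steps = 5) #Predict the next 5 periods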
- Categorical: Classification (Logistic regression, Bayesian classification, Decision Tree, Random Forest, Gradient Boosted Trees)
- from sklearn.linear_model import LogisticRegression
- from sklearn.naive_bayes import GaussianNB
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import RandomForestClassifier
- model = RandomForestClassifier(n_estimators = 100, max_depth = 5, verbose = 1) #Example values; verbose = 1 to show progress
- from sklearn.svm import SVC #Support Vector Machine; sketch below
- model = LogisticRegression(random_state = 1) #random_state = 1 for reproducibility
- model.fit(X_train, y_train)
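- SVC and GaussianNB are imported above but not shown in use; a minimal sketch, assuming X_train, y_train, X_test, y_test already exist:
- svc = SVC(kernel = 'rbf', C = 1.0, random_state = 1) #Unlike trees, SVC benefits from feature scaling
- svc.fit(X_train, y_train)
- nb = GaussianNB() #No hyperparameters needed for a baseline
- nb.fit(X_train, y_train)
- print(svc.score(X_test, y_test), nb.score(X_test, y_test)) #Mean accuracy on the test set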
- Decision trees: a sequence of if-else questions about individual features; they capture non-linear relationships between features and labels and need no feature scaling (standardization)
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import accuracy_score
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 1)
- model = DecisionTreeClassifier(max_depth = 2, criterion = 'entropy', random_state = 1) #random_state = 1 for reproducibility; gini or entropy - gini is faster and usually gives the same results
- model.fit(X = X_train, y = y_train) #Train the model
- y_pred = model.predict(X = X_test) #Generate predictions
- acc = accuracy_score(y_true = y_test, y_pred = y_pred) #Test the model
- print("Test set accuracy: {:.2f}".format(acc))
- from sklearn.tree import DecisionTreeRegressor
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error as MSE
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 3)
- model = DecisionTreeRegressor(max_depth = 4, min_samples_leaf = 0.1, random_state = 3) #min_samples_leaf = 0.1: at least 10% of training data in each leaf
- model.fit(X_train, y_train)
- y_pred = model.predict(X_test)
- mse = MSE(y_test, y_pred)
- rmse = mse**(1/2)
- print("Test set RMSE of dt: {:.2f}".format(rmse))
- Overfitting: Model fits the training set noise (high variance)
- Underfitting: Not flexible enough to approximate the actual function (high bias - less accurate)
- Generalization error (how well the model generalizes to unseen data) = bias^2 (less accurate: how much the model differs from the true function) + variance (less precise: how much the model varies over different training sets) + irreducible error (the contribution of noise) #Lowest generalization error is at the minimum turning point of the complexity-error curve
- Model complexity increases with increasing max tree depth: variance increases (less precise), bias decreases (more accurate); see the validation-curve sketch below
- High variance: cross-validation error > training set error means overfitting has occurred. Therefore, decrease model complexity (e.g. decrease max tree depth, increase min samples per leaf, increase sample size)
- High bias: cross-validation error ~ training set error >> desired error means underfitting has occurred. Therefore, increase model complexity (e.g. increase max tree depth, decrease min samples per leaf, gather more relevant features)
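- A sketch of visualizing this complexity-error trade-off with validation_curve (assuming X_train, y_train exist):
- from sklearn.model_selection import validation_curve
- from sklearn.tree import DecisionTreeRegressor
- import numpy as np
- depths = np.arange(1, 11)
- train_scores, cv_scores = validation_curve(DecisionTreeRegressor(random_state = 1), X_train, y_train, param_name = 'max_depth', param_range = depths, cv = 10, scoring = 'neg_mean_squared_error')
- train_mse = -train_scores.mean(axis = 1) #Training error keeps falling as depth grows
- cv_mse = -cv_scores.mean(axis = 1) #CV error falls, then rises once overfitting sets in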
- from sklearn.tree import DecisionTreeRegressor
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error as MSE
- from sklearn.model_selection import cross_val_score
- SEED = 123 #For reproducibility
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = SEED)
- model = DecisionTreeRegressor(max_depth = 4, min_samples_leaf = 0.1, random_state = SEED) #min_samples_leaf = 0.1: at least 10% of training data in each leaf
- mse_cv = -cross_val_score(model, X_train, y_train, cv = 10, scoring = 'neg_mean_squared_error', n_jobs = -1) #n_jobs = -1 to exploit all available CPUs
- model.fit(X_train, y_train)
- y_pred_train = model.predict(X_train)
- y_pred_test = model.predict(X_test)
- rmse_cv = (mse_cv.mean())**(1/2)
- print("CV MSE of dt: {:.2f}".format(mse_cv.mean()))
- print("Training set MSE of dt: {:.2f}".format(MSE(y_train, y_pred_train)))
- print("Test set MSE of dt: {:.2f}".format(MSE(y_test, y_pred_test)))
Classification and Regression Trees (CART)
- + Simple to understand, interpret, and use (see the export_text sketch below)
- + Flexibility: ability to describe non-linear dependencies
- + No need to standardize or normalize features
- - Can only produce orthogonal (axis-aligned) decision boundaries
- - Sensitive to small variations in the training set
- - High variance: unconstrained CARTs may overfit training set
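- Interpretability in practice: the learned if-else rules can be printed directly; a minimal sketch, assuming model is a fitted DecisionTreeClassifier and feature_names is a list of column names:
- from sklearn.tree import export_text
- print(export_text(model, feature_names = feature_names)) #Human-readable if-else rules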
Ensemble model (e.g. hard voting classifier)
- from sklearn.linear_model import LogisticRegression
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.metrics import accuracy_score
- SEED = 1 #Set seed for reproducibility
- lr = LogisticRegression(random_state = SEED)
- knn = KNeighborsClassifier(n_neighbors = 27)
- dt = DecisionTreeClassifier(min_samples_leaf = 0.13, random_state = SEED)
- classifiers = [('Logistic Regression', lr), ('K Nearest Neighbours', knn), ('Classification Tree', dt)]
- for clf_name, clf in classifiers:
- clf.fit(X_train, y_train)
- y_pred = clf.predict(X_test)
- accuracy = accuracy_score(y_test, y_pred)
- print('{:s} : {:.3f}'.format(clf_name, accuracy))
- from sklearn.ensemble import VotingClassifier #Same training set, different algorithms, majority voting
- vc = VotingClassifier(estimators = classifiers)
- vc.fit(X_train, y_train)
- y_pred = vc.predict(X_test)
- accuracy = accuracy_score(y_test, y_pred)
- print('Voting Classifier: {:.3f}'.format(accuracy))
Bootstrap aggregation (bagging): train the same algorithm on different bootstrap samples of the training set (sampled with replacement) to reduce the variance of individual models
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import BaggingClassifier #majority voting
- model = DecisionTreeClassifier(random_state = 1)
- bc = BaggingClassifier(base_estimator = model, n_estimators = 300, n_jobs = -1) #300 trees
- bc.fit(X_train, y_train)
- y_pred = bc.predict(X_test)
- acc_test = accuracy_score(y_test, y_pred)
- print('Test set accuracy of bc: {:.2f}'.format(acc_test))
Out-of-bag (OOB) instances are not used in the bootstrap sample, hence they can be used to estimate the performance of the ensemble without cross-validation
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import BaggingClassifier
- model = DecisionTreeClassifier(min_samples_leaf = 8, random_state = 1)
- bc = BaggingClassifier(base_estimator = model, n_estimators = 50, oob_score = True, random_state = 1) #oob_score = True to evaluate the OOB score (accuracy for classifiers, R^2 for regressors) after training
- bc.fit(X_train, y_train)
- y_pred = bc.predict(X_test)
- acc_test = accuracy_score(y_test, y_pred)
- acc_oob = bc.oob_score_
- print('Test set accuracy: {:.3f}, OOB accuracy: {:.3f}'.format(acc_test, acc_oob))
- from sklearn.ensemble import BaggingRegressor #Predictions are averaged instead of majority-voted; sketch below
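- A minimal regression counterpart, assuming X_train, y_train exist:
- from sklearn.tree import DecisionTreeRegressor
- br = BaggingRegressor(base_estimator = DecisionTreeRegressor(random_state = 1), n_estimators = 300, oob_score = True, n_jobs = -1)
- br.fit(X_train, y_train)
- print('OOB R^2: {:.3f}'.format(br.oob_score_)) #oob_score_ is R^2 for regressors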
Random forests add further randomization: at each node, a subset of features is sampled without replacement and the node is split using the sampled feature that maximizes information gain; each tree is trained on a different bootstrap sample the same size as the training set, giving lower variance than individual trees
- from sklearn.ensemble import RandomForestRegressor
- from sklearn.model_selection import GridSearchCV
- from sklearn.metrics import mean_squared_error as MSE
- SEED = 1
- model = RandomForestRegressor(n_estimators = 25, random_state = SEED)
- model.get_params() #Inspect tunable hyperparameters if necessary
- params_rf = {'n_estimators': [100, 350, 500], 'min_samples_leaf': [2, 10, 30], 'max_features': ['log2', 'auto', 'sqrt']}
- grid_rf = GridSearchCV(estimator = model, param_grid = params_rf, cv = 3, scoring = 'neg_mean_squared_error', verbose = 1, n_jobs = -1)
- grid_rf.fit(X_train, y_train)
- best_hyperparams = grid_rf.best_params_
- print('Best hyperparameters:\n', best_hyperparams)
- best_model = grid_rf.best_estimator_
- y_pred = best_model.predict(X_test)
- rmse_test = MSE(y_test, y_pred)**(1/2)
- print('Test set RMSE of rf: {:.2f}'.format(rmse_test))
- import matplotlib.pyplot as plt
- import pandas as pd
- importances = pd.Series(data = best_model.feature_importances_, index = X_train.columns)
- importances_sorted = importances.sort_values()
- importances_sorted.plot(kind = 'barh', color = 'lightgreen')
- plt.title('Feature Importances')
- plt.show()
Boosting: an ensemble in which predictors are trained sequentially, each learning from the errors of its predecessor.
Many weak learners (e.g. trees with max_depth = 1) are combined to form a strong learner.
Adaptive boosting
- The prediction error of each predictor (e.g. predictor 1) determines its coefficient alpha, which is then used to re-weight the training instances for the next predictor (e.g. predictor 2)
- Incorrectly predicted instances are given higher weights
- The learning rate (between 0 and 1) shrinks the coefficient alpha
- A smaller learning rate should be compensated with a larger number of estimators
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import AdaBoostClassifier
- model = DecisionTreeClassifier(max_depth = 2, random_state = 1)
- ada = AdaBoostClassifier(base_estimator = model, n_estimators = 180, random_state = 1) #180 trees
- ada.fit(X_train, y_train)
- y_pred_proba = ada.predict_proba(X_test)[:, 1] #Probabilities of the positive class
- from sklearn.metrics import roc_auc_score
- ada_roc_auc = roc_auc_score(y_test, y_pred_proba)
- print('ROC AUC score: {:.2f}'.format(ada_roc_auc))
Gradient boosting
- The prediction error of each predictor is used to correct its predecessor's errors (the weights of training instances do not change)
- Each predictor is trained using the residual errors of its predecessor as labels: predictor 2 is trained on the features with residuals e1 as labels, and its predicted residuals e1-hat determine the residuals of residuals, i.e. e2 (see the manual sketch after the snippet below)
- Shrinkage: the prediction of each tree is multiplied by the learning rate
- A smaller learning rate should be compensated with a larger number of estimators
- from sklearn.ensemble import GradientBoostingRegressor
- model = GradientBoostingRegressor(max_depth = 4, n_estimators = 200, random_state = 2)
- model.fit(X_train, y_train)
- y_pred = model.predict(X_test)
- from sklearn.metrics import mean_squared_error as MSE
- mse = MSE(y_test, y_pred)
- rmse = mse**(1/2)
- print('Test set RMSE of gb: {:.3f}'.format(rmse))
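- The residual-fitting idea above, shown manually with two stages (a toy sketch, assuming X_train, y_train, X_test exist; sklearn's implementation also starts from a constant baseline prediction):
- from sklearn.tree import DecisionTreeRegressor
- eta = 0.1 #Learning rate (shrinkage)
- tree1 = DecisionTreeRegressor(max_depth = 1, random_state = 2).fit(X_train, y_train)
- r1 = y_train - tree1.predict(X_train) #Residuals e1 become the next labels
- tree2 = DecisionTreeRegressor(max_depth = 1, random_state = 2).fit(X_train, r1)
- y_pred = tree1.predict(X_test) + eta * tree2.predict(X_test) #Shrunken correction added to the base prediction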
Stochastic Gradient Boosting
- Sampling rows and features increases ensemble diversity, which adds further variance to the ensemble of trees
- Each tree is trained on a random subset of rows of the training data, sampled without replacement
- Not all features are used: features are sampled without replacement at each node to choose the best split-points
- Residual errors are multiplied by the learning rate and fed to the next tree
- from sklearn.ensemble import GradientBoostingRegressor
- sgbr = GradientBoostingRegressor(max_depth = 4, subsample = 0.9, max_features = 0.75, n_estimators = 200, random_state = 2) #subsample: fraction of rows per tree; max_features: fraction of features per split
- sgbr.fit(X_train, y_train)
- y_pred = sgbr.predict(X_test)
- from sklearn.metrics import mean_squared_error as MSE
- mse = MSE(y_test, y_pred)
- rmse = mse**(1/2)
- print('Test set RMSE of sgbr: {:.3f}'.format(rmse))
Hyperparameters
- Parameters: Learned from data through training (e.g. split-point, split-feature)
- Hyperparameters: Not learned from data; set prior to training (e.g. max_depth, min_samples_leaf)
- Grid Search
- Random Search (see the sketch after this list)
- Bayesian Optimization
- Genetic Algorithms
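- Random Search sketch with RandomizedSearchCV, which samples n_iter combinations instead of trying them all (example grid; compare with the Grid Search below):
- from sklearn.model_selection import RandomizedSearchCV
- from sklearn.tree import DecisionTreeClassifier
- params = {'max_depth': [2, 3, 4, 5], 'min_samples_leaf': [0.04, 0.08, 0.12, 0.16]}
- rand_dt = RandomizedSearchCV(estimator = DecisionTreeClassifier(random_state = 1), param_distributions = params, n_iter = 10, scoring = 'roc_auc', cv = 5, n_jobs = -1, random_state = 1)
- rand_dt.fit(X_train, y_train)
- print(rand_dt.best_params_, rand_dt.best_score_)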
Grid Search
- params_dt = {'max_depth': [2, 3, 4], 'min_samples_leaf': [0.12, 0.14, 0.16, 0.18]} #Define the hyperparameter grid
- from sklearn.model_selection import GridSearchCV
- grid_dt = GridSearchCV(estimator = dt, param_grid = params_dt, scoring = 'roc_auc', cv = 5, n_jobs = -1) #dt: a DecisionTreeClassifier as defined above
- grid_dt.fit(X_train, y_train)
- best_hyperparams = grid_dt.best_params_
- best_CV_score = grid_dt.best_score_
- best_model = grid_dt.best_estimator_
- test_acc = best_model.score(X_test, y_test)
- from sklearn.metrics import roc_auc_score
- y_pred_proba = best_model.predict_proba(X_test)[:, 1]
- test_roc_auc = roc_auc_score(y_test, y_pred_proba)
- print('Test set ROC AUC score: {:.3f}'.format(test_roc_auc))