Scikit-learn

Datasets

from sklearn.datasets import fetch_openml, fetch_california_housing

import pandas as pd


# load_boston was removed in scikit-learn 1.2; any dataset Bunch converts to a DataFrame the same way

data = fetch_california_housing()

df = pd.DataFrame(data.data, columns=data.feature_names)


bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)

df = bike_sharing.frame


titanic = fetch_openml(data_id=40945, as_frame=True)

df = titanic.frame


housing = fetch_california_housing(as_frame=True)

df = housing.frame

df["AveRooms"] = df["AveRooms"].round()

df = df.rename(columns={"MedHouseVal": "y"})

Train-test Split

from sklearn.model_selection import train_test_split

from sklearn.model_selection import TimeSeriesSplit


X, y = df.iloc[:, :-1], df["y"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


# 70% train, then split the remaining 30% into 15% eval / 15% test

X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.3)

X_eval, X_test, y_eval, y_test = train_test_split(X_, y_, test_size=0.5)
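
TimeSeriesSplit is imported above but not shown; a minimal sketch, assuming the rows of X and y are already in time order:

tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]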

Feature engineering

Standard Scaler

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit_transform(data)
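
To avoid leakage, fit the scaler on the training split only and reuse it on the test split; a sketch assuming X_train/X_test from above:

X_train_scaled = scaler.fit_transform(X_train)  # learn mean/std from train only
X_test_scaled = scaler.transform(X_test)        # apply the same scaling to test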

MinMax Scaler

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaler.fit_transform(data)

Cyclical features

import numpy as np

from sklearn.preprocessing import FunctionTransformer


def sin_transformer(period):

    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))

def cos_transformer(period):

    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))


# transform the column itself rather than the whole frame

df["DAY_NUMBER_SIN"] = sin_transformer(7).fit_transform(df["DAY_NUMBER"])

df["DAY_NUMBER_COS"] = cos_transformer(7).fit_transform(df["DAY_NUMBER"])

Polynomial features

from sklearn.preprocessing import PolynomialFeatures

polyn = PolynomialFeatures(degree=2, include_bias=False)

poly_features = polyn.fit_transform(X)
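
To see which interaction terms were generated, the fitted transformer reports the output column names:

polyn.get_feature_names_out()  # e.g. ['x0', 'x1', 'x0^2', 'x0 x1', 'x1^2'] for two inputs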

Dimensionality reduction

PCA

from sklearn.decomposition import PCA


pca = PCA(n_components=2).fit(data)

data_2d = pca.transform(data)
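
The fitted PCA reports how much variance each component keeps, which helps decide n_components:

pca.explained_variance_ratio_        # per-component fraction of variance
pca.explained_variance_ratio_.sum()  # total variance retained by the 2 components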

Cross-validation

Keep track of "folds"

from sklearn.model_selection import KFold, StratifiedKFold, LeaveOneGroupOut, cross_validate, cross_val_score


df["kfold"] = -1

df = df.sample(frac=1).reset_index(drop=True) # shuffle data


kf = KFold(n_splits=5)

for fold, (_, ix) in enumerate(kf.split(X=df)):

    df.loc[ix, 'kfold'] = fold


# StratifiedKFold also needs a (categorical) target to stratify on

kf = StratifiedKFold(n_splits=5)

for fold, (_, ix) in enumerate(kf.split(X=df, y=df['y'])):

    df.loc[ix, 'kfold'] = fold

 

kf = LeaveOneGroupOut()

for fold, (train_index, test_index) in enumerate(kf.split(X=df, groups=df["year"])):

    df.loc[test_index, 'kfold'] = fold  # each held-out group gets its fold number

 

import pandas as pd

# cross_val_score accepts a single scorer; a list of scorers needs cross_validate

scores = cross_validate(

    model, X_train, y_train, scoring=["neg_mean_absolute_error", "neg_mean_squared_error", "max_error"], cv=5

)

pd.DataFrame(scores)
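
The "neg_*" scorers come back negated (scikit-learn always maximizes), so flip the sign for reporting; cross_validate stores them under "test_<scorer>" keys:

-scores["test_neg_mean_absolute_error"].mean()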

Models

See parameters

estimator.get_params()

Linear models

from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression, LogisticRegressionCV


log_model = LogisticRegression(solver="saga", multi_class="ovr", max_iter=5000)  # multi_class is deprecated as of scikit-learn 1.5

Clustering

from sklearn.cluster import KMeans, AgglomerativeClustering

from sklearn.neighbors import KNeighborsClassifier


km = KMeans(n_clusters=10, init='k-means++', n_init=100, max_iter=1000)

km.inertia_ # within cluster sum of squares (lower value means data points are closer)
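
Since inertia always decreases as clusters are added, a common way to pick k is to look for an elbow; a minimal sketch, assuming `data` is the feature matrix:

inertias = [KMeans(n_clusters=k, n_init=10).fit(data).inertia_ for k in range(2, 11)]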


knn = KNeighborsClassifier()

knn.get_params().keys()


agg_k = AgglomerativeClustering(n_clusters=3)

agg_clusters = agg_k.fit_predict(data)

One Class SVM

from sklearn.svm import OneClassSVM


ocsvm = OneClassSVM(nu=0.001)
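
fit_predict labels each row, with nu acting as the expected outlier fraction; a sketch assuming X is the feature matrix:

labels = ocsvm.fit_predict(X)  # +1 = inlier, -1 = outlier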

SVM Classifier

from sklearn.svm import SVC


svc = SVC(kernel='linear', C=1000, gamma=0.01)

SVM Regressor

from sklearn.svm import SVR, LinearSVR
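
A minimal instantiation of each (parameter values here are illustrative, not recommendations):

svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)
lin_svr = LinearSVR(C=1.0)  # linear kernel only, but scales better to large n_samples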



Decision tree

from sklearn.tree import DecisionTreeClassifier


DecisionTreeClassifier(max_depth=2, max_leaf_nodes=3, criterion="entropy")


Isolation Forest

from sklearn.ensemble import IsolationForest


isofor = IsolationForest(contamination=0.001) 
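
As with OneClassSVM, fit_predict flags anomalies, with contamination as the expected anomaly fraction:

labels = isofor.fit_predict(X)  # +1 = inlier, -1 = anomaly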

Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor


rfr = RandomForestRegressor(n_jobs=-1)

Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier


rfc = RandomForestClassifier(n_jobs=-1, n_estimators=10, max_features="sqrt", random_state=101)  # "auto" was removed in 1.3; it meant "sqrt" for classifiers

AdaBoost

from sklearn.ensemble import AdaBoostRegressor


model = AdaBoostRegressor()

Hist Gradient Boosting Classifier

# the experimental import is only needed on scikit-learn < 1.0

# from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

from sklearn.ensemble import HistGradientBoostingClassifier


hgbc = HistGradientBoostingClassifier(max_iter=100, verbose=1)
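
A convenient property of the histogram-based estimators is native missing-value support, so NaNs in X need no imputation step:

hgbc.fit(X_train, y_train)  # X_train may contain np.nan directly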

Hyper-parameter tuning

from sklearn.model_selection import GridSearchCV


param_grid = dict(n_estimators=[100, 200], n_jobs=[-1])

param_grid = {

    "C": [0.001, 0.01, 0.1, 0.5, 1],

    "kernel": ["linear", "rbf", "poly"],

    "gamma": ["scale", "auto"],

    "degree": [2, 3, 4],

    "epsilon": [0, 0.01, 0.1, 0.5, 1, 2],

}


search = GridSearchCV(estimator, param_grid=param_grid, cv=5, verbose=2)

search.fit(X, y)

search.best_estimator_

search.best_params_
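
The full grid of results lives in cv_results_, which reads nicely as a DataFrame:

search.best_score_

pd.DataFrame(search.cv_results_).sort_values("rank_test_score")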

Pipeline

X has 10 columns: categorical columns 0-5 and numeric columns 6-9

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from sklearn.linear_model import LinearRegression

import numpy as np


# Define preprocessing for numeric columns (scale them)

numeric_features = [6, 7, 8, 9]

numeric_transformer = Pipeline(steps=[

    ('imputer', SimpleImputer(strategy='median')),

    ('scaler', StandardScaler())])


# Define preprocessing for categorical features (encode them)

categorical_features = [0, 1, 2, 3, 4, 5]

categorical_transformer = Pipeline(steps=[

    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


# Combine preprocessing steps

preprocessor = ColumnTransformer(

    transformers=[

        ('num', numeric_transformer, numeric_features),

        ('cat', categorical_transformer, categorical_features)],

    remainder='passthrough')


# Create preprocessing and training pipeline

pipeline = Pipeline(steps=[('preprocessor', preprocessor),

                           ('regressor', LinearRegression())])  # the normalize param was removed in scikit-learn 1.2



# fit the pipeline to train a linear regression model on the training set

model = pipeline.fit(X_train, y_train)


 

param_grid = {'knn__n_neighbors': k_values}
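
The "step__param" naming targets a parameter inside a named pipeline step; a sketch, assuming a hypothetical pipeline whose KNN step is named 'knn' (the k values here are illustrative):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

knn_pipe = Pipeline(steps=[('scaler', StandardScaler()),
                           ('knn', KNeighborsClassifier())])
search = GridSearchCV(knn_pipe, param_grid={'knn__n_neighbors': [3, 5, 7]}, cv=5)
search.fit(X_train, y_train)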

Feature importance

Regression

feat_imp = pd.DataFrame({'importance': clf.feature_importances_})

feat_imp['feature'] = X.columns

feat_imp = feat_imp.sort_values('importance')

Classifier

feat_imp = pd.DataFrame({'importance': clf.feature_importances_})

feat_imp['feature'] = X.columns

feat_imp = feat_imp.sort_values('importance', ascending=False)
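
Once sorted, the importances plot directly from the DataFrame:

feat_imp.set_index('feature')['importance'].plot.barh()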

Metrics

Regression

from sklearn.metrics import (

    mean_absolute_error,

    mean_squared_error,

    mean_absolute_percentage_error,

    explained_variance_score,

    max_error,

    mean_squared_log_error,

    median_absolute_error,

    r2_score,

)


print('evs:', explained_variance_score(y_true, y_pred))

print('mse:', mean_squared_error(y_true, y_pred))

print('rmse:', mean_squared_error(y_true, y_pred, squared=False))  # on scikit-learn >= 1.4, prefer root_mean_squared_error

print('mae:', mean_absolute_error(y_true, y_pred))

print('mape:', mean_absolute_percentage_error(y_true, y_pred))

print('r2:', r2_score(y_true, y_pred))

print('max_error:', max_error(y_true, y_pred))

print('msle:', mean_squared_log_error(y_true, y_pred))

print('median_ae:', median_absolute_error(y_true, y_pred))

Classification

from sklearn.metrics import (

    accuracy_score,

    auc,

    confusion_matrix,

    classification_report,

    f1_score,

    recall_score,

    precision_recall_curve,

    PrecisionRecallDisplay,

    roc_curve,

    ConfusionMatrixDisplay,

)


accuracy_score(y_true, y_pred)


print(classification_report(y_true, y_pred, target_names=['class 1', 'class 2']))


fpr, tpr, _ = roc_curve(y_true, y_pred_prob)
roc_auc = auc(fpr, tpr)
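
roc_curve expects scores or positive-class probabilities rather than hard labels; a typical way to get them, assuming a fitted binary classifier clf:

y_pred_prob = clf.predict_proba(X_test)[:, 1]  # probability of the positive class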


import matplotlib.pyplot as plt


cm = confusion_matrix(y_penguin_test, penguin_predictions)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)

disp.plot()

# or build the display straight from predictions

ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

plt.show()

Plot ROC 

plt.figure()

plt.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc)

plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")

plt.xlim([0.0, 1.0])

plt.ylim([0.0, 1.05])

plt.xlabel("False Positive Rate")

plt.ylabel("True Positive Rate")

plt.title("Receiver operating characteristic")

plt.legend(loc="lower right")

plt.show()

Plot confusion matrix

plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)

plt.colorbar()

classes = clf.classes_

tick_marks = np.arange(len(classes))

plt.xticks(tick_marks, classes, rotation=45)

plt.yticks(tick_marks, classes)

plt.xlabel("Predicted")  # columns of confusion_matrix are predictions

plt.ylabel("Actual")  # rows are true labels

plt.show()

plt.show()

Visualization

HTML repr

from sklearn import set_config

set_config(display='diagram')

Plot tree

from sklearn.tree import plot_tree

plot_tree(model, filled=True, feature_names=X.columns)