Scikit-learn
Datasets
from sklearn.datasets import fetch_openml, fetch_california_housing
import pandas as pd
# load_boston was removed in scikit-learn 1.2; fetch the Boston dataset from OpenML instead
boston = fetch_openml(name="boston", version=1, as_frame=True)
df = boston.frame
bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
df = bike_sharing.frame
titanic = fetch_openml(data_id=40945, as_frame=True)
df = titanic.frame
housing = fetch_california_housing(as_frame=True)
df = housing.frame
df["AveRooms"] = df["AveRooms"].round()
df = df.rename(columns={"MedHouseVal": "y"})
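Most loaders can also hand back the features and target directly via return_X_y, a small convenience sketch:
X, y = fetch_california_housing(return_X_y=True, as_frame=True)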
Train-test Split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
X, y = df.iloc[:, :-1], df["y"]  # last column is the target renamed above
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.3)  # 70% train
X_eval, X_test, y_eval, y_test = train_test_split(X_, y_, test_size=0.5)  # split the rest into 15% eval, 15% test
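TimeSeriesSplit (imported above but not shown) keeps folds in chronological order, so each fold trains on the past and validates on the future; a minimal sketch:
tscv = TimeSeriesSplit(n_splits=5)
for train_ix, test_ix in tscv.split(X):
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]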
Feature engineering
Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on training data only to avoid leakage
X_test_scaled = scaler.transform(X_test)  # reuse the training statistics
MinMax Scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Cyclical features
import numpy as np
from sklearn.preprocessing import FunctionTransformer
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))
def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))
df["DAY_NUMBER_SIN"] = sin_transformer(7).fit_transform(df["DAY_NUMBER"])
df["DAY_NUMBER_COS"] = cos_transformer(7).fit_transform(df["DAY_NUMBER"])
Polynomial features
from sklearn.preprocessing import PolynomialFeatures
polyn = PolynomialFeatures(degree=2, include_bias=False)
poly_features = polyn.fit_transform(X)
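get_feature_names_out shows which interaction terms were generated:
polyn.get_feature_names_out()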
Dimensionality reduction
PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2).fit(data)
data_2d = pca.transform(data)
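explained_variance_ratio_ reports the fraction of variance each component keeps (scale the data first for meaningful components):
pca.explained_variance_ratio_  # e.g. array([0.7, 0.2]) would mean 90% retained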
Cross-validation
Keep track of "folds"
from sklearn.model_selection import KFold, StratifiedKFold, LeaveOneGroupOut, cross_validate, cross_val_score
df["kfold"] = -1
df = df.sample(frac=1).reset_index(drop=True) # shuffle data
kf = KFold(n_splits=5)
for fold, (_, ix) in enumerate(kf.split(X=df)):
    df.loc[ix, 'kfold'] = fold
# StratifiedKFold preserves class proportions and needs the target
kf = StratifiedKFold(n_splits=5)
for fold, (_, ix) in enumerate(kf.split(X=df, y=df['y'])):
    df.loc[ix, 'kfold'] = fold
kf = LeaveOneGroupOut()
for fold, (train_ix, test_ix) in enumerate(kf.split(X=df, groups=df["year"])):
    df.loc[test_ix, 'kfold'] = fold  # label the held-out group, matching the loops above
import pandas as pd
scores = cross_validate(  # cross_validate (not cross_val_score) accepts multiple scorers
    model, X_train, y_train,
    scoring=["neg_mean_absolute_error", "neg_mean_squared_error", "max_error"],
    cv=5,
)
pd.DataFrame(scores)
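cross_val_score takes a single scorer only and returns a plain array:
cv_scores = cross_val_score(model, X_train, y_train, scoring="neg_mean_absolute_error", cv=5)
cv_scores.mean()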
Models
See parameters
estimator.get_params()
Linear models
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression, LogisticRegressionCV
log_model = LogisticRegression(solver="saga", multi_class="ovr", max_iter=5000)
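The CV variants pick the regularization strength themselves; a quick sketch with RidgeCV (the alpha grid here is an assumption):
ridge = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0]).fit(X_train, y_train)
ridge.alpha_  # selected regularization strength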
Clustering
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier  # k-NN lives in sklearn.neighbors, not sklearn.cluster
km = KMeans(n_clusters=10, init='k-means++', n_init=100, max_iter=1000)
km.inertia_ # within cluster sum of squares (lower value means data points are closer)
knn = KNeighborsClassifier()
knn.get_params().keys()
agg_k = AgglomerativeClustering(n_clusters=3)
agg_clusters = agg_k.fit_predict(data)
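A common way to choose n_clusters is to plot inertia_ against k and look for the elbow; a sketch, assuming data is already scaled:
inertias = [KMeans(n_clusters=k, n_init=10).fit(data).inertia_ for k in range(1, 11)]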
One Class SVM
from sklearn.svm import OneClassSVM
ocsvm = OneClassSVM(nu=0.001)
SVM Classifier
from sklearn.svm import SVC
svc = SVC(kernel='linear', C=1000, gamma=0.01)  # gamma is ignored by the linear kernel
SVM Regressor
from sklearn.svm import SVR, LinearSVR
Decision tree
from sklearn.tree import DecisionTreeClassifier
DecisionTreeClassifier(max_depth=2, max_leaf_nodes=3, criterion="entropy")
Isolation Forest
from sklearn.ensemble import IsolationForest
isofor = IsolationForest(contamination=0.001)
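Both OneClassSVM and IsolationForest share the outlier-detection API; fit_predict labels each row:
labels = isofor.fit_predict(X)  # -1 = anomaly, 1 = normal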
Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_jobs=-1)
Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=10, max_features="sqrt", random_state=101)  # max_features="auto" was removed in scikit-learn 1.3
AdaBoost
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()
Hist Gradient Boosting Classifier
# enable_hist_gradient_boosting is only needed for scikit-learn < 1.0
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
hgbc = HistGradientBoostingClassifier(max_iter=100, verbose=1)
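Histogram-based gradient boosting handles missing values natively, so no imputation step is needed:
hgbc.fit(X_train, y_train)  # NaNs in X_train are supported out of the box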
Hyper-parameter tuning
from sklearn.model_selection import GridSearchCV
param_grid = dict(n_estimators=[100, 200], n_jobs=[-1])  # e.g. for a random forest
param_grid = {  # e.g. for an SVR; epsilon applies to regressors only
    "C": [0.001, 0.01, 0.1, 0.5, 1],
    "kernel": ["linear", "rbf", "poly"],
    "gamma": ["scale", "auto"],
    "degree": [2, 3, 4],
    "epsilon": [0, 0.01, 0.1, 0.5, 1, 2],
}
search = GridSearchCV(estimator, param_grid=param_grid, cv=5, verbose=2)
search.fit(X, y)
search.best_estimator_
search.best_params_
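For large grids, RandomizedSearchCV samples a fixed number of parameter combinations instead of trying them all; a sketch:
from sklearn.model_selection import RandomizedSearchCV
search = RandomizedSearchCV(estimator, param_distributions=param_grid, n_iter=20, cv=5, verbose=2, random_state=42)
search.fit(X, y)
search.best_score_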
Pipeline
X has 10 columns: categorical columns 0-5 and numeric columns 6-9
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression
import numpy as np
# Define preprocessing for numeric columns (scale them)
numeric_features = [6, 7, 8, 9]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # uses the SimpleImputer imported above
    ('scaler', StandardScaler())])
# Define preprocessing for categorical features (encode them)
categorical_features = [0, 1, 2, 3, 4, 5]
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder='passthrough')
# Create preprocessing and training pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', LinearRegression())])  # the normalize argument was removed in scikit-learn 1.2
# fit the pipeline to train a linear regression model on the training set
model = pipeline.fit(X_train, y_train)
# Pipeline step parameters use the step-name__parameter syntax, e.g. for a step named 'knn':
param_grid = {'knn__n_neighbors': k_values}
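Putting that together, the pipeline above can be tuned directly; the parameter and grid here are illustrative assumptions:
search = GridSearchCV(pipeline, param_grid={'regressor__fit_intercept': [True, False]}, cv=5)
search.fit(X_train, y_train)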
Feature importance
Regression
feat_imp = pd.DataFrame({'importance': clf.feature_importances_})
feat_imp['feature'] = X.columns
feat_imp = feat_imp.sort_values('importance')
Classifier
feat_imp = pd.DataFrame({'importance':clf.feature_importances_})
feat_imp['feature'] = X.columns
feat_imp = feat_imp.sort_values('importance', ascending=False)
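Impurity-based importances can favour high-cardinality features; permutation importance is a model-agnostic alternative (a sketch):
from sklearn.inspection import permutation_importance
result = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=42)
result.importances_mean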
Metrics
Regression
from sklearn.metrics import (
mean_absolute_error,
mean_squared_error,
mean_absolute_percentage_error,
explained_variance_score,
max_error,
mean_squared_log_error,
median_absolute_error,
r2_score,
)
print('evs:', explained_variance_score(y_true, y_pred))
print('mse:', mean_squared_error(y_true, y_pred))
print('rmse:', mean_squared_error(y_true, y_pred, squared=False))  # scikit-learn >= 1.4: prefer root_mean_squared_error
print('mae:', mean_absolute_error(y_true, y_pred))
print('mape:', mean_absolute_percentage_error(y_true, y_pred))
print('r2:', r2_score(y_true, y_pred))
print('max_error:', max_error(y_true, y_pred))
print('msle:', mean_squared_log_error(y_true, y_pred))
print('median_ae:', median_absolute_error(y_true, y_pred))
Classification
from sklearn.metrics import (
accuracy_score,
auc,
confusion_matrix,
classification_report,
f1_score,
recall_score,
precision_recall_curve,
PrecisionRecallDisplay,
roc_curve,
ConfusionMatrixDisplay,
)
import matplotlib.pyplot as plt
accuracy_score(y_true, y_pred)
print(classification_report(y_true, y_pred, target_names=['class 1', 'class 2']))
fpr, tpr, _ = roc_curve(y_true, y_pred_prob)
roc_auc = auc(fpr, tpr)
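roc_auc_score computes the same area in one call from the predicted probabilities:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true, y_pred_prob)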
cm = confusion_matrix(y_penguin_test, penguin_predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()
Plot ROC
plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc)
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()
Plot confusion matrix
plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
plt.colorbar()
classes = clf.classes_  # class labels in the order used by confusion_matrix
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
plt.xlabel("Predicted")  # columns of the confusion matrix are predictions
plt.ylabel("Actual")  # rows are the true labels
plt.show()
Visualization
HTML repr
from sklearn import set_config
set_config(display='diagram')
Plot tree
from sklearn.tree import plot_tree
plot_tree(model, filled=True, feature_names=X.columns)