import pandas as pd

from sklearn.model_selection import train_test_split

# Simple random 70/30 split.
X_train, X_test, Y_train, Y_test = train_test_split(df[['colxs']], df[['coly']], test_size=0.3, random_state=42)
# OR: stratified split — preserves the target's class distribution in both halves.
X_train, X_test, Y_train, Y_test = train_test_split(df[['colxs']], df[['coly']], test_size=0.3, stratify=df[['coly']], random_state=42)
# Recombine training features and target side by side (e.g. for an OLS fit).
ols_data = pd.concat([X_train, Y_train], axis=1)
One Hot Encoder
from sklearn.preprocessing import OneHotEncoder

# One-hot encode a single column; drop='first' drops one dummy to avoid the
# dummy-variable trap. fit_transform returns a sparse matrix, so toarray()
# densifies it. The result is 2-D; for a binary column there is exactly one
# remaining dummy, so ravel() flattens it for assignment back into the frame.
# (For a column with >2 categories, keep the 2-D result and assign it to
# multiple new columns instead of overwriting 'col1'.)
df['col1'] = OneHotEncoder(drop='first').fit_transform(df[['col1']]).toarray().ravel()
Confusion Matrix
import sklearn.metrics as metrics
# or import the individual scorers directly:
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Rows = true labels, columns = predicted labels, ordered by model.classes_.
cm = metrics.confusion_matrix(y_test, y_predicts, labels=model.classes_)
g = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
g.plot()
# Accuracy: (TP + TN) / total predictions
metrics.accuracy_score(y_test, y_predicts)
# Precision: TP / predicted positives = TP / (TP + FP)
metrics.precision_score(y_test, y_predicts)
# Recall: TP / actual positives = TP / (TP + FN)
metrics.recall_score(y_test, y_predicts)
# F1: harmonic mean of precision and recall = 2 / (1/precision + 1/recall)
metrics.f1_score(y_test, y_predicts)
Classification Report
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 plus support, with macro and weighted averages.
print(classification_report(y_test,y_predicts))
ROC (receiver operating characteristic) Curve
from sklearn.metrics import RocCurveDisplay

# ROC needs continuous scores (class probabilities or decision values), not
# hard 0/1 predictions — with hard labels the "curve" collapses to one point.
# NOTE(review): assumes `model` and `X_test` from the snippets above.
y_scores = model.predict_proba(X_test)[:, 1]
RocCurveDisplay.from_predictions(y_test, y_scores)
plt.show()
# AUC (area under the ROC curve), also computed from the scores.
metrics.roc_auc_score(y_test, y_scores)
from sklearn.naive_bayes import GaussianNB

# Stratified split so the class balance matches in train and test.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, stratify=Y)
gnb = GaussianNB()
gnb.fit(X_train, Y_train)
Y_predicts = gnb.predict(X_test)
# Note: precision/recall/f1 default to binary classification (pos_label=1).
print('Accuracy:', accuracy_score(Y_test, Y_predicts))
print('Precision:', precision_score(Y_test, Y_predicts))
print('Recall:', recall_score(Y_test, Y_predicts))
print('F1 Score:', f1_score(Y_test, Y_predicts))
MinMaxScaler (normalizes value to [0, 1])
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to [0, 1]: (x - min) / (max - min).
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler  # transform to standard score

# Transform X to standard score (zero mean, unit variance per feature):
# K-means is distance-based, so unscaled features would dominate the clustering.
X = StandardScaler().fit_transform(X)
# Instantiate the model with k=3 clusters.
# NOTE(review): no random_state is set, so centroid initialization (and hence
# labels/inertia) can vary between runs — pass random_state=... to reproduce.
kmeans3 = KMeans(n_clusters=3)
kmeans3.fit(X)
print('Clusters: ', kmeans3.labels_)
print('Inertia: ', kmeans3.inertia_)
K-Means Inertia Function
def kmeans_inertia(num_clusters, X):
    """Fit a KMeans model for each candidate cluster count and collect inertias.

    Args:
        num_clusters: iterable of candidate k values.
        X: feature matrix (already scaled).

    Returns:
        List of inertia values (within-cluster sum of squares), one per k,
        in the same order as num_clusters — used to draw the elbow graph.
    """
    inertia = []
    for num in num_clusters:
        kms = KMeans(n_clusters=num)
        kms.fit(X)
        inertia.append(kms.inertia_)
    return inertia
Elbow Graph
# Elbow graph: inertia always decreases as k grows; choose the k where the
# drop flattens out (the "elbow"). num_clusters/inertia come from kmeans_inertia.
g = sns.lineplot(x=num_clusters, y=inertia, marker='o')
g.set_xlabel('Number of Clusters')
g.set_ylabel('Inertia')
K-Means Silhouette Score Function
def kmeans_silhouette(num_clusters, X):
    """Fit a KMeans model for each candidate k and collect silhouette scores.

    Args:
        num_clusters: iterable of candidate k values (each must be >= 2).
        X: feature matrix (already scaled).

    Returns:
        List of mean silhouette scores (range -1..1, higher is better),
        one per k, in the same order as num_clusters.
    """
    silhouette = []
    for num in num_clusters:
        kms = KMeans(n_clusters=num)
        kms.fit(X)
        silhouette.append(silhouette_score(X, kms.labels_))
    return silhouette
Silhouette Score Graph
# Silhouette plot: unlike inertia, higher is better (max 1); pick the k with
# the highest average silhouette score. Values come from kmeans_silhouette.
g = sns.lineplot(x=num_clusters, y=silhouette, marker='o')
g.set_xlabel('Number of Clusters')
g.set_ylabel('Silhouette Scores')
Decision Tree (Note: prone to overfitting)
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

decision_tree = DecisionTreeClassifier(random_state=0)
# Decision trees expect numeric data; they also do not support missing values.
decision_tree.fit(X_train, Y_train)
Y_predicts = decision_tree.predict(X_test)
print("Accuracy:", "%.3f" % accuracy_score(Y_test, Y_predicts))
print("Precision:", "%.3f" % precision_score(Y_test, Y_predicts))
print("Recall:", "%.3f" % recall_score(Y_test, Y_predicts))
print("F1:", "%.3f" % f1_score(Y_test, Y_predicts))

# Tree plotting
plot_tree(decision_tree, max_depth=5, feature_names=X.columns)
# or, with class names and colored nodes.
# "Yes" branch goes left, "No" goes right; gini runs from 0 (pure) to 0.5.
# class_names must be a list of strings (in class order), not a dict.
plot_tree(decision_tree, max_depth=3, feature_names=X.columns,
          class_names=['class_0', 'class_1'], filled=True)
plt.show()
Grid Search Cross Validation
from sklearn.model_selection import GridSearchCV

tuned_decision_tree = DecisionTreeClassifier(random_state=42)
tree_para = {'max_depth': [4, 5, 6],
             'min_samples_leaf': [2, 3, 4]}
# Multi-metric scoring: pass a list of scorer names; `refit` picks which
# metric selects best_estimator_.
scores = ['accuracy', 'precision', 'recall', 'f1']
clf = GridSearchCV(tuned_decision_tree,
                   tree_para,
                   scoring=scores,
                   cv=5,
                   refit="f1")
clf.fit(X_train, Y_train)
clf.best_estimator_  # the model refit on all training data with the best params
clf.best_score_      # mean cross-validated f1 of that model

# Best-tree plotting
plot_tree(clf.best_estimator_, max_depth=5, feature_names=X.columns)
plt.show()
from sklearn.ensemble import RandomForestClassifier
# OR, for regression targets:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestClassifier(random_state=0)
# Define the hyperparameter grid to search, e.g.:
cv_params = {'max_depth': [None, 5, 10],
             'n_estimators': [100, 300]}
scores = ['accuracy', 'precision', 'recall', 'f1']
rf_cv = GridSearchCV(rf, cv_params, scoring=scores, cv=5, refit='f1')
rf_cv.fit(X_train, Y_train)
rf_cv.best_params_
rf_cv.best_score_
# Create a held-out validation set from the training data.
X_tr, X_val, Y_tr, Y_val = train_test_split(X_train, Y_train, test_size=0.3, stratify=Y_train, random_state=10)
# Mark each training row: 0 = belongs to the validation fold, -1 = always train.
split_index = [0 if x in X_val.index else -1 for x in X_train.index]

from sklearn.model_selection import PredefinedSplit

rf = RandomForestClassifier(random_state=0)
# Define the hyperparameter grid to search, e.g.:
cv_params = {'max_depth': [None, 5, 10],
             'n_estimators': [100, 300]}
scores = ['accuracy', 'precision', 'recall', 'f1']
# PredefinedSplit yields exactly one train/validation split instead of k folds.
new_split = PredefinedSplit(split_index)
rf_val = GridSearchCV(rf, cv_params, scoring=scores, cv=new_split, refit='f1')
rf_val.fit(X_train, Y_train)
rf_val.best_params_
rf_val.best_score_
from xgboost import XGBClassifier
# OR, for regression targets:
from xgboost import XGBRegressor
from xgboost import plot_importance

xgb = XGBClassifier(objective='binary:logistic', random_state=0)
# Define the hyperparameter grid to search, e.g.:
cv_params = {'max_depth': [3, 5],
             'learning_rate': [0.1, 0.3],
             'n_estimators': [100, 300]}
scores = ['accuracy', 'precision', 'recall', 'f1']
xgb_cv = GridSearchCV(xgb, cv_params, scoring=scores, cv=5, refit='f1')
xgb_cv.fit(X_train, Y_train)
xgb_cv.best_params_
xgb_cv.best_score_

# Testing on the held-out set
Y_predicts = xgb_cv.predict(X_test)
print("Accuracy:", "%.3f" % accuracy_score(Y_test, Y_predicts))
print("Precision:", "%.3f" % precision_score(Y_test, Y_predicts))
print("Recall:", "%.3f" % recall_score(Y_test, Y_predicts))
print("F1:", "%.3f" % f1_score(Y_test, Y_predicts))

# Confusion matrix (uses the same Y_test/Y_predicts computed above)
cm = metrics.confusion_matrix(Y_test, Y_predicts, labels=xgb_cv.classes_)
g = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=xgb_cv.classes_)
g.plot()

# Feature importance of the best estimator found by the grid search
plot_importance(xgb_cv.best_estimator_)