Code
Link to Google Colab notebook: https://colab.research.google.com/drive/1yePOmMll8h2vloGtZQ6a7S6TY_jy4AD1?usp=sharing
Link to Breast Cancer Data: https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
!pip install scikit-learn==1.0
from sklearn import tree
from sklearn import svm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import plot_roc_curve
import graphviz
# Load the raw Wisconsin breast cancer data set.
df_cancer_unedited = pd.read_csv('data.csv')
# Data clean up: drop the all-NaN trailing column and the patient id,
# then split the features (df_X) from the diagnosis labels (s_y).
df_cancer = df_cancer_unedited.drop(columns='Unnamed: 32')
df_blinded = df_cancer.drop(columns='id')
df_X = df_blinded.drop(columns="diagnosis")
s_y = df_blinded["diagnosis"]
# Cross validation helper method
def partition_data( df_X, s_y, k ):
    """Randomly split df_X and s_y into k folds.

    Each row is independently assigned a fold id drawn uniformly from
    1..k, so fold sizes vary slightly. Returns a pair of dicts keyed by
    fold id (1..k): one with the feature rows, one with matching labels.
    """
    rand_indx = np.random.randint(low=1, high=k+1, size=len(df_X))
    fold_dict_X = {fold: df_X[rand_indx == fold] for fold in range(1, k + 1)}
    fold_dict_Y = {fold: s_y[rand_indx == fold] for fold in range(1, k + 1)}
    return (fold_dict_X, fold_dict_Y)
# Build 5 random folds once; every cross-validation loop below reuses them.
(dict_k_df_X, dict_k_s_y) = partition_data( df_X, s_y, 5 )
error_list = []
from numpy.ma.extras import average
from numpy import mean
# 5-fold cross validation of a Gini-criterion decision tree.
# For each fold: train on every row NOT in the fold, test on the fold,
# record the misclassification error, and overlay the fold's ROC curve.
dx = pd.DataFrame()  # fold-3 (i == 2) training features, reused for the tree plot below
dy = pd.DataFrame()  # fold-3 training labels
i = 0
for k in dict_k_df_X:
    test = pd.DataFrame(dict_k_df_X[k])
    test_y = pd.DataFrame(dict_k_s_y[k]).to_numpy()
    # Remove the whole test fold in one call; the original dropped rows
    # one at a time, which copied the dataframe per row (O(n^2)).
    train_x = df_X.drop(index=test.index)
    train_y = s_y.drop(index=test.index)
    if i == 2:
        dx = train_x
        dy = train_y
    clf = tree.DecisionTreeClassifier(max_depth=5)
    clf = clf.fit(train_x, train_y)
    prediction_array = clf.predict(test)
    error = get_error(prediction_array, dict_k_s_y[k].to_numpy())
    error_list.append(error)
    # The first fold creates the figure; later folds draw on the same axes.
    if i == 0:
        f1 = plot_roc_curve(clf, test, test_y, name='ROC fold {}'.format(i + 1))
    else:
        plot_roc_curve(clf, test, test_y, name='ROC fold {}'.format(i + 1), ax=f1.ax_)
    i += 1
print(mean(error_list))
# Print out a decision tree for the Gini index criterion, trained on the
# fold-3 training split captured above (dx/dy).
dX_feature_names = list(dx.columns)
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(dx, dy)
import graphviz
# export_graphviz maps class_names positionally onto the classes in
# ascending sorted order ('B', 'M'), so the names must be given in that
# order — ['M', 'B'] labeled every node with the opposite class.
class_name = ['B', 'M']
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=dX_feature_names, filled=True, rounded=True, special_characters=True, class_names=class_name)
graph = graphviz.Source(dot_data)
graph
error_list = []
from numpy.ma.extras import average
from numpy import mean
# 5-fold cross validation of an entropy-criterion decision tree,
# mirroring the Gini loop above.
i = 0
dx = pd.DataFrame()
dy = pd.DataFrame()
for k in dict_k_df_X:
    test = pd.DataFrame(dict_k_df_X[k])
    test_y = pd.DataFrame(dict_k_s_y[k])
    # Remove the whole test fold at once (O(n) instead of per-row O(n^2)).
    train_x = df_X.drop(index=test.index)
    train_y = s_y.drop(index=test.index)
    # Snapshot fold 3's TRAINING split for the tree plot below. The
    # original captured train_x/train_y before the test rows were removed,
    # so the plotted entropy tree was fit on the full data set —
    # inconsistent with the Gini version; snapshot after the split instead.
    if i == 2:
        dx = train_x
        dy = train_y
    clf = tree.DecisionTreeClassifier(max_depth=5, criterion='entropy')
    clf = clf.fit(train_x, train_y)
    prediction_array = clf.predict(test)
    error = get_error(prediction_array, dict_k_s_y[k].to_numpy())
    error_list.append(error)
    if i == 0:
        f1 = plot_roc_curve(clf, test, test_y, name='ROC fold {}'.format(i + 1))
    else:
        plot_roc_curve(clf, test, test_y, name='ROC fold {}'.format(i + 1), ax=f1.ax_)
    i += 1
print(mean(error_list))
# Print out a decision tree for entropy, trained on the fold-3 training
# split captured above (dx/dy).
dX_feature_names = list(dx.columns)
clf = tree.DecisionTreeClassifier(max_depth=5, criterion='entropy')
clf = clf.fit(dx, dy)
import graphviz
# class_names must be listed in ascending class order ('B', 'M') per the
# sklearn export_graphviz contract; ['M', 'B'] swapped the node labels.
class_name = ['B', 'M']
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=dX_feature_names, filled=True, rounded=True, special_characters=True, class_names=class_name)
graph = graphviz.Source(dot_data)
graph
def get_error(prediction, real_values):
    """Return the misclassification rate of prediction vs real_values.

    prediction : ndarray of predicted labels.
    real_values : ndarray of true labels, same length.
    Returns the fraction of positions where the two disagree (0.0..1.0).
    """
    # Vectorized replacement for the original element-by-element counting loop.
    return float(np.mean(prediction != real_values))
def error_knn(prediction, real_values):
    """Fraction of predictions disagreeing with real_values (list-friendly
    variant of get_error used for the KNN experiments)."""
    total = len(prediction)
    wrong = total
    for predicted, actual in zip(prediction, real_values):
        if predicted == actual:
            wrong -= 1
    return wrong / total
# Min-max normalize every feature column to [0, 1]; each column's
# (min, max) pair is kept in dict_normalize so a new observation could be
# scaled the same way later.
dict_normalize = dict()
df_normalized = df_blinded.copy()
for col in df_normalized.columns:
    if col != "diagnosis":
        # Renamed from min/max: the originals shadowed the builtins.
        col_min = df_normalized[col].min()
        col_max = df_normalized[col].max()
        dict_normalize[col] = (col_min, col_max)
        df_normalized[col] = (df_normalized[col] - col_min) / (col_max - col_min)
# KNN method
def knn_class(df_train, k, target_col, observation, use_weighted_vote):
    """Classify one observation with k-nearest-neighbours.

    df_train : DataFrame holding the feature columns plus target_col.
    k : number of neighbours to consult.
    target_col : name of the label column (values 'B' / 'M').
    observation : Series of feature values aligned with df_train's columns.
    use_weighted_vote : if True, vote with 1/d^2 weights; else majority vote.
    Returns the winning class label.
    """
    df_attr = df_train.drop(target_col, axis=1)
    df_dist = df_train.copy()
    # Euclidean distance from the observation to every training row.
    df_dist['Distance'] = (df_attr - observation).apply(np.linalg.norm, axis=1)
    # Inverse-square-distance weight; a zero distance yields inf, which
    # lets an exact duplicate dominate the weighted vote.
    df_dist["Weight"] = (1/(df_dist['Distance']*df_dist['Distance']))
    df_dist = df_dist.sort_values(by='Distance')
    df_k = df_dist[0:k].reset_index()
    counts = df_k[target_col].value_counts()
    # majority voting
    if not use_weighted_vote:
        return counts.idxmax()
    # weighted voting
    b_weight = df_k[df_k[target_col] == 'B']['Weight'].sum()
    m_weight = df_k[df_k[target_col] == 'M']['Weight'].sum()
    if (b_weight > m_weight):
        return 'B'
    elif (m_weight > b_weight):
        return 'M'
    # BUG FIX: the original fell off the end and returned None on an exact
    # weight tie; fall back to the plain majority vote instead.
    return counts.idxmax()
poss_k = [1,3,13,25,50,100] # candidate neighbourhood sizes
error_k_majority = list(np.zeros(len(poss_k))) # mean CV error per k, majority voting
error_k_weighted = list(np.zeros(len(poss_k))) # mean CV error per k, weighted voting
# Cross validation on the normalized data for every candidate k.
df_X_2 = df_normalized.drop("diagnosis", axis = 1)
s_y_2 = df_normalized["diagnosis"]
(dict_k_df_X_norm, dict_k_s_y_norm) = partition_data( df_X_2, s_y_2, 5 )
for p in range(len(poss_k)):
    error_weighted = []
    error_majority = []
    for k in dict_k_df_X_norm:
        test = pd.DataFrame(dict_k_df_X_norm[k])
        test_y = pd.DataFrame(dict_k_s_y_norm[k])
        # Remove the whole test fold in one call; the original dropped
        # rows one at a time (quadratic in dataframe copies).
        train_x_norm = df_X_2.drop(index=test.index)
        train_y_norm = s_y_2.drop(index=test.index)
        training_total = train_x_norm.copy()
        training_total['diagnosis'] = train_y_norm
        # weighted voting
        predictions_knn1 = [
            knn_class(training_total, poss_k[p], 'diagnosis', test.iloc[i], True)
            for i in range(len(test))
        ]
        error_weighted.append(error_knn(predictions_knn1, test_y.to_numpy()))
        # majority voting
        predictions_knn2 = [
            knn_class(training_total, poss_k[p], 'diagnosis', test.iloc[i], False)
            for i in range(len(test))
        ]
        error_majority.append(error_knn(predictions_knn2, test_y.to_numpy()))
    error_k_weighted[p] = mean(error_weighted)
    error_k_majority[p] = mean(error_majority)
plt.plot(poss_k, error_k_majority, color='green', label="Majority Voting", marker='o')
plt.plot(poss_k, error_k_weighted, color='red', label="Weighted Voting", marker='o')
plt.title("Error on Test Set of Varied K")
plt.xlabel("k")
plt.ylabel("Error")
plt.legend()
plt.show()
# Calculate prior probabilities P(class) from the label frequencies.
dict_priors = dict()
types = df_blinded['diagnosis'].value_counts(dropna=True)
total = types.sum()
# Iterate (label, count) pairs directly; the original's positional
# types[i] lookup on a Series is ambiguous (label-based fallback) and
# deprecated in modern pandas.
for label, count in types.items():
    dict_priors[label] = count / total
# For each class, fit a per-column Gaussian: dict_nb_model[class][column]
# holds that column's (mean, std) within the class subset.
dict_nb_model = dict()
for target in dict_priors.keys():
    dict_inner = dict()
    type_subset = df_blinded[df_blinded["diagnosis"] == target]
    type_subset = type_subset.drop(columns='diagnosis')
    # .items() replaces DataFrame.iteritems() (removed in pandas 2.0).
    # The mean local was also renamed to mu: the original `mean = ...`
    # shadowed the numpy `mean` imported earlier, breaking the later
    # print(mean(...)) calls in the GNB and SVM sections.
    for col, _values in type_subset.items():
        mu = type_subset[col].mean()
        std = type_subset[col].std(skipna=True)
        dict_inner[col] = (mu, std)
    dict_nb_model[target] = dict_inner
# Gaussian probability density
def get_p( mu, sigma, x):
    """Evaluate the normal pdf N(mu, sigma^2) at x."""
    z = (x - mu) / sigma
    coeff = 1.0 / (sigma * np.sqrt(2.0 * np.pi))
    return coeff * np.exp(-0.5 * z * z)
# Naive Bayes Function
def nb_class( dict_priors, dict_nb_model, observation):
    """Return the most probable class for one observation.

    dict_priors : {class_label: prior probability}.
    dict_nb_model : {class_label: {column: (mean, std)}}.
    observation : Series of attribute values; a 'diagnosis' entry and any
        NaN attributes are skipped.
    Returns the class label with the highest posterior (up to evidence).
    """
    dict_probs = dict()
    # Score the observation under each class's Gaussian model.
    for type_val in dict_priors.keys():
        prob = dict_priors[type_val]
        # Iterate (label, value) pairs; the original's positional
        # observation[i] indexing misbehaves when index labels are ints.
        for attr, value in observation.items():
            if attr != 'diagnosis' and not pd.isna(value):
                mu, sigma = dict_nb_model[type_val][attr]
                prob *= get_p(mu, sigma, value)
        dict_probs[type_val] = prob
    return max(dict_probs, key = dict_probs.get)
# Cross-validate sklearn's GaussianNB with the same 5 folds.
gnb_error = []
from numpy.ma.extras import average
from numpy import mean
i = 0
for k in dict_k_df_X:
    test = pd.DataFrame(dict_k_df_X[k])
    test_y = pd.DataFrame(dict_k_s_y[k])
    # Remove the whole test fold at once (O(n) instead of per-row O(n^2)).
    train_x = df_X.drop(index=test.index)
    train_y = s_y.drop(index=test.index)
    gnb = GaussianNB()
    # BUG FIX: the original fit on the FULL data set (df_X, s_y), so every
    # test fold was also part of its own training data (leakage) and the
    # reported CV error was optimistic. Fit on the held-out split, as the
    # other cross-validation loops do.
    gnb = gnb.fit(train_x, train_y)
    prediction_array = gnb.predict(test)
    error = (get_error(prediction_array, dict_k_s_y[k].to_numpy()))
    gnb_error.append(error)
    if i == 0:
        f1 = plot_roc_curve(gnb, test, test_y, name='ROC fold {}'.format(i + 1))
    else:
        plot_roc_curve(gnb, test, test_y, name='ROC fold {}'.format(i + 1), ax=f1.ax_)
    i += 1
print(mean(gnb_error))
error_list = []
# 5-fold cross validation of a default (RBF-kernel) SVC on the same folds.
i = 0
for k in dict_k_df_X:
    test = pd.DataFrame(dict_k_df_X[k])
    test_y = pd.DataFrame(dict_k_s_y[k])
    # Remove the whole test fold in one call; the original dropped rows
    # one at a time, copying the dataframe per row (O(n^2)).
    train_x = df_X.drop(index=test.index)
    train_y = s_y.drop(index=test.index)
    clf = svm.SVC()
    clf = clf.fit(train_x, train_y)
    prediction_array = clf.predict(test)
    error = get_error(prediction_array, dict_k_s_y[k].to_numpy())
    error_list.append(error)
    if i == 0:
        f1 = plot_roc_curve(clf, test, test_y, name='ROC fold {}'.format(i + 1))
    else:
        plot_roc_curve(clf, test, test_y, name='ROC fold {}'.format(i + 1), ax=f1.ax_)
    i += 1
print(mean(error_list))