Code

Link to Google Colab notebook: https://colab.research.google.com/drive/1yePOmMll8h2vloGtZQ6a7S6TY_jy4AD1?usp=sharing

Link to Breast Cancer Data: https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data


!pip install scikit-learn==1.0  # pin an older release: plot_roc_curve was removed in scikit-learn 1.2

from sklearn import tree
from sklearn import svm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import plot_roc_curve
from numpy import mean
import graphviz

df_cancer_unedited = pd.read_csv('data.csv')

# Data clean-up: drop the empty trailing column and the patient id,
# then split the features from the diagnosis label.
df_cancer = df_cancer_unedited.drop('Unnamed: 32', axis=1)
df_blinded = df_cancer.drop('id', axis=1)
df_X = df_blinded.drop('diagnosis', axis=1)
s_y = df_blinded['diagnosis']

# Cross-validation helper: randomly assigns each row to one of k folds.
# Note that np.random.randint gives folds of roughly, not exactly, equal size.
def partition_data(df_X, s_y, k):
    rand_indx = np.random.randint(low=1, high=k + 1, size=len(df_X))
    fold_dict_X = dict()
    fold_dict_Y = dict()
    for i in range(1, k + 1):
        fold_dict_X[i] = df_X[rand_indx == i]
        fold_dict_Y[i] = s_y[rand_indx == i]
    return (fold_dict_X, fold_dict_Y)

(dict_k_df_X, dict_k_s_y) = partition_data(df_X, s_y, 5)
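Since the folds come from independent random draws, a quick sanity check (not in the original notebook) confirms they partition the data, i.e. every row lands in exactly one fold:

# Illustrative check: fold indices are disjoint and cover the whole dataset.
all_idx = pd.concat([dict_k_df_X[k] for k in dict_k_df_X]).index
assert len(all_idx) == len(df_X) and all_idx.is_unique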


# 5-fold cross-validation for a Gini-index decision tree.
# get_error is defined later in this listing; in the notebook that cell runs first.
error_list = []
dx = pd.DataFrame()
dy = pd.DataFrame()
i = 0
for k in dict_k_df_X:
    # The current fold is the test set; the training set is everything else.
    test = pd.DataFrame(dict_k_df_X[k])
    test_y = dict_k_s_y[k].to_numpy()
    train_x = df_X.copy()
    train_y = s_y.copy()
    for j in range(0, len(test)):
        row = test.iloc[j]
        train_x = train_x.drop(row.name)
        train_y = train_y.drop(row.name)
    # Keep the third fold's training split for the tree plotted below.
    if i == 2:
        dx = train_x
        dy = train_y
    clf = tree.DecisionTreeClassifier(max_depth=5)
    clf = clf.fit(train_x, train_y)
    prediction_array = clf.predict(test)
    error = get_error(prediction_array, dict_k_s_y[k].to_numpy())
    error_list.append(error)
    # Draw all five ROC curves onto one shared axis.
    if i == 0:
        f1 = plot_roc_curve(clf, test, test_y, name='ROC fold {}'.format(i + 1))
    else:
        plot_roc_curve(clf, test, test_y, name='ROC fold {}'.format(i + 1), ax=f1.ax_)
    i += 1

print(mean(error_list))


# Print out a decision tree for the Gini index.
dX_feature_names = list(dx.columns)
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(dx, dy)

# export_graphviz expects class_names in the order of clf.classes_,
# which is sorted alphabetically: ['B', 'M'].
class_name = ['B', 'M']
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=dX_feature_names,
                                filled=True, rounded=True, special_characters=True,
                                class_names=class_name)
graph = graphviz.Source(dot_data)
graph
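In Colab the Source object renders inline; graphviz can also write the figure to disk (the file name here is arbitrary):

graph.render('tree_gini', format='pdf', cleanup=True)  # saves tree_gini.pdf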

# 5-fold cross-validation for an entropy (information gain) decision tree.
error_list = []
i = 0
dx = pd.DataFrame()
dy = pd.DataFrame()
for k in dict_k_df_X:
    test = pd.DataFrame(dict_k_df_X[k])
    test_y = dict_k_s_y[k].to_numpy()
    train_x = df_X.copy()
    train_y = s_y.copy()
    for j in range(0, len(test)):
        row = test.iloc[j]
        train_x = train_x.drop(row.name)
        train_y = train_y.drop(row.name)
    # Keep the third fold's training split for the tree plotted below
    # (after the test rows have been dropped, matching the Gini run above).
    if i == 2:
        dx = train_x
        dy = train_y
    clf = tree.DecisionTreeClassifier(max_depth=5, criterion='entropy')
    clf = clf.fit(train_x, train_y)
    prediction_array = clf.predict(test)
    error = get_error(prediction_array, dict_k_s_y[k].to_numpy())
    error_list.append(error)
    if i == 0:
        f1 = plot_roc_curve(clf, test, test_y, name='ROC fold {}'.format(i + 1))
    else:
        plot_roc_curve(clf, test, test_y, name='ROC fold {}'.format(i + 1), ax=f1.ax_)
    i += 1

print(mean(error_list))


# Print out a decision tree for entropy.
dX_feature_names = list(dx.columns)
clf = tree.DecisionTreeClassifier(max_depth=5, criterion='entropy')
clf = clf.fit(dx, dy)

class_name = ['B', 'M']  # matches the alphabetical order of clf.classes_
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=dX_feature_names,
                                filled=True, rounded=True, special_characters=True,
                                class_names=class_name)
graph = graphviz.Source(dot_data)
graph

# Misclassification rate: fraction of predictions that disagree with the labels.
def get_error(prediction, real_values):
    correct = 0
    size = prediction.size
    for i in np.arange(0, size):
        if prediction[i] == real_values[i]:
            correct += 1
    return (size - correct) / size
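The same misclassification rate falls out of a single vectorized comparison; a minimal equivalent, assuming both arguments can be coerced to 1-D arrays:

def get_error_vectorized(prediction, real_values):
    # Mean of the boolean mismatch mask == fraction misclassified.
    prediction = np.asarray(prediction).ravel()
    real_values = np.asarray(real_values).ravel()
    return np.mean(prediction != real_values)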


# Same misclassification rate, for plain Python lists of predictions.
def error_knn(prediction, real_values):
    correct = 0
    size = len(prediction)
    for i in np.arange(0, size):
        if prediction[i] == real_values[i]:
            correct += 1
    return (size - correct) / size


# Min-max normalize every feature to [0, 1], remembering each column's
# (min, max) so the same scaling can be applied to new observations.
dict_normalize = dict()
df_normalized = df_blinded.copy()
for col in df_normalized.columns:
    if col != 'diagnosis':
        col_min = df_normalized[col].min()
        col_max = df_normalized[col].max()
        dict_normalize[col] = (col_min, col_max)
        df_normalized[col] = (df_normalized[col] - col_min) / (col_max - col_min)
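dict_normalize exists so the training scaling can be replayed on unseen rows; a sketch of that replay (the helper name is hypothetical):

def normalize_observation(obs, dict_normalize):
    # obs is a Series of raw feature values; scale each with the stored (min, max).
    obs = obs.copy()
    for col, (col_min, col_max) in dict_normalize.items():
        obs[col] = (obs[col] - col_min) / (col_max - col_min)
    return obs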


# k-nearest-neighbours classifier for a single observation.
def knn_class(df_train, k, target_col, observation, use_weighted_vote):
    df_attr = df_train.drop(target_col, axis=1)
    df_dist = df_train.copy()
    # Euclidean distance from the observation to every training row.
    df_dist['Distance'] = (df_attr - observation).apply(np.linalg.norm, axis=1)
    # Inverse-square-distance weights (a zero distance produces an inf weight,
    # so an exact duplicate effectively decides the vote).
    df_dist['Weight'] = 1 / (df_dist['Distance'] * df_dist['Distance'])
    df_dist = df_dist.sort_values(by='Distance')
    df_k = df_dist[0:k].reset_index()
    counts = df_k[target_col].value_counts()
    if not use_weighted_vote:
        # Majority voting.
        return counts.idxmax()
    else:
        # Weighted voting: sum the weights of each class among the k neighbours.
        b_weight = df_k[df_k[target_col] == 'B']['Weight'].sum()
        m_weight = df_k[df_k[target_col] == 'M']['Weight'].sum()
        if b_weight > m_weight:
            return 'B'
        elif m_weight > b_weight:
            return 'M'
        else:
            # Exact tie: fall back to the majority vote.
            return counts.idxmax()
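An illustrative single-observation call, leaving the first row out of the normalized data and classifying it against the rest:

sample = df_normalized.drop('diagnosis', axis=1).iloc[0]
rest = df_normalized.drop(df_normalized.index[0])
print(knn_class(rest, 13, 'diagnosis', sample, use_weighted_vote=True))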


poss_k = [1, 3, 13, 25, 50, 100]                # candidate values of k
error_k_majority = list(np.zeros(len(poss_k)))  # mean error per k, majority voting
error_k_weighted = list(np.zeros(len(poss_k)))  # mean error per k, weighted voting

# Cross-validation on the normalized data.
df_X_2 = df_normalized.drop('diagnosis', axis=1)
s_y_2 = df_normalized['diagnosis']
(dict_k_df_X_norm, dict_k_s_y_norm) = partition_data(df_X_2, s_y_2, 5)

for p in range(len(poss_k)):
    error_weighted = []
    error_majority = []
    for k in dict_k_df_X_norm:
        test = pd.DataFrame(dict_k_df_X_norm[k])
        test_y = dict_k_s_y_norm[k].to_numpy()
        train_x_norm = df_X_2.copy()
        train_y_norm = s_y_2.copy()
        for j in range(0, len(test)):
            row = test.iloc[j]
            train_x_norm = train_x_norm.drop(row.name)
            train_y_norm = train_y_norm.drop(row.name)
        # knn_class expects the label column inside the training frame.
        training_total = train_x_norm.copy()
        training_total['diagnosis'] = train_y_norm
        # Weighted voting.
        predictions_knn1 = []
        for i in range(len(test)):
            predictions_knn1.append(knn_class(training_total, poss_k[p], 'diagnosis', test.iloc[i], True))
        error_weighted.append(error_knn(predictions_knn1, test_y))
        # Majority voting.
        predictions_knn2 = []
        for i in range(len(test)):
            predictions_knn2.append(knn_class(training_total, poss_k[p], 'diagnosis', test.iloc[i], False))
        error_majority.append(error_knn(predictions_knn2, test_y))
    error_k_weighted[p] = mean(error_weighted)
    error_k_majority[p] = mean(error_majority)

plt.plot(poss_k, error_k_majority, color='green', label='Majority Voting', marker='o')
plt.plot(poss_k, error_k_weighted, color='red', label='Weighted Voting', marker='o')
plt.title('Error on Test Set for Varied k')
plt.xlabel('k')
plt.ylabel('Error')
plt.legend()
plt.show()
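As a rough cross-check on the hand-rolled classifier (not part of the original run), sklearn's KNeighborsClassifier should give a similar majority-vote error on a single fold:

from sklearn.neighbors import KNeighborsClassifier

test_fold = dict_k_df_X_norm[1]
knn = KNeighborsClassifier(n_neighbors=13)
knn.fit(df_X_2.drop(test_fold.index), s_y_2.drop(test_fold.index))
print(1 - knn.score(test_fold, dict_k_s_y_norm[1]))  # misclassification rate on fold 1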

# Calculate the prior probability of each class.
dict_priors = dict()
types = df_blinded['diagnosis'].value_counts(dropna=True)
for i in range(types.size):
    dict_priors[types.index[i]] = types.iloc[i] / types.sum()

# Per-class distribution of each numeric attribute, stored as a
# (mean, standard deviation) tuple.
dict_nb_model = dict()
for target in dict_priors.keys():
    dict_inner = dict()
    type_subset = df_blinded[df_blinded['diagnosis'] == target]
    type_subset = type_subset.drop(columns='diagnosis')
    for col in type_subset.columns:
        mu = type_subset[col].mean()
        sigma = type_subset[col].std(skipna=True)
        dict_inner[col] = (mu, sigma)
    dict_nb_model[target] = dict_inner

# Gaussian probability density.
def get_p(mu, sigma, x):
    return (1 / (sigma * np.sqrt(2 * np.pi))) * np.exp(-0.5 * np.square((x - mu) / sigma))
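get_p is just the normal density, so it can be spot-checked against scipy (scipy is not used elsewhere in this notebook):

from scipy.stats import norm

# The hand-written density should match scipy's to floating-point precision.
assert np.isclose(get_p(0.0, 1.0, 1.5), norm.pdf(1.5, loc=0.0, scale=1.0))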


# Naive Bayes classifier: multiply each class's prior by the Gaussian
# likelihood of every attribute, then return the most probable class.
def nb_class(dict_priors, dict_nb_model, observation):
    dict_probs = dict()
    for type_val in dict_priors.keys():
        prob = dict_priors[type_val]
        for i in range(observation.size):
            # Skip missing values and the label column itself.
            if not pd.isna(observation.iloc[i]) and observation.index[i] != 'diagnosis':
                mu, sigma = dict_nb_model[type_val][observation.index[i]]
                prob *= get_p(mu, sigma, observation.iloc[i])
        dict_probs[type_val] = prob
    return max(dict_probs, key=dict_probs.get)
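A minimal usage sketch, classifying the first row of df_blinded (the function skips the 'diagnosis' entry itself):

obs = df_blinded.iloc[0]
print('predicted:', nb_class(dict_priors, dict_nb_model, obs))
print('actual:   ', obs['diagnosis'])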


# 5-fold cross-validation with sklearn's GaussianNB.
gnb_error = []
i = 0
for k in dict_k_df_X:
    test = pd.DataFrame(dict_k_df_X[k])
    test_y = dict_k_s_y[k].to_numpy()
    train_x = df_X.copy()
    train_y = s_y.copy()
    for j in range(0, len(test)):
        row = test.iloc[j]
        train_x = train_x.drop(row.name)
        train_y = train_y.drop(row.name)
    gnb = GaussianNB()
    # Fit on the training folds only (fitting on df_X would leak the test fold).
    gnb = gnb.fit(train_x, train_y)
    prediction_array = gnb.predict(test)
    error = get_error(prediction_array, dict_k_s_y[k].to_numpy())
    gnb_error.append(error)
    if i == 0:
        f1 = plot_roc_curve(gnb, test, test_y, name='ROC fold {}'.format(i + 1))
    else:
        plot_roc_curve(gnb, test, test_y, name='ROC fold {}'.format(i + 1), ax=f1.ax_)
    i += 1

print(mean(gnb_error))


# 5-fold cross-validation with a support vector classifier.
error_list = []
i = 0
for k in dict_k_df_X:
    test = pd.DataFrame(dict_k_df_X[k])
    test_y = dict_k_s_y[k].to_numpy()
    train_x = df_X.copy()
    train_y = s_y.copy()
    for j in range(0, len(test)):
        row = test.iloc[j]
        train_x = train_x.drop(row.name)
        train_y = train_y.drop(row.name)
    clf = svm.SVC()
    clf = clf.fit(train_x, train_y)
    prediction_array = clf.predict(test)
    error = get_error(prediction_array, dict_k_s_y[k].to_numpy())
    error_list.append(error)
    if i == 0:
        f1 = plot_roc_curve(clf, test, test_y, name='ROC fold {}'.format(i + 1))
    else:
        plot_roc_curve(clf, test, test_y, name='ROC fold {}'.format(i + 1), ax=f1.ax_)
    i += 1

print(mean(error_list))