import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
# ---------------------------------------------------------------------------
# Load the dataset, encode the target, scale the features, plot the
# correlation heatmap and create the train/test split.
# ---------------------------------------------------------------------------
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this file exists.
data = pd.read_csv("/home/umesh/Desktop/icfoss/bcdata.csv")

# 'Unnamed: 32' and 'id' are not used as features; drop them up front.
data.drop(['Unnamed: 32', 'id'], axis=1, inplace=True)

# Show the class balance. Outside an interactive session a bare expression
# is silently discarded, so print it explicitly.
print(data['diagnosis'].value_counts())

# Encode the target: "M" -> 1, every other value -> 0.
data['diagnosis'] = (data['diagnosis'] == "M").astype(int)
y = data['diagnosis'].values
x = data.drop(['diagnosis'], axis=1)

# Per-column min-max scaling. Use the DataFrame methods rather than
# np.min / np.max: the NumPy wrappers call the reduction with axis=None,
# which newer pandas releases interpret as "reduce over both axes" (a single
# scalar) instead of one value per column.
x_min = x.min()
x_max = x.max()
x = (x - x_min) / (x_max - x_min)

# Correlation heatmap of all columns (the encoded target included), drawn on
# the axes created here instead of relying on the implicit current axes.
f, axis = plt.subplots(figsize=(18, 18))
sns.heatmap(data.corr(), annot=False, linewidths=.4, ax=axis)
plt.show()

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1)
# svm
from sklearn.svm import SVC

# Baseline: an SVC with default settings (fixed seed), scored by accuracy on
# the held-out test split.
# The estimator is deliberately not called `svm`: that name is rebound to the
# sklearn.svm module by a later import in this file.
svm_clf = SVC(random_state=1)
svm_clf.fit(x_train, y_train)

# A single pre-formatted argument prints the same text under Python 2 and 3
# (a comma-separated print(...) would show a tuple under Python 2).
print("accuracy of svm algo: %s" % svm_clf.score(x_test, y_test))
from sklearn import svm
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc

# ---------------------------------------------------------------------------
# ROC analysis of a linear-kernel SVC on the held-out test split.
# ---------------------------------------------------------------------------
# probability=True makes SVC fit an internal cross-validated calibration that
# shuffles the data; passing the seeded generator keeps the probabilities --
# and therefore the ROC curve -- reproducible between runs.
random_state = np.random.RandomState(0)
classifier = svm.SVC(kernel='linear', probability=True,
                     random_state=random_state)
probas_ = classifier.fit(x_train, y_train).predict_proba(x_test)

# Column 1 of predict_proba holds the score for the second class in
# classifier.classes_, i.e. label 1 for the 0/1-encoded target.
fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
roc_auc = auc(fpr, tpr)

# Parenthesised single-argument form is valid on both Python 2 and Python 3
# (the bare `print "..."` statement is a SyntaxError on Python 3).
print("Area under the ROC curve : %f" % roc_auc)

# Plot ROC curve
import pylab as pl
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')  # diagonal = chance-level classifier
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()