Seaborn is used for statistical data visualization, and scikit-learn (sklearn) for general data analysis.
Some example code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
np.random.seed(123)
data_dir = '/directory_to_your_data'
data = pd.read_csv(os.path.join(data_dir,'heart.csv'))
print(list(data.columns.values))
labels = data["target"]
train = data.drop(labels = ["target"],axis=1)
feature_name = list(train.columns.values)
del data
# g = sns.countplot(labels)
# labels.value_counts()
train.isnull().any().describe()
train = train.values
labels = labels.values
fig = plt.figure(figsize=(8*1.2, 6*1.5))
# plot histograms for the first 12 features
plot_train = train[:,:12]
plot_feature_name = feature_name[:12]
# for i in range(plot_train.shape[1]):
# plt.subplot(3, 4, i+1)
# f = plt.gca()
# f.axes.get_yaxis().set_visible(False)
# # f.axes.set_ylim([0, plot_train.shape[0]])
#
# # vals = np.size(plot_train.iloc[:, i].unique())
# plt.hist(plot_train[:, i])
# plt.title(plot_feature_name[i])
for i in range(plot_train.shape[1]):
    plt.subplot(3, 4, i+1)
    f = plt.gca()
    f.axes.get_yaxis().set_visible(False)
    # g = sns.countplot(plot_train[:, i])
    # note: sns.distplot is deprecated in recent seaborn; sns.histplot(plot_train[:, i]) is the modern equivalent
    g = sns.distplot(plot_train[:, i], kde=False, rug=False)
    plt.title(plot_feature_name[i])
plt.tight_layout()
plt.show()
# plt.savefig("histogram-distribution.png")
# Normalisation of the data (zero mean, unit variance per feature)
train_norm = np.zeros_like(train, dtype=float)  # float dtype so the standardised values are not truncated to integers
train_mean = np.mean(train, axis=0)
train_std = np.std(train, axis=0)
for i in range(train.shape[1]):
    temp = train[:, i] - train_mean[i]
    temp /= train_std[i]
    train_norm[:, i] = temp
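# Optional sketch (not in the original script): scikit-learn's StandardScaler performs the
# same per-feature standardisation; fitting it on the training split only (after the
# train/test split below) would avoid leaking test-set statistics into the scaling.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# train_norm = scaler.fit_transform(train)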
# fig = plt.figure(figsize=(8*1.2, 6*1.5))
# for i in range(12):
# plt.subplot(3, 4, i+1)
# f = plt.gca()
# f.axes.get_yaxis().set_visible(False)
# # g = sns.countplot(plot_train[:, i])
# g = sns.distplot(train_norm[:, i],kde=False, rug=False)
# labels_norm = labels-np.mean(labels)
# labels_norm /= np.std(labels)
from sklearn.metrics import accuracy_score
# divide the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_norm, labels, test_size=0.2, random_state=12345)
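# Optional: for an imbalanced target, passing stratify=labels keeps the class ratio the
# same in both splits, e.g.
# X_train, X_test, y_train, y_test = train_test_split(train_norm, labels, test_size=0.2, random_state=12345, stratify=labels)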
####################################### logistic regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, penalty='l2', solver='lbfgs').fit(X_train, y_train)  # multi_class='ovr' is unnecessary for a binary target and deprecated in recent scikit-learn
# evaluate on the test data
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(accuracy)
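# Optional sketch (not in the original script): inspect the fitted coefficients next to the
# feature names to see which standardised features carry the most weight
# (clf.coef_ has shape (1, n_features) for a binary target).
for name, coef in sorted(zip(feature_name, clf.coef_[0]), key=lambda x: abs(x[1]), reverse=True):
    print(f"{name}: {coef:.3f}")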
####################################### SVM
from sklearn.svm import SVC
# clf = SVC(kernel='linear').fit(X_train, y_train) # Linear SVMs and logistic regression generally perform comparably in practice
# clf = SVC(kernel='poly').fit(X_train, y_train) # 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable.
# radial basis function or Gaussian similarity function
clf = SVC(kernel='rbf').fit(X_train, y_train)
# evaluate on the test data
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(accuracy)
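# Optional sketch: the confusion matrix and per-class precision/recall give a fuller
# picture than accuracy alone.
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))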
######################################
# select features with either lasso or feature ranking (RFE) and then apply classification
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
# The commented-out block below is a RandomizedLasso (stability selection) example;
# RandomizedLasso has been removed from recent versions of scikit-learn, so it is
# kept here only for reference.
# rlasso = RandomizedLasso()
# rlasso.fit(X_train, y_train)
#
# print("Features sorted by their score:")
# print(sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), feature_name), reverse=True))
# select1 = rlasso.scores_
# priority1 = [i[0] for i in sorted(enumerate(select1), key=lambda x:x[1],reverse=True)] # descending order
# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV(cv=5)
# Set a minimum threshold on the absolute Lasso coefficients (here 0.012)
sfm = SelectFromModel(clf, threshold=0.012)
sfm.fit(X_train, y_train)
n_features = sfm.transform(X_train).shape[1]
choose = sfm.get_support()
# # Reset the threshold till the number of features equals two.
# # Note that the attribute can be set directly instead of repeatedly
# # fitting the metatransformer.
# while n_features > 2:
# sfm.threshold += 0.01
# choose = sfm.get_support()
# X_transform = sfm.transform(X_train)
# n_features = X_transform.shape[1]
X_train_choose = sfm.transform(X_train)
X_test_choose = sfm.transform(X_test)
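# Optional sketch: list which features SelectFromModel kept, using the boolean mask
# returned by get_support() above.
selected_features = [name for name, keep in zip(feature_name, choose) if keep]
print("Selected features:", selected_features)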
# from sklearn.linear_model import LinearRegression
# from sklearn.feature_selection import RFE
#
# # use linear regression as the model
# lr = LinearRegression()
# # rank features with recursive feature elimination (here keeping the top 5)
# rfe = RFE(lr, n_features_to_select=5)
# rfe.fit(X_train, y_train)
#
# print("Features sorted by their rank:")
# print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), feature_name)))
#
#
# select2 = rfe.ranking_
#
#
# priority2 = [i[0] for i in sorted(enumerate(select2), key=lambda x:x[1],reverse=False)] # ascending order
#
# Nchoose = 10
# priority_use = priority2
# X_train_choose = X_train[:,priority_use[:Nchoose]]
# X_test_choose = X_test[:,priority_use[:Nchoose]]
# run SVM again on the chosen features
clf = SVC(kernel='rbf').fit(X_train_choose, y_train)
# evaluate on the test data
y_pred = clf.predict(X_test_choose)
accuracy = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(accuracy)
######################################
# PCA and then apply classification
from sklearn.decomposition import PCA, KernelPCA
pca = PCA(n_components=12)
pca.fit(X_train)
print(pca.explained_variance_ratio_)
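# Optional sketch: the cumulative explained variance helps decide how many components to keep.
print(np.cumsum(pca.explained_variance_ratio_))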
# map to the low-dimensional space; reuse the PCA fitted on the training data for the test data
X_train_transform = pca.transform(X_train)
X_test_transform = pca.transform(X_test)
# run SVM again on the PCA-transformed features
clf = SVC(kernel='rbf').fit(X_train_transform, y_train)
# evaluate on the test data
y_pred = clf.predict(X_test_transform)
accuracy = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(accuracy)