Seaborn is used for statistical data visualization, and scikit-learn (sklearn) for general data analysis.
Some example code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
np.random.seed(123)
data_dir = '/directory_to_your_data'
data = pd.read_csv(os.path.join(data_dir,'heart.csv'))
print(list(data.columns.values))
labels = data["target"]
train = data.drop(labels = ["target"],axis=1)
feature_name = list(train.columns.values)
del data
# g = sns.countplot(labels)
# labels.value_counts()
train.isnull().any().describe()
train = train.values
labels = labels.values
fig = plt.figure(figsize=(8*1.2, 6*1.5))
# plot histograms for the first 12 features
plot_train = train[:,:12]
plot_feature_name = feature_name[:12]
# for i in range(plot_train.shape[1]):
# plt.subplot(3, 4, i+1)
# f = plt.gca()
# f.axes.get_yaxis().set_visible(False)
# # f.axes.set_ylim([0, plot_train.shape[0]])
#
# # vals = np.size(plot_train.iloc[:, i].unique())
# plt.hist(plot_train[:, i])
# plt.title(plot_feature_name[i])
for i in range(plot_train.shape[1]):
    plt.subplot(3, 4, i+1)
    f = plt.gca()
    f.axes.get_yaxis().set_visible(False)
    # g = sns.countplot(plot_train[:, i])
    # note: sns.distplot is deprecated in recent seaborn; sns.histplot(plot_train[:, i]) is the modern equivalent
    g = sns.distplot(plot_train[:, i], kde=False, rug=False)
    plt.title(plot_feature_name[i])
plt.tight_layout()
plt.show()
# plt.savefig("histogram-distribution.png")
# Normalisation of the data (zero mean, unit variance per feature)
train_norm = np.zeros_like(train, dtype=float)  # float dtype so the standardised values are not truncated to integers
train_mean = np.mean(train, axis=0)
train_std = np.std(train, axis=0)
for i in range(train.shape[1]):
    temp = train[:, i] - train_mean[i]
    temp /= train_std[i]
    train_norm[:, i] = temp
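# Optional sketch (not in the original script): scikit-learn's StandardScaler performs the
# same per-feature standardisation; fitting it on the training split only (after the
# train/test split below) would avoid leaking test-set statistics into the scaling.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# train_norm = scaler.fit_transform(train)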
# fig = plt.figure(figsize=(8*1.2, 6*1.5))
# for i in range(12):
# plt.subplot(3, 4, i+1)
# f = plt.gca()
# f.axes.get_yaxis().set_visible(False)
# # g = sns.countplot(plot_train[:, i])
# g = sns.distplot(train_norm[:, i],kde=False, rug=False)
# labels_norm = labels-np.mean(labels)
# labels_norm /= np.std(labels)
from sklearn.metrics import accuracy_score
# divide the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_norm, labels, test_size=0.2, random_state=12345)
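# Optional: for an imbalanced target, passing stratify=labels keeps the class ratio the
# same in both splits, e.g.
# X_train, X_test, y_train, y_test = train_test_split(train_norm, labels, test_size=0.2, random_state=12345, stratify=labels)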
####################################### logistic regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, penalty='l2', solver='lbfgs').fit(X_train, y_train)  # multi_class='ovr' is unnecessary for a binary target and deprecated in recent scikit-learn
# evaluate on the test data
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(accuracy)
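# Optional sketch (not in the original script): inspect the fitted coefficients next to the
# feature names to see which standardised features carry the most weight
# (clf.coef_ has shape (1, n_features) for a binary target).
for name, coef in sorted(zip(feature_name, clf.coef_[0]), key=lambda x: abs(x[1]), reverse=True):
    print(f"{name}: {coef:.3f}")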
####################################### SVM
from sklearn.svm import SVC
# clf = SVC(kernel='linear').fit(X_train, y_train) # Linear SVMs and logistic regression generally perform comparably in practice
# clf = SVC(kernel='poly').fit(X_train, y_train) # 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable.
# radial basis function or Gaussian similarity function
clf = SVC(kernel='rbf').fit(X_train, y_train)
# evaluate on the test data
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(accuracy)
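# Optional sketch: the confusion matrix and per-class precision/recall give a fuller
# picture than accuracy alone.
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))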
######################################
# select features with either lasso or feature ranking (RFE) and then apply classification
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
# The commented-out block below is a RandomizedLasso (stability selection) example;
# RandomizedLasso has been removed from recent versions of scikit-learn, so it is
# kept here only for reference.
# rlasso = RandomizedLasso()
# rlasso.fit(X_train, y_train)
#
# print("Features sorted by their score:")
# print(sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), feature_name), reverse=True))
# select1 = rlasso.scores_
# priority1 = [i[0] for i in sorted(enumerate(select1), key=lambda x:x[1],reverse=True)] # descending order
# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV(cv=5)
# Set a minimum threshold on the absolute Lasso coefficients (here 0.012)
sfm = SelectFromModel(clf, threshold=0.012)
sfm.fit(X_train, y_train)
n_features = sfm.transform(X_train).shape[1]
choose = sfm.get_support()
# # Reset the threshold till the number of features equals two.
# # Note that the attribute can be set directly instead of repeatedly
# # fitting the metatransformer.
# while n_features > 2:
# sfm.threshold += 0.01
# choose = sfm.get_support()
# X_transform = sfm.transform(X_train)
# n_features = X_transform.shape[1]
X_train_choose = sfm.transform(X_train)
X_test_choose = sfm.transform(X_test)
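# Optional sketch: list which features SelectFromModel kept, using the boolean mask
# returned by get_support() above.
selected_features = [name for name, keep in zip(feature_name, choose) if keep]
print("Selected features:", selected_features)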
# from sklearn.linear_model import LinearRegression
# from sklearn.feature_selection import RFE
#
# # use linear regression as the model
# lr = LinearRegression()
# # rank features with recursive feature elimination (here keeping the top 5)
# rfe = RFE(lr, n_features_to_select=5)
# rfe.fit(X_train, y_train)
#
# print("Features sorted by their rank:")
# print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), feature_name)))
#
#
# select2 = rfe.ranking_
#
#
# priority2 = [i[0] for i in sorted(enumerate(select2), key=lambda x:x[1],reverse=False)] # ascending order
#
# Nchoose = 10
# priority_use = priority2
# X_train_choose = X_train[:,priority_use[:Nchoose]]
# X_test_choose = X_test[:,priority_use[:Nchoose]]
# run SVM again on the chosen features
clf = SVC(kernel='rbf').fit(X_train_choose, y_train)
# evaluate on the test data
y_pred = clf.predict(X_test_choose)
accuracy = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(accuracy)
######################################
# PCA and then apply classification
from sklearn.decomposition import PCA, KernelPCA
pca = PCA(n_components=12)
pca.fit(X_train)
print(pca.explained_variance_ratio_)
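# Optional sketch: the cumulative explained variance helps decide how many components to keep.
print(np.cumsum(pca.explained_variance_ratio_))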
# map to the low-dimensional space; reuse the PCA fitted on the training data for the test data
X_train_transform = pca.transform(X_train)
X_test_transform = pca.transform(X_test)
# run SVM again on the PCA-transformed features
clf = SVC(kernel='rbf').fit(X_train_transform, y_train)
# evaluate on the test data
y_pred = clf.predict(X_test_transform)
accuracy = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(accuracy)