Data Science — Titanic survival prediction
Required data files (download from the Kaggle Titanic competition; the script will not run without them):
  - train.csv
  - test.csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
print("done")  # sanity check that all imports above succeeded

# Load the training data and tag it so the rows can be identified after the
# train/test frames are concatenated later.
df_train = pd.read_csv('train.csv')
df_train["train_test"] = 1  # 1 = training row, 0 = testing row
df_train.head()

# Load the test data; it carries no labels, so Survived is filled with NaN.
df_test = pd.read_csv('test.csv')
df_test["train_test"] = 0  # 1 = training row, 0 = testing row
# np.NaN was removed in NumPy 2.0 — np.nan is the canonical spelling.
df_test["Survived"] = np.nan
df_test.head()

# Quick structural overview of the training data.
df_train.info()
df_train.describe()
# Split the training columns into categorical and numeric views for EDA.
categorical = df_train[['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']]
numeric = df_train[['Age', 'SibSp', 'Parch', 'Fare']]

# One histogram per numeric feature.
for col in numeric.columns:
    plt.hist(numeric[col])
    plt.title(col)
    plt.show()

# Correlation heatmap over the numeric features.
fig, ax = plt.subplots(figsize=[25, 16])
sns.heatmap(numeric.corr(), linewidths=.5, annot=True, cmap='YlGnBu', square=True)
# Mean of each numeric feature, grouped by survival.
pd.pivot_table(df_train, index='Survived', values=['Age', 'SibSp', 'Parch', 'Fare'])

# Count plot of each categorical feature.
# NOTE: seaborn >= 0.12 removed positional x/y arguments from barplot — they
# must be passed as keywords.
for col in categorical.columns:
    counts = categorical[col].value_counts()
    sns.barplot(x=counts.index, y=counts.values).set_title(col)
    plt.show()

# Survival counts split by passenger class, sex and embarkation port
# (Ticket is non-null on every row, so counting it counts rows).
print(pd.pivot_table(df_train, index='Survived', columns='Pclass', values='Ticket', aggfunc='count'))
print(pd.pivot_table(df_train, index='Survived', columns='Sex', values='Ticket', aggfunc='count'))
print(pd.pivot_table(df_train, index='Survived', columns='Embarked', values='Ticket', aggfunc='count'))
# Derived cabin features:
#   cabin_letter   - first character of the Cabin value; NaN cabins become 'n'
#                    because str(nan) == 'nan'
#   cabin_multiple - number of cabins listed (0 when Cabin is missing)
df_train['cabin_letter'] = df_train['Cabin'].apply(lambda c: str(c)[0])
df_train['cabin_multiple'] = df_train['Cabin'].apply(
    lambda c: 0 if pd.isna(c) else len(c.split(' ')))
pd.pivot_table(df_train, index='Survived', columns='cabin_multiple', values='Ticket', aggfunc='count')
pd.pivot_table(df_train, index='Survived', columns='cabin_letter', values='Name', aggfunc='count')

# 1 when the ticket string is purely numeric, 0 otherwise.
df_train['numeric_ticket'] = df_train['Ticket'].apply(lambda t: 1 if t.isnumeric() else 0)
pd.pivot_table(df_train, index='Survived', columns='numeric_ticket', values='Ticket', aggfunc='count')

# Honorific extracted from names shaped like "Surname, Title. Given names".
df_train['name_title'] = df_train['Name'].apply(
    lambda n: n.split(',')[1].split('.')[0].strip())
pd.pivot_table(df_train, index='Survived', columns='name_title', values='Ticket', aggfunc='count')
def title_groups(x):
    """Collapse rare honorifics into broader buckets; pass common titles through.

    Parameters:
        x (str): raw title extracted from a passenger's name (e.g. "Mr", "Sir").

    Returns:
        str: "Royalty", "AlmostRoyalty", "Military", or the unchanged title.
    """
    # Set-membership tests replace the original chains of `==` / `or`.
    if x in {"the Countess", "Sir", "Mlle", "Mme", "Lady"}:
        return "Royalty"
    if x in {"Capt", "Don", "Jonkheer", "Rev"}:
        return "AlmostRoyalty"
    if x in {"Col", "Major"}:
        return "Military"
    return x
# Apply the grouping and inspect survival by grouped title.
df_train['name_title'] = df_train['name_title'].apply(title_groups)
pd.pivot_table(df_train, index='Survived', columns='name_title', values='Ticket', aggfunc='count')

# Letter prefix of the ticket (lower-cased, '.' and '/' stripped);
# 0 when the ticket has no letter prefix.
df_train['ticket_letters'] = df_train['Ticket'].apply(
    lambda t: ''.join(t.split(' ')[:-1]).replace('.', '').replace('/', '').lower()
    if len(t.split(' ')[:-1]) > 0 else 0)

# The bare "max_rows" option alias was removed in pandas 2.0; the full key is
# "display.max_rows".
pd.set_option("display.max_rows", 6)
df_train['ticket_letters'].value_counts()
# Stack the train and test frames so feature engineering is applied
# identically to both; the train_test flag tells them apart later.
all_data = pd.concat([df_train, df_test])

# Recreate the engineered features on the combined frame.
all_data['cabin_multiple'] = all_data['Cabin'].apply(
    lambda c: 0 if pd.isna(c) else len(c.split(' ')))
all_data['cabin_letter'] = all_data['Cabin'].apply(lambda c: str(c)[0])
all_data['numeric_ticket'] = all_data['Ticket'].apply(lambda t: 1 if t.isnumeric() else 0)
all_data['ticket_letters'] = all_data['Ticket'].apply(
    lambda t: ''.join(t.split(' ')[:-1]).replace('.', '').replace('/', '').lower()
    if len(t.split(' ')[:-1]) > 0 else 0)
all_data['name_title'] = all_data['Name'].apply(
    lambda n: n.split(',')[1].split('.')[0].strip())
all_data['name_title'] = all_data['name_title'].apply(title_groups)
all_data.head()
# Impute missing Age/Fare with the TRAINING medians (avoids leaking test-set
# statistics into the training distribution).
all_data.Age = all_data.Age.fillna(df_train.Age.median())
all_data.Fare = all_data.Fare.fillna(df_train.Fare.median())
# Only a couple of rows lack Embarked; dropping them is simpler than imputing.
all_data.dropna(subset=['Embarked'], inplace=True)

# Verify the remaining nulls. The bare "max_rows" option alias was removed in
# pandas 2.0; the full key is "display.max_rows".
pd.set_option("display.max_rows", None)
all_data.isnull().sum()
pd.set_option("display.max_rows", 6)

# Log-transform Fare to reduce right skew; np.log1p(x) == log(x + 1) but is
# computed accurately for small x.
all_data['norm_fare'] = np.log1p(all_data.Fare)
all_data['norm_fare'].hist()
# Treat Pclass as categorical so get_dummies one-hot encodes it too.
all_data.Pclass = all_data.Pclass.astype(str)

# One-hot encode the modelling columns.
all_dummies = pd.get_dummies(all_data[[
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'norm_fare', 'Embarked',
    'cabin_letter', 'cabin_multiple', 'numeric_ticket', 'name_title',
    'train_test']])

# Split back into train/test via the flag, dropping the flag column itself.
X_train = all_dummies[all_dummies.train_test == 1].drop(['train_test'], axis=1)
X_train.shape
X_test = all_dummies[all_dummies.train_test == 0].drop(['train_test'], axis=1)
X_test.shape
y_train = all_data[all_data.train_test == 1].Survived
y_train.shape
# Standardize the continuous columns (zero mean, unit variance); the dummy
# columns are left untouched.
scale = StandardScaler()
all_dummies_scaled = all_dummies.copy()
continuous_cols = ['Age', 'SibSp', 'Parch', 'norm_fare']
all_dummies_scaled[continuous_cols] = scale.fit_transform(all_dummies_scaled[continuous_cols])
all_dummies_scaled

# Scaled train/test splits, mirroring the unscaled ones above.
X_train_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 1].drop(['train_test'], axis=1)
X_test_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 0].drop(['train_test'], axis=1)
y_train = all_data[all_data.train_test == 1].Survived
print(y_train)
from sklearn.model_selection import cross_val_score
from sklearn import tree # DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn import metrics
from sklearn import linear_model # LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Winner: support-vector classifier on the scaled features, scored with
# 5-fold cross-validation.
svc_clf = SVC(probability=True)
scores = cross_val_score(svc_clf, X_train_scaled, y_train, cv=5)
print(scores)
print(scores.mean())