In [None]:
from IPython.core.interactiveshell import InteractiveShell
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting 
import seaborn as sns # for plotting

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# set up some notebook display defaults
# "all": echo the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"
# render matplotlib figures inline in the notebook output
%matplotlib inline
# reset matplotlib to its default style, then apply seaborn's default theme on top
plt.style.use('default')
sns.set()
# display floats with a thousands separator and two decimal places
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# paths to datasets
# original Kaggle training and test CSVs
kaggle_trn = "./data/titanic/train.csv"
kaggle_tst = "./data/titanic/test.csv"
# test CSV actually loaded below as k_tst; later cells read its 'Survived' and 'TName' columns
rek_k_tst2 = "./data/titanic/rek_test_2.csv"
# output paths for the partially cleaned training / test frames saved further down
oma_trn = "./data/titanic/oma_trn.csv"
oma_tst = "./data/titanic/oma_tst.csv"

In [None]:
# load the datasets currently of interest
k_trn = pd.read_csv(kaggle_trn)
# note: the test frame is read from rek_k_tst2, not from kaggle_tst
k_tst = pd.read_csv(rek_k_tst2)

In [None]:
# a wee reminder of the training columns, dtypes and non-null counts
k_trn.info()

In [None]:
# let's check test dataset for missing values (non-null counts per column)
k_tst.info()

In [None]:
# First, sort out the missing 'Embarked' values in the training data.
no_port = k_trn['Embarked'].isnull()
display(k_trn.loc[no_port])
# Searching suitable sources, it turns out both passengers embarked at Southampton:
# https://www.encyclopedia-titanica.org/titanic-survivor/amelia-icard.html
# https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html
k_trn.loc[no_port, 'Embarked'] = 'S'

In [None]:
# The test set has a missing fare; show the affected row(s) first.
display(k_tst.loc[k_tst['Fare'].isnull()])
# Estimate it with the average 3rd-class fare of training passengers travelling
# without any SibSp or Parch. Not quite correct, as fares for multi-passenger
# tickets are not excluded.
solo_3rd_class = (
    (k_trn['Pclass'] == 3)
    & (k_trn['SibSp'] == 0)
    & (k_trn['Parch'] == 0)
)
avg_fare_3 = k_trn.loc[solo_3rd_class, 'Fare'].mean()
print(avg_fare_3)
# Only the test row with PassengerId 1044 receives the estimate.
mask = k_tst['PassengerId'] == 1044
k_tst.loc[mask, 'Fare'] = avg_fare_3

In [None]:
# Persist both frames so the Embarked / Fare fixes above need not be redone.
# Imputed ages are deliberately not saved: different imputation methods may be tried.
k_trn.to_csv(oma_trn, index=False)
# The test copy is written without its 'TName' column.
dTN_tst = k_tst.drop("TName", axis=1)
dTN_tst.to_csv(oma_tst, index=False)

Now, let's sort the missing age values. There are a few options. So, I am currently thinking of trying a couple of them and seeing how the model used earlier performs with the differing value estimates.

But first, let's write a function to do the testing on our various approaches to modifying the datasets for the missing age values.

In [None]:
def trn_tst(X_trn, y_trn, X_tst):
    """Fit a random forest on the training data and predict the test data.

    Parameters
    ----------
    X_trn : training feature matrix
    y_trn : training labels
    X_tst : test feature matrix, same columns as X_trn

    Returns
    -------
    The predictions for X_tst. Scoring is left to the caller.
    """
    # Fixed hyper-parameters and seed so every imputation variant is compared
    # against the same model configuration.
    forest = RandomForestClassifier(random_state=1, max_depth=5, n_estimators=100)
    return forest.fit(X_trn, y_trn).predict(X_tst)

In [None]:
# Feature setup for the age-imputation experiments
# (the first experiment replaces NaN with the average passenger age as a base case).
# r_feats: raw columns taken from the CSVs.
r_feats = [
    'Pclass', 'Sex', 'SibSp', 'Parch', 'Age', 'Embarked', 'Fare',
]
# d_feats: column names used for the imputed frames built after one-hot encoding.
d_feats = [
    'Pclass', 'SibSp', 'Parch', 'Age', 'Fare',
    'Sex_female', 'Sex_male',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]
y_trn = k_trn['Survived']
# One-hot encode the training and test features in the same way.
Xd_age, Xd_t_age = (pd.get_dummies(frame[r_feats]) for frame in (k_trn, k_tst))

In [None]:
# Baseline: fill NaNs with SimpleImputer (default strategy), fitted on the
# training features and re-applied unchanged to the test features.
imputer = SimpleImputer(missing_values=np.nan)
X_age = pd.DataFrame(imputer.fit_transform(Xd_age), columns=d_feats)
X_t_age = pd.DataFrame(imputer.transform(Xd_t_age), columns=d_feats)
# Train / predict with the shared model and score against the known test labels.
t_age_pred = trn_tst(X_age, y_trn, X_t_age)
print(f'model accuracy: {accuracy_score(k_tst["Survived"], t_age_pred)}')

In [None]:
# Second variant: IterativeImputer, which estimates each missing value from the
# other feature columns. It is fitted on the training features and re-applied
# unchanged to the test features.
imputer_2 = IterativeImputer(missing_values=np.nan, max_iter=25)
# Wrap the IterativeImputer output itself. Wrapping the SimpleImputer frames
# (X_age / X_t_age) here would make this run a copy of the baseline and always
# report the same accuracy.
X_age_2 = pd.DataFrame(imputer_2.fit_transform(Xd_age), columns=d_feats)
X_t_age_2 = pd.DataFrame(imputer_2.transform(Xd_t_age), columns=d_feats)
# Train / predict with the shared model and score against the known test labels.
t_age_pred_2 = trn_tst(X_age_2, y_trn, X_t_age_2)
print(f'model accuracy: {accuracy_score(k_tst["Survived"], t_age_pred_2)}')

Surprised Simple and Iterative both produced the same result. May have to dig deeper.

In [None]:
# Put the two imputed Age columns side by side:
# 'Iterative' comes from X_age_2, 'Age' from X_age.
imps = pd.DataFrame({'Iterative': X_age_2['Age'].to_list()})
imps2 = pd.concat([imps, X_age['Age']], axis=1)
ages_match = imps2['Iterative'] == imps2['Age']
# Count of rows where both columns agree ...
imps2[ages_match].count()
# ... and the rows where they differ.
imps2[~ages_match]

Now, let's look at imputing with our own code and approach.

In [None]:
# Correlation heatmap of Age against candidate grouping features.
sns.set(font_scale=1.5)
feature_list = ['Pclass', 'Age','Sex_female','Sex_male', 'SibSp', 'Parch']
hm = plt.figure(figsize=(10,6))
g = sns.heatmap(
    Xd_age.loc[:, feature_list].corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
)

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use whichever name this installation has.
grid_style = 'seaborn-v0_8-whitegrid'
if grid_style not in plt.style.available:
    grid_style = 'seaborn-whitegrid'
plt.style.use(grid_style)

# One row of three box plots: Age distribution per Pclass, Parch and SibSp.
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
fig.subplots_adjust(wspace=0.3, hspace=0.5)
for ax, group_col in zip(axes, ['Pclass', 'Parch', 'SibSp']):
    sns.boxplot(x=group_col, y='Age', data=k_trn, ax=ax)


In [None]:
# Median training Age for every (Pclass, Parch, SibSp) combination.
median_ages = k_trn.groupby(['Pclass', 'Parch', 'SibSp'], as_index=False)['Age'].median()
# Spot-check one combination.
pc, pa, ss = 3, 2, 3
in_group = (
    (median_ages['Pclass'] == pc)
    & (median_ages['Parch'] == pa)
    & (median_ages['SibSp'] == ss)
)
median_ages.loc[in_group, 'Age'].item()
# Groups whose median is NaN cannot supply an estimate.
print(f"Number of missing Age values: {median_ages['Age'].isnull().sum()}")
# Look at the (3, 2, 8) group specifically.
big_family = (
    (median_ages['Pclass'] == 3)
    & (median_ages['Parch'] == 2)
    & (median_ages['SibSp'] == 8)
)
median_ages.loc[big_family, 'Age'].item()

In [None]:
# show the training rows with Pclass 3, Parch 2 and SibSp 8
k_trn[(k_trn['Pclass']==3) & (k_trn['Parch']==2) & (k_trn['SibSp']==8)]

In [None]:
X_age_3 = Xd_age
X_t_age_3 = Xd_t_age
miss_trn = list(X_age_3['Age'][X_age_3['Age'].isnull()].index)
miss_tst = list(X_t_age_3['Age'][X_t_age_3['Age'].isnull()].index)

# print(miss_age[:5])
for i in miss_trn:
 age_est = median_ages[(median_ages['Pclass']==X_age_3.iloc[i]['Pclass'])
 & (median_ages['Parch']==X_age_3.iloc[i]['Parch'])
 & (median_ages['SibSp']==X_age_3.iloc[i]['SibSp'])].Age.item()
 if pd.isnull(age_est):
 # print(f"! {k_trn.iloc[i]['Pclass']}, {k_trn.iloc[i]['Parch']}, {k_trn.iloc[i]['SibSp']} -> {age_est}")
 X_age_3.loc[i, 'Age'] = 25
 else:
 X_age_3.loc[i, 'Age'] = math.ceil(age_est)

for i in miss_tst:
 # print(f"! {X_t_age_3.iloc[i]['Pclass']}, {X_t_age_3.iloc[i]['Parch']}, {X_t_age_3.iloc[i]['SibSp']}")
 if not median_ages[(median_ages['Pclass']==X_t_age_3.iloc[i]['Pclass'])
 & (median_ages['Parch']==X_t_age_3.iloc[i]['Parch'])
 & (median_ages['SibSp']==X_t_age_3.iloc[i]['SibSp'])].empty:

 age_est = median_ages[(median_ages['Pclass']==X_t_age_3.iloc[i]['Pclass'])
 & (median_ages['Parch']==X_t_age_3.iloc[i]['Parch'])
 & (median_ages['SibSp']==X_t_age_3.iloc[i]['SibSp'])].Age.item()
 if pd.isnull(age_est):
 # print(f"! {k_trn.iloc[i]['Pclass']}, {k_trn.iloc[i]['Parch']}, {k_trn.iloc[i]['SibSp']} -> {age_est}")
 X_t_age_3.loc[i, 'Age'] = 15
 else:
 X_t_age_3.loc[i, 'Age'] = math.ceil(age_est)
 else:
 X_t_age_3.loc[i, 'Age'] = 15

print(f"Number of missing Age training values: {X_age_3['Age'].isnull().sum()}")
print(f"Number of missing Age test values: {X_t_age_3['Age'].isnull().sum()}")

In [None]:
t_age_pred_3 = trn_tst(X_age_3, y_trn, X_t_age_3)
print(f'model accuracy: {accuracy_score(k_tst["Survived"], t_age_pred_3)}')

In [None]:
median_ages_2 = k_trn.groupby(['Pclass', 'SibSp'], as_index=False).Age.median()
# median_ages.head()
pc = 3
ss = 3
median_ages_2[(median_ages_2['Pclass']==pc) & (median_ages_2['SibSp']==ss)].Age.item()
print(f"Number of missing Age values: {median_ages_2['Age'].isnull().sum()}")
median_ages_2[(median_ages_2['Pclass']==3) & (median_ages_2['SibSp']==8)].Age.item()

In [None]:
X_age_4 = Xd_age
X_t_age_4 = Xd_t_age
miss_trn = list(X_age_4['Age'][X_age_4['Age'].isnull()].index)
miss_tst = list(X_t_age_4['Age'][X_t_age_4['Age'].isnull()].index)

# print(miss_age[:5])
for i in miss_trn:
 age_est = median_ages_2[(median_ages_2['Pclass']==X_age_4.iloc[i]['Pclass'])
 & (median_ages_2['Parch']==X_age_4.iloc[i]['Parch'])
 & (median_ages_2['SibSp']==X_age_4.iloc[i]['SibSp'])].Age.item()
 if pd.isnull(age_est):
 # print(f"! {k_trn.iloc[i]['Pclass']}, {k_trn.iloc[i]['Parch']}, {k_trn.iloc[i]['SibSp']} -> {age_est}")
 X_age_4.loc[i, 'Age'] = 25
 else:
 X_age_4.loc[i, 'Age'] = math.ceil(age_est)

for i in miss_tst:
 # print(f"! {X_t_age_3.iloc[i]['Pclass']}, {X_t_age_3.iloc[i]['Parch']}, {X_t_age_3.iloc[i]['SibSp']}")
 if not median_ages_2[(median_ages_2['Pclass']==X_t_age_4.iloc[i]['Pclass'])
 & (median_ages_2['Parch']==X_t_age_4.iloc[i]['Parch'])
 & (median_ages_2['SibSp']==X_t_age_4.iloc[i]['SibSp'])].empty:

 age_est = median_ages_2[(median_ages['Pclass']==X_t_age_4.iloc[i]['Pclass'])
 & (median_ages_2['Parch']==X_t_age_4.iloc[i]['Parch'])
 & (median_ages_2['SibSp']==X_t_age_4.iloc[i]['SibSp'])].Age.item()
 if pd.isnull(age_est):
 # print(f"! {k_trn.iloc[i]['Pclass']}, {k_trn.iloc[i]['Parch']}, {k_trn.iloc[i]['SibSp']} -> {age_est}")
 X_t_age_4.loc[i, 'Age'] = 15
 else:
 X_t_age_4.loc[i, 'Age'] = math.ceil(age_est)
 else:
 X_t_age_4.loc[i, 'Age'] = 15

print(f"Number of missing Age training values: {X_age_4['Age'].isnull().sum()}")
print(f"Number of missing Age test values: {X_t_age_4['Age'].isnull().sum()}")

In [None]:
# train / predict on the X_age_4 / X_t_age_4 frames and score against the known test labels
t_age_pred_4 = trn_tst(X_age_4, y_trn, X_t_age_4)
print(f'model accuracy: {accuracy_score(k_tst["Survived"], t_age_pred_4)}')