In [None]:
from IPython.core.interactiveshell import InteractiveShell
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting 
import seaborn as sns # for plotting

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import FeatureUnion, make_pipeline

In [None]:
# set up some notebook display defaults
# echo the value of every expression statement in a cell, not just the last one
InteractiveShell.ast_node_interactivity = "all"
# render matplotlib figures inline in the cell output
%matplotlib inline
# reset matplotlib to its default style first, then apply seaborn's defaults on top
plt.style.use('default')
sns.set()
# display floats with thousands separators and two decimal places
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# dataset locations: the original Kaggle CSVs ...
kaggle_trn, kaggle_tst = "./data/titanic/train.csv", "./data/titanic/test.csv"
# ... and my own working versions of them
oma_trn_3, oma_tst_3 = "./data/titanic/oma_trn_3.csv", "./data/titanic/oma_tst_3.csv"

In [None]:
# read the datasets currently of interest (my working copies)
k_trn = pd.read_csv(oma_trn_3)
k_tst = pd.read_csv(oma_tst_3)
# training rows followed by test rows, with a fresh 0..n-1 index
k_all = pd.concat([k_trn, k_tst], ignore_index=True)

# guard for the cells that did the original age imputation and saved the new
# CSVs; leave False to skip that previously completed work
do_cell = False

In [None]:
# IterativeImputer fills in the missing values (the ages), MissingIndicator
# contributes the "was missing" flag; both run side by side in a FeatureUnion.
# KNNImputer is something to try in future.
impute_step = ("features", IterativeImputer(max_iter=10, random_state=0))
indicator_step = ("indicators", MissingIndicator())
transformer = FeatureUnion(transformer_list=[impute_step, indicator_step])
clf = make_pipeline(transformer, RandomForestClassifier())

In [None]:
# target column for the training rows
y_trn = k_trn['Survived']

# columns handed to the imputer; get_dummies encodes the non-numeric ones
features = ['PassengerId', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Age']
X_trn, X_tst = (pd.get_dummies(frame[features]) for frame in (k_trn, k_tst))

In [None]:
if do_cell:
    # fit the imputer on the training features, then wrap the transformed
    # output back into a DataFrame: original feature names plus a name for
    # the appended missing-indicator column
    trn_cols = [*X_trn.columns, "AgeMissing"]
    X_trn_trans = pd.DataFrame(
        transformer.fit_transform(X_trn, y_trn), columns=trn_cols
    )
    # the test features are only transformed, using the already fitted imputer
    tst_cols = [*X_tst.columns, "AgeMissing"]
    X_tst_trans = pd.DataFrame(transformer.transform(X_tst), columns=tst_cols)

In [None]:
if do_cell:
 # show the last rows of the transformed train and test frames
 X_trn_trans.tail()
 X_tst_trans.tail()

In [None]:
if do_cell:
 # show the first two rows of the loaded training data
 k_trn.head(2)

In [None]:
if do_cell:
    # sanity check: the first row of the transformed frame and the first row
    # of the loaded training data carry the same PassengerId
    print(X_trn_trans["PassengerId"].iloc[0] == k_trn["PassengerId"].iloc[0])

In [None]:
if do_cell:
    # copy the training data, overwrite Age with the imputed ages and append
    # the missing-age indicator as a new column
    k_trn_2 = k_trn.copy()
    # assign on the frame itself: `k_trn_2[:].Age = ...` sets the column on a
    # temporary slice, which newer pandas does not propagate back to k_trn_2
    k_trn_2["Age"] = X_trn_trans["Age"]
    k_trn_2 = pd.concat([k_trn_2, X_trn_trans["AgeMissing"]], axis=1)

In [None]:
if do_cell:
    # copy the test data, overwrite Age with the imputed ages and append
    # the missing-age indicator as a new column
    k_tst_2 = k_tst.copy()
    # assign on the frame itself: `k_tst_2[:].Age = ...` sets the column on a
    # temporary slice, which newer pandas does not propagate back to k_tst_2
    k_tst_2["Age"] = X_tst_trans["Age"]
    k_tst_2 = pd.concat([k_tst_2, X_tst_trans["AgeMissing"]], axis=1)

In [None]:
if do_cell:
    # write the updated frames over my working CSVs, without the index column
    for out_frame, out_path in ((k_trn_2, oma_trn_3), (k_tst_2, oma_tst_3)):
        out_frame.to_csv(out_path, index=False)

In [None]:
if do_cell:
    # read the just-written CSVs back in and print their summaries to see
    # whether any Age data is still missing
    k_trn, k_tst = pd.read_csv(oma_trn_3), pd.read_csv(oma_tst_3)
    k_trn.info()
    k_tst.info()

Discovered a bit of a problem with 'Age' while working on a future post/notebook: there were negative ages.

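Apparently that is possible with the IterativeImputer: unless `min_value`/`max_value` are set, the values it imputes are not constrained to the range of the observed data.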

Let's have a look.

In [None]:
# summary statistics for Age in the current training data; check `min` for negative values
k_trn['Age'].describe()

Sure enough. Should have done that when I was originally working on this post/notebook. Would have saved myself some grief.

I was just going to restore a backup of the CSV files and redo the post/notebook, hopefully with the correct result. But I decided to fix things without using the backup: just the current CSVs and the original Kaggle CSVs. I expect it might get messy.

In [None]:
# start fresh: rebuild the feature frames from my current datasets
features = ['PassengerId', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Age']
X_trn, X_tst = (pd.get_dummies(frame[features]) for frame in (k_trn, k_tst))
# and load the Kaggle datasets so we can get at the original Age data
kg_trn, kg_tst = pd.read_csv(kaggle_trn), pd.read_csv(kaggle_tst)

In [None]:
# keep my previously imputed ages under the name "iiAge" and put the original
# Kaggle ages back in as "Age" (the test set gets the same treatment later)
# NOTE(review): the assignment aligns on the row index, so this assumes both
# files list the passengers in the same order - confirm
X_trn = X_trn.rename(columns={"Age": "iiAge"})
X_trn["Age"] = kg_trn["Age"]

In [None]:
# look at both ends of the frame and at the summary statistics of the restored Age column
X_trn.head()
X_trn.tail()
X_trn["Age"].describe()

In [None]:
# let's check things out before deleting the old column:
# rows where the Kaggle Age is present but differs from my earlier iiAge value
X_trn.loc[(X_trn['iiAge'].ne(X_trn['Age'])) & (X_trn['Age'].notna())]

In [None]:
# get rid of iiAge column in X_trn
X_trn.drop('iiAge', axis=1)

In [None]:
# let's do same for test data set
X_tst.rename(columns={"Age": "iiAge"}, inplace=True)
X_tst["Age"] = kg_tst["Age"]
X_tst.loc[(X_tst['iiAge'].ne(X_tst['Age'])) & (X_tst['Age'].notna())]

In [None]:
X_tst.drop('iiAge', axis=1)

In [None]:
# now see if we can fix that imputer: bound its output by the smallest and
# largest ages present in the train and test Age columns
min_age = min(X_trn["Age"].min(), X_tst["Age"].min())
max_age = max(X_trn["Age"].max(), X_tst["Age"].max())
print(min_age, max_age)
bounded_imputer = IterativeImputer(
    max_iter=10, min_value=min_age, max_value=max_age, random_state=0
)
transformer = FeatureUnion(
    transformer_list=[
        ("features", bounded_imputer),
        ("indicators", MissingIndicator()),
    ]
)
clf = make_pipeline(transformer, RandomForestClassifier())

In [None]:
# fit the imputer on X_trn and transform it in one go, then have a look;
# column names are the feature names plus one for the appended indicator
trn_cols = [*X_trn.columns, "AgeMissing"]
X_trn_trans = pd.DataFrame(
    transformer.fit_transform(X_trn, y_trn), columns=trn_cols
)

In [None]:
# summary statistics of the transformed training frame (check the Age min/max)
X_trn_trans.describe()

In [None]:
# looks better, so apply the already fitted imputer to X_tst as well
tst_cols = [*X_tst.columns, "AgeMissing"]
X_tst_trans = pd.DataFrame(transformer.transform(X_tst), columns=tst_cols)

In [None]:
# summary statistics of the transformed test frame (check the Age min/max)
X_tst_trans.describe()

In [None]:
# new updated training dataset dataframe: discard the old AgeMissing column,
# overwrite Age with the newly imputed ages, append the new indicator column
k_trn_2 = k_trn.copy()
k_trn_2 = k_trn_2.drop("AgeMissing", axis=1)
# assign on the frame itself: `k_trn_2[:].Age = ...` sets the column on a
# temporary slice, which newer pandas does not propagate back to k_trn_2
k_trn_2["Age"] = X_trn_trans["Age"]
k_trn_2 = pd.concat([k_trn_2, X_trn_trans["AgeMissing"]], axis=1)
# new updated testing dataset dataframe: same steps
k_tst_2 = k_tst.copy()
k_tst_2 = k_tst_2.drop("AgeMissing", axis=1)
k_tst_2["Age"] = X_tst_trans["Age"]
k_tst_2 = pd.concat([k_tst_2, X_tst_trans["AgeMissing"]], axis=1)

In [None]:
# summary statistics of the rebuilt training frame before it is saved
k_trn_2.describe()

In [None]:
# summary statistics of the rebuilt test frame before it is saved
k_tst_2.describe()

In [None]:
# write the corrected frames over my working CSVs, without the index column
for out_frame, out_path in ((k_trn_2, oma_trn_3), (k_tst_2, oma_tst_3)):
    out_frame.to_csv(out_path, index=False)