In [None]:
from IPython.core.interactiveshell import InteractiveShell
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting 
import seaborn as sns # for plotting

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import FeatureUnion, make_pipeline

In [None]:
# set up some notebook display defaults
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
plt.style.use('default')
sns.set()
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Dataset locations: the original Kaggle CSVs, and this project's own
# working copies (version 3), which are read below and overwritten at the end.
kaggle_trn, kaggle_tst = "./data/titanic/train.csv", "./data/titanic/test.csv"
oma_trn_3, oma_tst_3 = "./data/titanic/oma_trn_3.csv", "./data/titanic/oma_tst_3.csv"

In [None]:
# Read the working train/test CSVs, and also keep a single stacked frame
# (train rows first, then test rows, with a fresh 0..n-1 index).
k_trn, k_tst = (pd.read_csv(csv_path) for csv_path in (oma_trn_3, oma_tst_3))
k_all = pd.concat([k_trn, k_tst], ignore_index=True)

In [None]:
# start fresh: columns used to drive the Age imputation
features = ['PassengerId', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Age', 'Title']

# Target labels come from the training frame only.
y_trn = k_trn['Survived']

# Independent copies, so the edits below never touch k_trn / k_tst.
X_trn, X_tst = (frame[features].copy() for frame in (k_trn, k_tst))

# The Kaggle originals are loaded as well so the original Age data is available.
kg_trn, kg_tst = (pd.read_csv(csv_path) for csv_path in (kaggle_trn, kaggle_tst))

In [None]:
# Replace the Age data in my versions of the datasets with that from the Kaggle
# datasets. The Age already stored in my frames is kept alongside as "iiAge".
def _with_kaggle_age(own, kaggle):
    """Return a copy of `own` whose Age comes from `kaggle`, matched on PassengerId.

    The Age column currently in `own` is renamed to "iiAge" (skipped when an
    "iiAge" column already exists, so re-running the cell does not create a
    duplicate column). Matching on PassengerId rather than on row position keeps
    the ages attached to the right passengers even if the row order or index of
    `own` ever differs from the Kaggle file.

    Assumes `kaggle` has unique PassengerId values and an Age column (the
    standard Kaggle Titanic layout) -- TODO confirm if the source files change.
    """
    if "iiAge" in own.columns:
        out = own.copy()
    else:
        out = own.rename(columns={"Age": "iiAge"})
    kaggle_age = kaggle.set_index("PassengerId")["Age"]
    out["Age"] = out["PassengerId"].map(kaggle_age)
    return out


X_trn = _with_kaggle_age(X_trn, kg_trn)
X_tst = _with_kaggle_age(X_tst, kg_tst)

In [None]:
# Quick look at the training features after the Age swap: first rows, last rows,
# and summary statistics for Age. All three are displayed because
# ast_node_interactivity is set to "all" in the config cell.
X_trn.head()
X_trn.tail()
X_trn["Age"].describe()

In [None]:
# Mean Age of passengers titled "Master", per dataset and pooled over both.
# NOTE(review): this reads Age from k_trn / k_tst (the values stored in the
# working CSVs), not the Kaggle Age just placed in X_trn / X_tst -- confirm that
# is the intended source.
master_age_trn = k_trn.loc[k_trn["Title"] == "Master", "Age"]
master_age_tst = k_tst.loc[k_tst["Title"] == "Master", "Age"]
mean_trn = master_age_trn.mean()
mean_tst = master_age_tst.mean()
# Count the ages that actually enter each mean (non-null Age values), so the
# reported counts stay consistent with the means even if an Age is missing.
cnt_trn = master_age_trn.count()
cnt_tst = master_age_tst.count()
# Pooled mean over every available "Master" age; unlike a hand-weighted
# average, this stays defined when one dataset contributes no ages.
mean_all = pd.concat([master_age_trn, master_age_tst]).mean()
print(f"Mean Age for 'Master': training set {mean_trn:.3f} ({cnt_trn}), test set {mean_tst:.3f} ({cnt_tst}) -> {mean_all:.3f}")

# The "Master" rows whose Kaggle Age is missing; these get filled next.
X_trn[(X_trn["Title"] == "Master") & (X_trn["Age"].isnull())].head()
X_tst[(X_tst["Title"] == "Master") & (X_tst["Age"].isnull())].head()

In [None]:
# Give every "Master" with no recorded Age the pooled "Master" mean, in both sets.
X_trn.loc[X_trn["Title"].eq("Master") & X_trn["Age"].isna(), "Age"] = mean_all
X_tst.loc[X_tst["Title"].eq("Master") & X_tst["Age"].isna(), "Age"] = mean_all

In [None]:
# Spot-check a handful of passengers by PassengerId
# (presumably the "Master" rows that were just filled -- verify against the output above).
pid_trn = [66, 160, 177, 710]
pid_tst = [1136, 1231, 1236, 1309]
X_trn.query("PassengerId in @pid_trn")
X_tst.query("PassengerId in @pid_tst")

In [None]:
# okay, now on to the IterativeImputer
# Bound the imputed values by the Age range observed across both datasets.
min_age = min(frame["Age"].min() for frame in (X_trn, X_tst))
max_age = max(frame["Age"].max() for frame in (X_trn, X_tst))
print(min_age, max_age)
# Output is the imputed feature columns followed by the missing-value indicator column(s).
transformer = FeatureUnion([
    ("features", IterativeImputer(max_iter=10, min_value=min_age, max_value=max_age, random_state=0)),
    ("indicators", MissingIndicator()),
])

In [None]:
# One-hot encode the categorical columns; the previously stored Age ("iiAge") is dropped here.
features = ["PassengerId", "Pclass", "Sex", "SibSp", "Parch", "Title", "Age"]
X_trn = pd.get_dummies(X_trn[features])
# Encode the test set, then force it onto exactly the training columns (same
# names, same order). A category seen only in the training data becomes an
# all-zero column, and one seen only in the test data is dropped. Without this,
# transformer.transform(X_tst) would receive a different column layout from the
# one the transformer was fitted on.
X_tst = pd.get_dummies(X_tst[features]).reindex(columns=X_trn.columns, fill_value=0)

In [None]:
# First rows of the one-hot encoded training features.
X_trn.head()

In [None]:
# Fit the imputer on X_trn, transform it, and have a look.
# The union output is a plain array, so the column names are re-attached here:
# the input columns plus one extra name for the missing-value indicator.
trn_cols = [*X_trn.columns, "AgeMissing"]
X_trn_trans = pd.DataFrame(transformer.fit_transform(X_trn, y_trn), columns=trn_cols)

In [None]:
# Summary statistics for the columns of interest in the imputed training data.
disp_cols = ["PassengerId", "Pclass", "Sex_female", "Sex_male", "SibSp", "Parch", "Age"]
X_trn_trans.loc[:, disp_cols].describe()

In [None]:
# looks better -- apply the already-fitted transformer to X_tst (no refit)
# and re-attach column names the same way as for the training data.
tst_cols = [*X_tst.columns, "AgeMissing"]
X_tst_trans = pd.DataFrame(transformer.transform(X_tst), columns=tst_cols)

In [None]:
# Summary statistics for the columns of interest in the imputed test data.
disp_cols = ["PassengerId", "Pclass", "Sex_female", "Sex_male", "SibSp", "Parch", "Age"]
X_tst_trans.loc[:, disp_cols].describe()

In [None]:
# new updated training dataset dataframe
# Start from the loaded data, discard the stale AgeMissing flag, then take both
# Age and AgeMissing from the imputer output.
# Age is assigned on the frame itself: writing through a temporary slice
# (`frame[:].Age = ...`) is chained assignment and does not reliably update the
# frame (never under pandas Copy-on-Write).
k_trn_2 = k_trn.copy()
k_trn_2 = k_trn_2.drop("AgeMissing", axis=1)
k_trn_2["Age"] = X_trn_trans["Age"]
k_trn_2 = pd.concat([k_trn_2, X_trn_trans["AgeMissing"]], axis=1)
# new updated testing dataset dataframe (same steps)
k_tst_2 = k_tst.copy()
k_tst_2 = k_tst_2.drop("AgeMissing", axis=1)
k_tst_2["Age"] = X_tst_trans["Age"]
k_tst_2 = pd.concat([k_tst_2, X_tst_trans["AgeMissing"]], axis=1)

In [None]:
# Summary statistics of the updated training frame.
k_trn_2.describe()

In [None]:
# Summary statistics of the updated test frame.
k_tst_2.describe()

In [None]:
# Column dtypes and non-null counts of the updated training frame.
k_trn_2.info()

In [None]:
# glad I did that, almost forgot to update AgeBin again
# AgeBin is rebuilt from the Age in k_trn_2 / k_tst_2, the frames that are about
# to be saved. Binning k_trn / k_tst instead would bucket the old ages and leave
# AgeBin out of step with the Age column written next to it.
# NOTE(review): pd.cut intervals are right-inclusive by default, so an age of
# exactly 30 lands in '16-29' and exactly 15 in '0-15'; labels are kept as they were.
bin_thresholds = [0, 15, 30, 40, 59, 90]
bin_labels = ['0-15', '16-29', '30-40', '41-59', '60+']
k_trn_2['AgeBin'] = pd.cut(k_trn_2['Age'], bins=bin_thresholds, labels=bin_labels)
k_tst_2['AgeBin'] = pd.cut(k_tst_2['Age'], bins=bin_thresholds, labels=bin_labels)

In [None]:
# Re-check the training frame after AgeBin was rebuilt.
k_trn_2.info()

In [None]:
# Re-check the test frame after AgeBin was rebuilt.
k_tst_2.info()

In [None]:
# Write the updated frames back over the same CSV files they were loaded from
# (no index column), so a later run of this notebook starts from these values.
k_trn_2.to_csv(path_or_buf=oma_trn_3, index=False)
k_tst_2.to_csv(path_or_buf=oma_tst_3, index=False)