In [None]:
from IPython.core.interactiveshell import InteractiveShell
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting 
import seaborn as sns # for plotting

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer, MissingIndicator, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import FeatureUnion, make_pipeline

In [None]:
# set up some notebook display defaults
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
plt.style.use('default')
sns.set()
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Relative locations of the Titanic CSV files used by this notebook:
# the Kaggle train/test files and the "oma_*_3" train/test variants.
kaggle_trn, kaggle_tst = (
    "./data/titanic/train.csv",
    "./data/titanic/test.csv",
)
oma_trn_3, oma_tst_3 = (
    "./data/titanic/oma_trn_3.csv",
    "./data/titanic/oma_tst_3.csv",
)

In [None]:
# Read the train/test files currently under study (the oma_*_3 variants) and
# also keep a stacked copy: training rows first, then test rows, with the
# index renumbered from 0.
k_trn, k_tst = (pd.read_csv(csv_path) for csv_path in (oma_trn_3, oma_tst_3))
k_all = pd.concat([k_trn, k_tst], ignore_index=True)

In [None]:
# Missing values are filled by an IterativeImputer inside the pipeline, while
# a MissingIndicator runs in parallel so the flags marking which values were
# absent are appended to the imputed features.
# (KNNImputer is a candidate to try later.)
imputation_steps = [
    ('features', IterativeImputer(max_iter=10, random_state=0)),
    ('indicators', MissingIndicator()),
]
transformer = FeatureUnion(transformer_list=imputation_steps)
rf_estimator = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
clf = make_pipeline(transformer, rf_estimator)

In [None]:
# Target: the survival label from the training frame.
y_trn = k_trn['Survived']

# Predictor columns; get_dummies one-hot encodes the non-numeric ones
# (presumably just 'Sex' -- confirm against the CSV).
features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Age']
X_trn = pd.get_dummies(k_trn[features])
# Encoding train and test independently can produce different (or differently
# ordered) dummy columns when a category is absent from one of the files.
# Align the test matrix to the training layout: dummy columns missing from
# the test data become all-zero, columns never seen in training are dropped.
# When both encodings already agree this is a no-op.
X_test = pd.get_dummies(k_tst[features]).reindex(columns=X_trn.columns, fill_value=0)

In [None]:
# Column labels for the transformed matrices: the encoded feature names
# followed by one extra label for the missing-value indicator column.
trn_cols = list(X_trn.columns) + ["AgeMissing"]
tst_cols = list(X_test.columns) + ["AgeMissing"]

In [None]:
# Fit the imputation + random-forest pipeline on the training data, predict
# the test rows and score the predictions against the test frame's own
# 'Survived' column.
preds = clf.fit(X_trn, y_trn).predict(X_test)
accuracy_score(y_true=k_tst["Survived"], y_pred=preds)

In [None]:
# Look at the data as the final estimator sees it, i.e. after every pipeline
# step except the last one has been applied, for both training and test data.
X_trn.tail()

# Training data: push the frame through each pre-estimator step in order.
x_intermediate = X_trn
for step in clf.steps[:-1]:
    x_intermediate = step[1].transform(x_intermediate)
    x_int_trn_trans = pd.DataFrame(x_intermediate, columns=trn_cols)
    x_int_trn_trans.tail()

# Test data: same walk. Each step's output is carried into the next step,
# exactly as in the training loop, so the result stays correct if the
# pipeline ever gains more than one pre-estimator step.
x_tst_int = X_test
for step in clf.steps[:-1]:
    x_tst_int = step[1].transform(x_tst_int)
    x_int = x_tst_int  # keep the older name bound to the latest output
    x_int_tst_trans = pd.DataFrame(x_tst_int, columns=tst_cols)
    x_int_tst_trans.tail()

In [None]:
# Cross-check the pipeline: run the imputation/indicator union by hand and
# wrap its output in labelled DataFrames.
# NOTE(review): `transformer` appears to be the same object used as the first
# step of `clf`, so fitting it here refits that step as well (on the same
# training data) -- confirm this is acceptable.
X_trn_trans = transformer.fit_transform(X_trn, y_trn)

# Name the indicator columns after the features the fitted MissingIndicator
# actually flagged instead of assuming a single "AgeMissing" column; with
# only Age containing gaps this yields exactly ["AgeMissing"].
indicator = dict(transformer.transformer_list)["indicators"]
missing_cols = [f"{X_trn.columns[i]}Missing" for i in indicator.features_]

trn_cols = X_trn.columns.tolist() + missing_cols
X_trn_trans = pd.DataFrame(X_trn_trans, columns=trn_cols)

tst_cols = X_test.columns.tolist() + missing_cols
X_tst_trans = transformer.transform(X_test)
X_tst_trans = pd.DataFrame(X_tst_trans, columns=tst_cols)

In [None]:
# A forest with the same hyper-parameters as the pipeline's final step,
# trained directly on the hand-transformed frames; presumably a sanity check
# that its accuracy matches the pipeline's.
rf_params = dict(n_estimators=100, max_depth=5, random_state=1)
model = RandomForestClassifier(**rf_params)
model.fit(X_trn_trans, y_trn)
predictions = model.predict(X_tst_trans)
accuracy_score(y_true=k_tst["Survived"], y_pred=predictions)

In [None]:
# Second variant: identical pipeline layout, but missing values are filled
# by a KNNImputer using 5 neighbours.
knn_steps = [
    ('features', KNNImputer(n_neighbors=5)),
    ('indicators', MissingIndicator()),
]
transformer_2 = FeatureUnion(transformer_list=knn_steps)
clf_2 = make_pipeline(
    transformer_2,
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1),
)

In [None]:
# Fit the KNN-imputer pipeline, predict the test rows and score them against
# the test frame's 'Survived' column.
preds_2 = clf_2.fit(X_trn, y_trn).predict(X_test)
accuracy_score(y_true=k_tst["Survived"], y_pred=preds_2)

In [None]:
# Sweep a handful of n_neighbors values for the KNN imputer and print the
# test accuracy obtained with each; the forest settings stay fixed.
for kn in (2, 4, 6, 8, 10, 12):
    transformer_3 = FeatureUnion(transformer_list=[
        ('features', KNNImputer(n_neighbors=kn)),
        ('indicators', MissingIndicator()),
    ])
    clf_3 = make_pipeline(
        transformer_3,
        RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1),
    )
    preds_3 = clf_3.fit(X_trn, y_trn).predict(X_test)
    score = accuracy_score(k_tst["Survived"], preds_3)
    print(f"k_neighbors={kn} -> {score}")