In [None]:
from IPython.core.interactiveshell import InteractiveShell
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting 
import seaborn as sns # for plotting

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import FeatureUnion, make_pipeline

In [None]:
# set up some notebook display defaults
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
plt.style.use('default')
sns.set()
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Dataset locations, relative to the notebook's working directory.
# Train/test CSVs (presumably the raw Kaggle files -- confirm).
kaggle_trn, kaggle_tst = "./data/titanic/train.csv", "./data/titanic/test.csv"
# Stage-3 train/test CSVs; these are the files this notebook loads and rewrites.
oma_trn_3, oma_tst_3 = "./data/titanic/oma_trn_3.csv", "./data/titanic/oma_tst_3.csv"

In [None]:
# load the datasets currently of interest
k_trn = pd.read_csv(oma_trn_3)
k_tst = pd.read_csv(oma_tst_3)
# Combined train + test frame with a fresh 0..n-1 index. pd.concat builds a
# new frame, so k_trn and k_tst stay untouched and no explicit .copy() is
# needed beforehand.
k_all = pd.concat([k_trn, k_tst], ignore_index=True)

In [None]:
# Explore survival by age: cut Age into 5-year intervals (edges 0, 5, ..., 85)
# and plot the mean of Survived for each interval.
sns.set(font_scale=1.0, rc={'figure.figsize': (12, 8)})
k_all['AgeRng'] = pd.cut(k_all['Age'], bins=range(0, 90, 5))
g = sns.barplot(data=k_all, x='AgeRng', y='Survived')

# Row counts per age interval (rows) for each Survived value (columns).
table = pd.crosstab(index=k_all['AgeRng'], columns=k_all['Survived'])
print('\n', table)

- 0-15: survival pretty good
- 30-40: looks to be a slight increase in survival rate
- 60+: survival rate looks to decline

So, I'm looking at using the following ranges: ['0-15', '16-29', '30-40', '41-59', '60+']

In [None]:
# Assign each passenger to a labelled age bin.
# Edges are left-closed ([lo, hi), via right=False) so every age lands in the
# bin its label names, including fractional and boundary ages:
#   0 -> '0-15', 15.5 -> '0-15', 30 -> '30-40', 59.5 -> '41-59', 60 -> '60+'.
# With the previous right-closed edges [0, 15, 30, 40, 59, 90], age 30 was
# labelled '16-29', ages between 59 and 60 were labelled '60+', and an age of
# exactly 0 or above 90 became NaN.
# The last edge is infinite so '60+' has no upper limit; missing ages stay NaN.
bin_thresholds = [0, 16, 30, 41, 60, np.inf]
bin_labels = ['0-15', '16-29', '30-40', '41-59', '60+']
k_trn['AgeBin'] = pd.cut(k_trn['Age'], bins=bin_thresholds, labels=bin_labels, right=False)
k_tst['AgeBin'] = pd.cut(k_tst['Age'], bins=bin_thresholds, labels=bin_labels, right=False)

In [None]:
# Spot-check the last five rows of the training frame.
k_trn.tail(n=5)

In [None]:
# Spot-check the last five rows of the test frame.
k_tst.tail(n=5)

In [None]:
# Write both frames back to the stage-3 CSV paths they were loaded from,
# omitting the DataFrame index column.
k_trn.to_csv(path_or_buf=oma_trn_3, index=False)
k_tst.to_csv(path_or_buf=oma_tst_3, index=False)