In [None]:
from IPython.core.interactiveshell import InteractiveShell
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting 
import seaborn as sns # for plotting

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# set up some notebook display defaults
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
plt.style.use('default')
sns.set()
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# paths to datasets
# Every CSV lives in the same directory; naming it once makes relocating the
# data a one-line change instead of five.
DATA_DIR = "./data/titanic"
kaggle_trn = DATA_DIR + "/train.csv"
kaggle_tst = DATA_DIR + "/test.csv"
rek_k_tst2 = DATA_DIR + "/rek_test_2.csv"
oma_trn = DATA_DIR + "/oma_trn.csv"
oma_tst = DATA_DIR + "/oma_tst.csv"

In [None]:
# read the two splits this notebook currently works on (train first, then test)
k_trn, k_tst = (pd.read_csv(csv_path) for csv_path in (oma_trn, oma_tst))

In [None]:
# training rows on the 'LINE' ticket
k_trn.loc[k_trn['Ticket'].eq('LINE')]
# test rows on 'LINE' or on either of two specific ticket numbers
k_tst.loc[k_tst['Ticket'].isin(['LINE', '3701', '392095'])]

In [None]:
# rows whose recorded fare is exactly zero, in each split
k_trn.loc[k_trn['Fare'].eq(0.0)]
k_tst.loc[k_tst['Fare'].eq(0.0)]

In [None]:
# training rows whose ticket contains '2398'
k_trn.loc[k_trn["Ticket"].str.contains('2398')]

In [None]:
# rows in either split whose ticket contains '11205'
k_trn.loc[k_trn["Ticket"].str.contains('11205')]
k_tst.loc[k_tst["Ticket"].str.contains('11205')]

In [None]:
# training rows whose ticket contains '1997'
k_trn.loc[k_trn["Ticket"].str.contains('1997')]

In [None]:
# that didn't do much, let's try '199'
# (restricted to Pclass 1 rows with SibSp and Parch both 0)
k_trn.loc[
    k_trn["Ticket"].str.startswith('199')
    & k_trn["Pclass"].eq(1)
    & k_trn["SibSp"].eq(0)
    & k_trn["Parch"].eq(0)
]

In [None]:
# let's dig a little further into that group with tickets '11205..'

In [None]:
# Pclass 1 rows with SibSp and Parch both 0 whose cabin starts with 'B',
# ordered by ticket then cabin, for each split.
# na=False: rows with a missing Cabin count as "no match" explicitly, so the
# mask is plain boolean instead of depending on how `&` coerces NaN.
k_trn[(k_trn["Cabin"].str.startswith('B', na=False)) & (k_trn["Pclass"] == 1) & (k_trn["SibSp"] == 0) & (k_trn["Parch"] == 0)].sort_values(['Ticket', 'Cabin'])
k_tst[(k_tst["Cabin"].str.startswith('B', na=False)) & (k_tst["Pclass"] == 1) & (k_tst["SibSp"] == 0) & (k_tst["Parch"] == 0)].sort_values(['Ticket', 'Cabin'])

In [None]:
# training rows whose Cabin value is present and contains a space,
# ordered by ticket then cabin
k_trn.loc[
    k_trn["Cabin"].notna() & k_trn["Cabin"].str.contains(' ')
].sort_values(by=['Ticket', 'Cabin'])

In [None]:
# Pclass 1 training rows with a positive fare, cheapest first
k_trn.loc[k_trn["Pclass"].eq(1) & k_trn["Fare"].gt(0)].sort_values(by='Fare')


In [None]:
# ten cheapest positive-fare Pclass 1 training rows whose cabin starts with 'B'
# na=False: rows with a missing Cabin count as "no match" explicitly, so the
# mask is plain boolean instead of depending on how `&` coerces NaN.
k_trn[(k_trn["Pclass"] == 1) & (k_trn["Fare"] > 0) & (k_trn["Cabin"].str.startswith('B', na=False))].sort_values(['Fare']).head(10)

In [None]:
# training rows whose Cabin value is present and contains 'B9',
# ordered by ticket then cabin
k_trn.loc[
    k_trn["Cabin"].notna() & k_trn["Cabin"].str.contains('B9')
].sort_values(by=['Ticket', 'Cabin'])

In [None]:
# following the discussion in the related post, replace the zero fares
# first up: the Storey group
def_t = "370160"
base_f = 7.25
# index labels of the group members in each split
g1a = k_trn.index[k_trn['Ticket'].eq('LINE')].tolist()
g1b = k_tst.index[k_tst['Ticket'].isin(['LINE', '3701'])].tolist()
print(g1a, g1b)
# the group fare is the base fare times the head count across both splits
nbr_p = len(g1a) + len(g1b)
grp_f = base_f * nbr_p
# every member gets the same replacement ticket and the same group fare
k_trn.loc[g1a, "Ticket"] = def_t
k_trn.loc[g1a, "Fare"] = grp_f
k_tst.loc[g1b, "Ticket"] = def_t
k_tst.loc[g1b, "Fare"] = grp_f
# show the updated rows
k_trn.loc[k_trn["Ticket"].eq(def_t)]
k_tst.loc[k_tst["Ticket"].eq(def_t)]

In [None]:
# now the zero-fare group whose tickets contain '2398'
base_f = 13.00
g3_t = '239853'
# rows on ticket g3_t are charged three base fares
# (presumably three passengers share it -- TODO confirm)
g3_f = base_f * 3
# Build both masks before any fare is changed so the selection is not
# affected by the updates below.
zero_fare_2398 = k_trn["Ticket"].str.contains('2398') & (k_trn["Fare"] == 0.0)
on_g3_ticket = k_trn["Ticket"] == g3_t
g2 = list(k_trn.index[zero_fare_2398])
print(g2)
# Label-based boolean masks replace the old per-row `k_trn.iloc[i]` lookup,
# which passed index *labels* to a positional indexer and was only correct on
# a default RangeIndex.
k_trn.loc[zero_fare_2398 & on_g3_ticket, "Fare"] = g3_f
k_trn.loc[zero_fare_2398 & ~on_g3_ticket, "Fare"] = base_f
k_trn[(k_trn["Ticket"].str.contains('2398'))]

In [None]:
# next, Mr. Reuchlin's fare
# select by the surname prefix of the Name column, overwrite the fare, show the row
mask = k_trn['Name'].str.startswith("Reuchlin,")
k_trn.loc[mask, ['Fare']] = 30.5
k_trn.loc[mask]

In [None]:
# final group: the party that includes the head of the White Star Line
# all selections are taken up front, before any row is modified
boss = k_tst['Name'].str.startswith("Ismay,")
valet = k_trn['Name'].str.startswith("Fry,")
secretary = k_trn['Ticket'].str.startswith("112059")
others1 = k_trn.index[k_trn["Ticket"].isin(['112052', '112050'])].tolist()
others2 = k_tst.index[k_tst["Ticket"].eq("112051")].tolist()
k_tst.loc[boss, "Fare"] = 512.33
# the valet gets a new ticket value as well as a fare
k_trn.loc[valet, "Ticket"] = "112058A"
k_trn.loc[valet, "Fare"] = 30.00
k_trn.loc[secretary, "Fare"] = 60.00
# the remaining members, addressed by index label in each split
k_trn.loc[others1, "Fare"] = 30.00
k_tst.loc[others2, "Fare"] = 30.00
# show the group after the updates
k_trn.loc[k_trn["Ticket"].str.contains('11205')]
k_tst.loc[k_tst["Ticket"].str.contains('11205')]

In [None]:
# let's double check we have no more zero or nan fares
# Check both splits: the test frame had its fares edited too, and its Fare
# column is fed to the model, so a leftover NaN/zero there matters as well.
k_trn[(k_trn["Fare"].isnull()) | (k_trn["Fare"] == 0.0)]
k_tst[(k_tst["Fare"].isnull()) | (k_tst["Fare"] == 0.0)]

In [None]:
# persist the corrected frames under new file names (row index not written)
oma_trn_2, oma_tst_2 = "./data/titanic/oma_trn_2.csv", "./data/titanic/oma_tst_2.csv"
k_trn.to_csv(path_or_buf=oma_trn_2, index=False)
k_tst.to_csv(path_or_buf=oma_tst_2, index=False)

In [None]:
# let's see if adding Fare to our basic set of features improves our score
Y = k_trn['Survived']

features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']
X = pd.get_dummies(k_trn[features])
# The two splits are one-hot encoded independently, so a category missing from
# one of them would give the frames different columns. Reindex the test matrix
# onto the training columns (absent dummies become 0, extras are dropped) so
# predict() always sees the layout the model was fitted on.
X_test = pd.get_dummies(k_tst[features]).reindex(columns=X.columns, fill_value=0)
# fixed random_state keeps the forest reproducible between runs
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, Y)
predictions = model.predict(X_test)
# accuracy against the labels held in the test frame
accuracy_score(k_tst["Survived"], predictions)