In [None]:
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting 
import seaborn as sns # for plotting

In [None]:
# set up some notebook display defaults
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
plt.style.use('default')
sns.set()
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Relative locations of the Titanic CSV files used in this notebook:
# the Kaggle train/test files plus the "rek_test_2" variant of the test file.
kaggle_trn, kaggle_tst, rek_k_tst2 = (
    "./data/titanic/train.csv",
    "./data/titanic/test.csv",
    "./data/titanic/rek_test_2.csv",
)

In [None]:
# Read the frames used below: the training CSV and the rek_test_2 CSV.
# Note that k_tst comes from rek_k_tst2; kaggle_tst is not read in this cell.
k_trn = pd.read_csv(filepath_or_buffer=kaggle_trn)
k_tst = pd.read_csv(filepath_or_buffer=rek_k_tst2)

In [None]:
# a wee reminder of what the training frame holds:
# prints the column names, non-null counts and dtypes of k_trn
k_trn.info()

In [None]:
# Question: did the fare paid influence the survival rate?
# Start with the summary statistics of the Fare column.
k_trn["Fare"].describe()

In [None]:
# Histogram of Fare (50 bins) with a KDE curve overlaid, drawn on a 5x5 figure.
# Results are bound to p_* names so the cell does not echo their reprs.
fig, ax = plt.subplots(figsize=(5, 5))
p_plt = sns.histplot(data=k_trn, x="Fare", bins=50, kde=True, color='b', ax=ax)
p_ttl = ax.set_title("Fare Distribution")

In [None]:
# categorize fares and plot against survival - let's just use the quartiles for now
# pd.qcut splits Fare into len(f_cats) equal-frequency bins and names them with f_cats
f_cats = ['Lowest', "Medium", "High", "Highest"]
f_rngs = pd.qcut(k_trn["Fare"], len(f_cats), labels=f_cats)
fig, ax = plt.subplots(figsize=(5, 5))
# Bar height is barplot's default estimate (the mean) of the 0/1 Survived column
# within each fare bin, i.e. the survival rate.
# NOTE(review): seaborn >= 0.12 deprecates `ci=` in favour of `errorbar=None`;
# kept as-is because the seaborn version in use is not visible here.
p_plt = sns.barplot(x=f_rngs, y=k_trn.Survived, ax=ax, ci=None)
# Title and y label are applied AFTER plotting: depending on the seaborn version,
# barplot relabels the axes from the variable names and would replace a y label
# set beforehand with "Survived".
p_ttl = ax.set_title("Survival vs Fare")
p_y = ax.set_ylabel("Survival Rate")

In [None]:
# let's add gender to the equation: one swarm of fares per sex, coloured by survival
fig, ax = plt.subplots(figsize=(20,8))
ax.grid(True)
p_ttl = ax.set_title("Survival by Fare and Gender")
# The x axis carries the categorical Sex column, so no numeric tick positions are
# forced onto it; seaborn places one tick per category.
# The plot is drawn on `ax` explicitly instead of relying on it being the current axes.
p_p2 = sns.swarmplot(y="Fare", x="Sex", data=k_trn, hue="Survived", size=5, ax=ax)

In [None]:
# let's have a look at SibSp and Parch features
# First SibSp: bar height is barplot's default estimate (the mean) of the 0/1
# Survived column for each SibSp value, i.e. the survival rate.
fig, ax = plt.subplots(figsize=(5, 5))
# NOTE(review): seaborn >= 0.12 deprecates `ci=` in favour of `errorbar=None`;
# kept as-is because the seaborn version in use is not visible here.
p_plt = sns.barplot(x="SibSp", y="Survived", data=k_trn, ax=ax, ci=None)
# Title and y label are applied AFTER plotting: depending on the seaborn version,
# barplot relabels the axes from the column names and would replace a y label
# set beforehand with "Survived".
p_ttl = ax.set_title("Survival vs Siblings/Spouses Aboard")
p_y = ax.set_ylabel("Survival Rate")

In [None]:
# Survival rate per Parch value: bar height is barplot's default estimate
# (the mean) of the 0/1 Survived column within each group.
fig, ax = plt.subplots(figsize=(5, 5))
# NOTE(review): seaborn >= 0.12 deprecates `ci=` in favour of `errorbar=None`;
# kept as-is because the seaborn version in use is not visible here.
p_plt = sns.barplot(x="Parch", y="Survived", data=k_trn, ax=ax, ci=None)
# Title and y label are applied AFTER plotting: depending on the seaborn version,
# barplot relabels the axes from the column names and would replace a y label
# set beforehand with "Survived".
p_ttl = ax.set_title("Survival vs Parents/Children Aboard")
p_y = ax.set_ylabel("Survival Rate")

In [None]:
# Passenger counts for each SibSp value, with separate bars per Survived value.
fig, ax = plt.subplots(figsize=(5, 5))
p_p = sns.countplot(data=k_trn, x="SibSp", hue="Survived", ax=ax)
p_ttl = ax.set_title("Survival Count by Siblings/Spouses Onboard")

In [None]:
# Passenger counts for each Parch value, with separate bars per Survived value.
fig, ax = plt.subplots(figsize=(5, 5))
p_p = sns.countplot(data=k_trn, x="Parch", hue="Survived", ax=ax)
p_ttl = ax.set_title("Survival Count by Parents/Children Onboard")

In [None]:
# Derive a "family size" feature: parents/children plus siblings/spouses,
# plus one to count the passenger themselves. Adds the column to k_trn in place.
k_trn["FamilySize"] = k_trn["Parch"].add(k_trn["SibSp"]).add(1)

In [None]:
# Passenger counts for each FamilySize value, with separate bars per Survived value.
fig, ax = plt.subplots(figsize=(5, 5))
p_p = sns.countplot(data=k_trn, x="FamilySize", hue="Survived", ax=ax)
p_ttl = ax.set_title("Survival Count by Family Size Onboard")

In [None]:
# summary statistics for the Cabin column
k_trn.Cabin.describe()

In [None]:
# summary statistics for the Ticket column
k_trn.Ticket.describe()

In [None]:
# summary statistics for the Name column
k_trn.Name.describe()