In [None]:
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting 
import seaborn as sns # for plotting

In [None]:
# set up some notebook display defaults
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
plt.style.use('default')
sns.set()
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Locations of the Titanic CSV files, relative to the notebook's working directory:
# the Kaggle training file, the Kaggle test file, and the rek_test_2 variant of the test file.
kaggle_trn, kaggle_tst, rek_k_tst2 = (
    "./data/titanic/train.csv",
    "./data/titanic/test.csv",
    "./data/titanic/rek_test_2.csv",
)

In [None]:
# Read the two datasets currently of interest into DataFrames:
# k_trn from the Kaggle training file, k_tst from the rek_test_2 file.
k_trn, k_tst = (pd.read_csv(csv_path) for csv_path in (kaggle_trn, rek_k_tst2))

In [None]:
# list the column labels of the training dataset
print(k_trn.columns)

In [None]:
# summary of the training frame; info() prints it directly (used below to spot missing data)
k_trn.info()

In [None]:
# looks to be missing data in the training dataset, what about the test dataset
# Heatmap of the null mask of k_tst: one heatmap row per row of the frame,
# one column per feature; cells where isnull() is True stand out.
fig, ax = plt.subplots(figsize=(5, 5))
# The title names the dataset that is actually plotted (k_tst, the test data).
o_ttl = ax.set_title("Test Dataset: Missing Values")
# Draw on the axes created above rather than relying on pyplot's current axes.
o_map = sns.heatmap(k_tst.isnull(), cbar=False, cmap="rocket_r", ax=ax)

In [None]:
# gender vs survival: mean of the Survived column within each Sex group
k_trn.groupby('Sex')['Survived'].mean()

In [None]:
# or visually: one stacked bar per outcome, split by the number of passengers of each sex
survived_mask = k_trn["Survived"] == 1
died_mask = k_trn["Survived"] == 0
srvvd = k_trn.loc[survived_mask, "Sex"].value_counts()
not_s = k_trn.loc[died_mask, "Sex"].value_counts()
# one row per outcome, one column per sex
df_s_no = pd.DataFrame([srvvd, not_s])
df_s_no.index = ['Survived', 'Did Not']
ax = df_s_no.plot.bar(stacked=True, figsize=(5, 5))

In [None]:
# age vs survival -- start with the overall age distribution (30-bin histogram plus KDE curve)
fig, ax = plt.subplots(figsize=(5, 5))
p_ttl = ax.set_title("Age Distribution")
p_plt = sns.histplot(data=k_trn, x="Age", bins=30, kde=True, color='b', ax=ax)

In [None]:
# Overlay the age density of survivors and of non-survivors on one set of axes.
fig, ax = plt.subplots(figsize=(5, 5))
p_ttl = ax.set_title("Age Distribution by Survival Status")
p_x = ax.set_xlabel("Age")
# `fill` is the current spelling of the deprecated `shade` keyword of kdeplot.
p_p1 = sns.kdeplot(k_trn["Age"].loc[k_trn["Survived"] == 1], ax=ax, label='Survived', fill=True)
p_p2 = sns.kdeplot(k_trn["Age"].loc[k_trn["Survived"] == 0], ax=ax, label='Did Not', fill=True)
# The labels above are only visible once a legend is drawn; without it the
# two curves cannot be told apart.
p_lgd = ax.legend()

In [None]:
# not much info, try something else:
# one swarm of points per sex along the age axis, coloured by the Survived value
fig, ax = plt.subplots(figsize=(15, 5))
ax.grid(True)
p_ttl = ax.set_title("Survival by Age and Gender")
# a tick every 2 years along the age (x) axis
p_xtk = ax.set_xticks(list(range(0, 100, 2)))
p_p2 = sns.swarmplot(data=k_trn, x="Age", y="Sex", hue="Survived", ax=ax)

In [None]:
# let's checkout Pclass: mean of Survived within each passenger class, shown as a one-column frame
k_trn.groupby('Pclass')['Survived'].mean().to_frame()

In [None]:
# how about a chart
# (same recipe as the Sex chart above -- a shared helper function would be the tidy way)
survived_mask = k_trn["Survived"] == 1
died_mask = k_trn["Survived"] == 0
srvvd = k_trn.loc[survived_mask, "Pclass"].value_counts()
not_s = k_trn.loc[died_mask, "Pclass"].value_counts()
# one row per outcome, one column per passenger class
df_s_no = pd.DataFrame([srvvd, not_s])
df_s_no.index = ['Survived', 'Did Not']
ax = df_s_no.plot.bar(stacked=True, figsize=(5, 5))

In [None]:
# to help understand the above, let's look at the numbers:
# count of non-null Ticket values for every Survived / Pclass combination
k_trn.pivot_table(values='Ticket', index='Survived', columns='Pclass', aggfunc='count')

In [None]:
# mean of Survived for every (Pclass, Sex) combination, shown as a one-column frame
k_trn.groupby(['Pclass', 'Sex'])['Survived'].mean().to_frame()

In [None]:
# let's also look at Pclass vs Age
# one swarm of points per passenger class (x axis), spread along age (y axis),
# coloured by the Survived value
fig, ax = plt.subplots(figsize=(15, 5))
ax.grid(True)
p_ttl = ax.set_title("Survival by Age and Pclass")
p_p2 = sns.swarmplot(y="Age", x="Pclass", data=k_trn, hue="Survived", ax=ax)
# Age is on the y axis in this plot, so the age ticks belong there; putting
# numeric age ticks on the x axis would fight with the Pclass categories.
# A step of 5 keeps the labels legible on the 5-inch-high figure.
p_ytk = ax.set_yticks(list(range(0, 100, 5)))

In [None]:
# how about point of embarkation
# first the number of passengers per port, then the count of non-null Ticket
# values for every Survived / Embarked combination
k_trn['Embarked'].value_counts().to_frame()
k_trn.pivot_table(values="Ticket", index="Survived", columns="Embarked", aggfunc="count")

In [None]:
# why is the survival rate highest at Cherbourg and lowest at Southampton?
# is it due to the numbers of first and/or third class passengers embarking at those ports?
#k_trn.groupby(['Embarked', 'Pclass']).Survived.sum().to_frame()
def count_zeros(s):
    """Return how many elements of ``s`` are equal to 0.

    Parameters
    ----------
    s : pandas.Series
        Values to inspect, e.g. one group of a 0/1 indicator column.

    Returns
    -------
    int
        Number of elements that compare equal to 0. Missing values and any
        other non-zero values are not counted.
    """
    # Count the zeros directly. `size - sum` is only right for a clean 0/1
    # column: a NaN is included in the size but skipped by the sum, so it
    # would be counted as a zero.
    return (s == 0).sum()
# For every (Embarked, Pclass) group: the sum of Survived, and the count_zeros result for Survived.
# Each keyword names an output column and gives it as (source column, aggregation).
k_trn.groupby(['Embarked', 'Pclass']).agg(
    Survived=('Survived', 'sum'),
    DidNot=('Survived', count_zeros),
)

In [None]:
# did gender balance make difference in survival rates Queenstown vs Southampton
# passenger counts per embarkation point, with one bar per sex
fig, ax = plt.subplots(figsize=(5, 5))
# The bars are split by Sex (the hue), so the title says gender rather than class.
p_ttl = ax.set_title("Count by Embarkation Point and Gender")
p_p = sns.countplot(x="Embarked", hue="Sex", data=k_trn, ax=ax)