{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt # for plotting \n",
    "import seaborn as sns # for plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# notebook-wide display configuration\n",
    "# plotting: enable the inline backend, reset matplotlib to its default style, then apply seaborn's defaults on top\n",
    "%matplotlib inline\n",
    "plt.style.use(\"default\")\n",
    "sns.set()\n",
    "# pandas: render floats with thousands separators and two decimals\n",
    "pd.options.display.float_format = \"{:,.2f}\".format\n",
    "# IPython: echo the value of every expression statement in a cell, not only the last one\n",
    "InteractiveShell.ast_node_interactivity = \"all\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# paths to datasets, relative to the directory the notebook is run from\n",
    "kaggle_trn = \"./data/titanic/train.csv\"\n",
    "# NOTE(review): kaggle_tst is not referenced by any other cell visible in this notebook - confirm it is still needed\n",
    "kaggle_tst = \"./data/titanic/test.csv\"\n",
    "# presumably a reworked copy of the test set (going by the file name) - TODO confirm provenance\n",
    "rek_k_tst2 = \"./data/titanic/rek_test_2.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the datasets currently of interest\n",
    "# k_trn: training data read from the kaggle_trn path\n",
    "k_trn = pd.read_csv(kaggle_trn)\n",
    "# k_tst: read from rek_k_tst2, not from kaggle_tst\n",
    "k_tst = pd.read_csv(rek_k_tst2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a wee reminder of the training frame's structure, as reported by DataFrame.info()\n",
    "k_trn.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Question: did the fare paid influence the survival rate?\n",
    "# Start with the summary statistics of the Fare feature.\n",
    "k_trn[\"Fare\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# histogram of the Fare column (50 bins) with a KDE curve overlaid\n",
    "# return values are bound to names so they are not echoed as cell output\n",
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "p_plt = sns.histplot(data=k_trn, x=\"Fare\", bins=50, kde=True, color=\"b\", ax=ax)\n",
    "p_ttl = ax.set_title(\"Fare Distribution\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# categorize fares and plot against survival - let's just use the quartiles for now\n",
    "# one quantile bin per label, so len(f_cats) controls the number of bins\n",
    "f_cats = ['Lowest', \"Medium\", \"High\", \"Highest\"]\n",
    "f_rngs = pd.qcut(k_trn[\"Fare\"], len(f_cats), labels=f_cats)\n",
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "# bar height is the mean of Survived within each fare bin, i.e. the survival rate\n",
    "# NOTE(review): `ci` is deprecated from seaborn 0.12 in favour of `errorbar=None`; kept because the\n",
    "# seaborn version this notebook targets is not visible here - confirm before changing\n",
    "p_plt = sns.barplot(x=f_rngs, y=k_trn.Survived, ax=ax, ci=None)\n",
    "# title and y label are set *after* plotting: seaborn's categorical plots write the variable\n",
    "# names onto the axes, and some releases replace a label that was set beforehand\n",
    "p_ttl = ax.set_title(\"Survival vs Fare\")\n",
    "p_y = ax.set_ylabel(\"Survival Rate\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's add gender to the equations\n",
    "# one point per passenger: Sex on the (categorical) x axis, Fare on y, coloured by Survived\n",
    "fig, ax = plt.subplots(figsize=(20, 8))\n",
    "ax.grid(True)\n",
    "p_ttl = ax.set_title(\"Survival by Fare and Gender\")\n",
    "# no numeric x ticks are forced here: the x axis holds the Sex categories and seaborn places\n",
    "# those ticks itself; the target axes is passed explicitly rather than relying on the current axes\n",
    "p_p2 = sns.swarmplot(y=\"Fare\", x=\"Sex\", data=k_trn, hue=\"Survived\", size=5, ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's have a look at SibSp and Parch features\n",
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "# bar height is the mean of Survived for each SibSp value, i.e. the survival rate\n",
    "# NOTE(review): `ci` is deprecated from seaborn 0.12 in favour of `errorbar=None`; kept because the\n",
    "# seaborn version this notebook targets is not visible here - confirm before changing\n",
    "p_plt = sns.barplot(x=\"SibSp\", y=\"Survived\", data=k_trn, ax=ax, ci=None)\n",
    "# title and y label are set *after* plotting: seaborn's categorical plots write the variable\n",
    "# names onto the axes, and some releases replace a label that was set beforehand\n",
    "p_ttl = ax.set_title(\"Survival vs Siblings/Spouses Aboard\")\n",
    "p_y = ax.set_ylabel(\"Survival Rate\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# survival rate for each value of the Parch feature\n",
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "# bar height is the mean of Survived for each Parch value\n",
    "# NOTE(review): `ci` is deprecated from seaborn 0.12 in favour of `errorbar=None`; kept because the\n",
    "# seaborn version this notebook targets is not visible here - confirm before changing\n",
    "p_plt = sns.barplot(x=\"Parch\", y=\"Survived\", data=k_trn, ax=ax, ci=None)\n",
    "# title and y label are set *after* plotting: seaborn's categorical plots write the variable\n",
    "# names onto the axes, and some releases replace a label that was set beforehand\n",
    "p_ttl = ax.set_title(\"Survival vs Parents/Children Aboard\")\n",
    "p_y = ax.set_ylabel(\"Survival Rate\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# passenger counts for each SibSp value, with one bar per Survived value\n",
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "p_p = sns.countplot(data=k_trn, x=\"SibSp\", hue=\"Survived\", ax=ax)\n",
    "p_ttl = ax.set_title(\"Survival Count by Siblings/Spouses Onboard\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# passenger counts for each Parch value, with one bar per Survived value\n",
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "p_p = sns.countplot(data=k_trn, x=\"Parch\", hue=\"Survived\", ax=ax)\n",
    "p_ttl = ax.set_title(\"Survival Count by Parents/Children Onboard\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# derive a \"family size\" feature: Parch + SibSp, plus one for the passenger themselves\n",
    "# note: this adds the column to k_trn itself; re-running the cell recomputes the same values\n",
    "k_trn[\"FamilySize\"] = k_trn[\"Parch\"].add(k_trn[\"SibSp\"]).add(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# passenger counts for each FamilySize value, with one bar per Survived value\n",
    "# requires the FamilySize column to have been added to k_trn already\n",
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "p_p = sns.countplot(data=k_trn, x=\"FamilySize\", hue=\"Survived\", ax=ax)\n",
    "p_ttl = ax.set_title(\"Survival Count by Family Size Onboard\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# quick summary of the Cabin feature via Series.describe()\n",
    "k_trn[\"Cabin\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# quick summary of the Ticket feature via Series.describe()\n",
    "k_trn[\"Ticket\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# quick summary of the Name feature via Series.describe()\n",
    "k_trn[\"Name\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}