{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt # for plotting \n",
    "import seaborn as sns # for plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up some notebook display defaults\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "%matplotlib inline\n",
    "plt.style.use('default')\n",
    "sns.set()\n",
    "pd.options.display.float_format = '{:,.2f}'.format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# paths to datasets\n",
    "kaggle_trn = \"./data/titanic/train.csv\"\n",
    "kaggle_tst = \"./data/titanic/test.csv\"\n",
    "rek_k_tst2 = \"./data/titanic/rek_test_2.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the datasets currently of interest\n",
    "k_trn = pd.read_csv(kaggle_trn)\n",
    "k_tst = pd.read_csv(rek_k_tst2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(k_trn.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# looks to be missing data in the training dataset, what about the test dataset\n",
    "#sns.color_palette(\"Set2\")\n",
    "fig, ax = plt.subplots(figsize=(5,5))\n",
    "o_ttl = plt.title(\"Training Dataset: Missing Values\")\n",
    "o_map = sns.heatmap(k_tst.isnull(), cbar=False, cmap=\"rocket_r\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# gender vs survival\n",
    "k_trn.groupby('Sex').Survived.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# or visually\n",
    "srvvd = k_trn[k_trn[\"Survived\"] == 1][\"Sex\"].value_counts()\n",
    "not_s = k_trn[k_trn[\"Survived\"] == 0][\"Sex\"].value_counts()\n",
    "df_s_no = pd.DataFrame([srvvd, not_s])\n",
    "df_s_no.index = ['Survived', 'Did Not']\n",
    "ax = df_s_no.plot(kind='bar', stacked=True, figsize=(5,5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# age vs survival\n",
    "fig, ax = plt.subplots(figsize=(5,5))\n",
    "p_ttl = ax.set_title(\"Age Distribution\")\n",
    "p_plt = sns.histplot(k_trn[\"Age\"], ax=ax, bins=30, kde=True, color='b')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(5,5))\n",
    "p_ttl = ax.set_title(\"Age Distribution by Survival Status\")\n",
    "p_x = ax.set_xlabel(\"Age\")\n",
    "p_p1 = sns.kdeplot(k_trn[\"Age\"].loc[k_trn[\"Survived\"] == 1], ax=ax, label='Survived', shade=True)\n",
    "p_p2 = sns.kdeplot(k_trn[\"Age\"].loc[k_trn[\"Survived\"] == 0], ax=ax, label='Did Not', shade=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# not much info, try something else\n",
    "fig, ax = plt.subplots(figsize=(15,5))\n",
    "ax.grid(True)\n",
    "p_ttl = ax.set_title(\"Survival by Age and Gender\")\n",
    "p_xtk = plt.xticks(list(range(0,100,2)))\n",
    "p_p2 = sns.swarmplot(y=\"Sex\", x=\"Age\", data=k_trn, hue=\"Survived\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's checkout Pclass\n",
    "k_trn.groupby(['Pclass']).Survived.mean().to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# how about a chart\n",
    "# yes, I know, should have created a function\n",
    "srvvd = k_trn[k_trn[\"Survived\"] == 1][\"Pclass\"].value_counts()\n",
    "not_s = k_trn[k_trn[\"Survived\"] == 0][\"Pclass\"].value_counts()\n",
    "df_s_no = pd.DataFrame([srvvd, not_s])\n",
    "df_s_no.index = ['Survived', 'Did Not']\n",
    "ax = df_s_no.plot(kind='bar', stacked=True, figsize=(5,5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# to help understand the above, let's look at the numbers\n",
    "pd.pivot_table(k_trn, index='Survived', columns='Pclass', values='Ticket', aggfunc='count')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn.groupby(['Pclass', 'Sex']).Survived.mean().to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's also look at Pclass vs Age\n",
    "# yes another function needed\n",
    "fig, ax = plt.subplots(figsize=(15,5))\n",
    "ax.grid(True)\n",
    "p_ttl = ax.set_title(\"Survival by Age and Pclass\")\n",
    "p_xtk = plt.xticks(list(range(0,100,2)))\n",
    "p_p2 = sns.swarmplot(y=\"Age\", x=\"Pclass\", data=k_trn, hue=\"Survived\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# how about point of embarkation\n",
    "k_trn['Embarked'].value_counts().to_frame()\n",
    "pd.pivot_table(k_trn, index=\"Survived\", columns=\"Embarked\", values=\"Ticket\", aggfunc=\"count\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# why high survival rate at Cherbourg and lowest at Southampton\n",
    "# due to numbers of first class and/or third class passengers embarking at those points?\n",
    "#k_trn.groupby(['Embarked', 'Pclass']).Survived.sum().to_frame()\n",
    "def count_zeros(s):\n",
    "  return s.size - s.sum()\n",
    "k_trn.groupby(['Embarked', 'Pclass']).agg(\n",
    "  Survived=pd.NamedAgg(column='Survived', aggfunc='sum'),\n",
    "  DidNot=pd.NamedAgg(column='Survived', aggfunc=count_zeros)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# did gender balance make difference in survival rates Queenstown vs Southampton\n",
    "fig, ax = plt.subplots(figsize=(5,5))\n",
    "p_ttl = ax.set_title(\"Count by Embarcation Point and Class\")\n",
    "p_p = sns.countplot(x=\"Embarked\", data=k_trn, ax=ax, hue=\"Sex\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}