{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt  # for plotting\n",
    "import seaborn as sns  # for plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up some notebook display defaults\n",
    "# NOTE: with \"all\", every top-level expression in a cell is displayed, not just the last one;\n",
    "# results we do not want echoed (titles, axes, ...) are therefore assigned to a name\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "%matplotlib inline\n",
    "plt.style.use('default')\n",
    "sns.set_theme()\n",
    "pd.options.display.float_format = '{:,.2f}'.format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# paths to datasets\n",
    "kaggle_trn = \"./data/titanic/train.csv\"\n",
    "kaggle_tst = \"./data/titanic/test.csv\"\n",
    "rek_k_tst2 = \"./data/titanic/rek_test_2.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the datasets currently of interest\n",
    "# (the test frame comes from rek_k_tst2, not from kaggle_tst)\n",
    "k_trn = pd.read_csv(kaggle_trn)\n",
    "k_tst = pd.read_csv(rek_k_tst2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# looks to be missing data in the training dataset, what about the test dataset\n",
    "fig, ax = plt.subplots(figsize=(5,5))\n",
    "# the heatmap below is drawn from k_tst, so the title names the test dataset\n",
    "o_ttl = ax.set_title(\"Test Dataset: Missing Values\")\n",
    "o_map = sns.heatmap(k_tst.isnull(), cbar=False, cmap=\"rocket_r\", ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# gender vs survival\n",
    "k_trn.groupby('Sex').Survived.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_survival_counts(df, column, figsize=(5, 5)):\n",
    "    \"\"\"Stacked bar chart of survivor / non-survivor counts per value of `column`.\n",
    "\n",
    "    Rows of `df` are split on `Survived == 1` and `Survived == 0`; each bar\n",
    "    ('Survived', 'Did Not') is stacked by the distinct values of `column`.\n",
    "    Returns the matplotlib Axes produced by DataFrame.plot.\n",
    "    \"\"\"\n",
    "    srvvd = df.loc[df[\"Survived\"] == 1, column].value_counts()\n",
    "    not_s = df.loc[df[\"Survived\"] == 0, column].value_counts()\n",
    "    df_s_no = pd.DataFrame([srvvd, not_s])\n",
    "    df_s_no.index = ['Survived', 'Did Not']\n",
    "    return df_s_no.plot(kind='bar', stacked=True, figsize=figsize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# or visually\n",
    "ax = plot_survival_counts(k_trn, \"Sex\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# age vs survival\n",
    "fig, ax = plt.subplots(figsize=(5,5))\n",
    "p_ttl = ax.set_title(\"Age Distribution\")\n",
    "p_plt = sns.histplot(k_trn[\"Age\"], ax=ax, bins=30, kde=True, color='b')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(5,5))\n",
    "p_ttl = ax.set_title(\"Age Distribution by Survival Status\")\n",
    "p_x = ax.set_xlabel(\"Age\")\n",
    "# fill=True replaces the deprecated shade=True keyword of kdeplot\n",
    "p_p1 = sns.kdeplot(k_trn.loc[k_trn[\"Survived\"] == 1, \"Age\"], ax=ax, label='Survived', fill=True)\n",
    "p_p2 = sns.kdeplot(k_trn.loc[k_trn[\"Survived\"] == 0, \"Age\"], ax=ax, label='Did Not', fill=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# not much info, try something else\n",
    "fig, ax = plt.subplots(figsize=(15,5))\n",
    "ax.grid(True)\n",
    "p_ttl = ax.set_title(\"Survival by Age and Gender\")\n",
    "# Age is on the x-axis: one tick every 2 years\n",
    "p_xtk = ax.set_xticks(list(range(0,100,2)))\n",
    "p_p2 = sns.swarmplot(y=\"Sex\", x=\"Age\", data=k_trn, hue=\"Survived\", ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's checkout Pclass\n",
    "k_trn.groupby(['Pclass']).Survived.mean().to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# how about a chart\n",
    "ax = plot_survival_counts(k_trn, \"Pclass\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# to help understand the above, let's look at the numbers\n",
    "pd.pivot_table(k_trn, index='Survived', columns='Pclass', values='Ticket', aggfunc='count')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn.groupby(['Pclass', 'Sex']).Survived.mean().to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's also look at Pclass vs Age\n",
    "# yes another function needed\n",
    "fig, ax = plt.subplots(figsize=(15,5))\n",
    "ax.grid(True)\n",
    "p_ttl = ax.set_title(\"Survival by Age and Pclass\")\n",
    "# Age is on the y-axis in this plot (x is the categorical Pclass axis), so the\n",
    "# age ticks go on y; step 5 rather than 2 because the figure is only 5 inches tall\n",
    "p_ytk = ax.set_yticks(list(range(0,100,5)))\n",
    "p_p2 = sns.swarmplot(y=\"Age\", x=\"Pclass\", data=k_trn, hue=\"Survived\", ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# how about point of embarkation\n",
    "k_trn['Embarked'].value_counts().to_frame()\n",
    "pd.pivot_table(k_trn, index=\"Survived\", columns=\"Embarked\", values=\"Ticket\", aggfunc=\"count\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# why high survival rate at Cherbourg and lowest at Southampton\n",
    "# due to numbers of first class and/or third class passengers embarking at those points?\n",
    "def count_zeros(s):\n",
    "    \"\"\"Return how many entries of the Series `s` are equal to 0.\"\"\"\n",
    "    return (s == 0).sum()\n",
    "\n",
    "k_trn.groupby(['Embarked', 'Pclass']).agg(\n",
    "    Survived=pd.NamedAgg(column='Survived', aggfunc='sum'),\n",
    "    DidNot=pd.NamedAgg(column='Survived', aggfunc=count_zeros)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# did gender balance make difference in survival rates Queenstown vs Southampton\n",
    "fig, ax = plt.subplots(figsize=(5,5))\n",
    "# bars are split by Sex (hue), so the title says Gender\n",
    "p_ttl = ax.set_title(\"Count by Embarkation Point and Gender\")\n",
    "p_p = sns.countplot(x=\"Embarked\", data=k_trn, ax=ax, hue=\"Sex\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}