{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Titanic: fare and family features vs. survival\n",
    "\n",
    "Exploratory plots of the Kaggle Titanic training data: how `Fare`, `SibSp`, `Parch` and a derived `FamilySize` relate to `Survived`, followed by a quick look at the `Cabin`, `Ticket` and `Name` columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt  # for plotting\n",
    "import seaborn as sns  # for plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up some notebook display defaults\n",
    "# \"all\" makes the notebook display every bare expression in a cell, not just\n",
    "# the last one; the plotting cells below assign call results to p_* names so\n",
    "# that only the figure is shown\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "%matplotlib inline\n",
    "plt.style.use('default')\n",
    "sns.set()\n",
    "pd.options.display.float_format = '{:,.2f}'.format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# paths to datasets\n",
    "kaggle_trn = \"./data/titanic/train.csv\"\n",
    "kaggle_tst = \"./data/titanic/test.csv\"\n",
    "rek_k_tst2 = \"./data/titanic/rek_test_2.csv\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the datasets currently of interest\n",
    "k_trn = pd.read_csv(kaggle_trn)\n",
    "k_tst = pd.read_csv(rek_k_tst2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a wee reminder\n",
    "k_trn.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fare\n",
    "\n",
    "Did the fare paid influence the survival rate?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# did the fare paid influence the survival rate\n",
    "# let's look at the fare feature info\n",
    "k_trn.Fare.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "p_ttl = ax.set_title(\"Fare Distribution\")\n",
    "p_plt = sns.histplot(k_trn[\"Fare\"], ax=ax, bins=50, kde=True, color='b')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# categorize fares and plot against survival - let's just use the quartiles for now\n",
    "f_cats = ['Lowest', \"Medium\", \"High\", \"Highest\"]\n",
    "# one quantile bin per label, so len(f_cats) == 4 gives quartiles\n",
    "f_rngs = pd.qcut(k_trn[\"Fare\"], len(f_cats), labels=f_cats)\n",
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "p_ttl = ax.set_title(\"Survival vs Fare\")\n",
    "p_y = ax.set_ylabel(\"Survival Rate\")\n",
    "p_plt = sns.barplot(x=f_rngs, y=k_trn.Survived, ax=ax, ci=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's add gender to the equation\n",
    "fig, ax = plt.subplots(figsize=(20, 8))\n",
    "ax.grid(True)\n",
    "p_ttl = ax.set_title(\"Survival by Fare and Gender\")\n",
    "# x is the categorical Sex axis, so tick placement is left to seaborn;\n",
    "# numeric tick positions would not correspond to the categories\n",
    "p_p2 = sns.swarmplot(y=\"Fare\", x=\"Sex\", data=k_trn, hue=\"Survived\", size=5, ax=ax)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Family aboard: `SibSp` and `Parch`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's have a look at SibSp and Parch features\n",
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "p_ttl = ax.set_title(\"Survival vs Siblings/Spouses Aboard\")\n",
    "p_y = ax.set_ylabel(\"Survival Rate\")\n",
    "p_plt = sns.barplot(x=\"SibSp\", y=\"Survived\", data=k_trn, ax=ax, ci=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "p_ttl = ax.set_title(\"Survival vs Parents/Children Aboard\")\n",
    "p_y = ax.set_ylabel(\"Survival Rate\")\n",
    "p_plt = sns.barplot(x=\"Parch\", y=\"Survived\", data=k_trn, ax=ax, ci=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "p_ttl = ax.set_title(\"Survival Count by Siblings/Spouses Onboard\")\n",
    "p_p = sns.countplot(x=\"SibSp\", data=k_trn, ax=ax, hue=\"Survived\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "p_ttl = ax.set_title(\"Survival Count by Parents/Children Onboard\")\n",
    "p_p = sns.countplot(x=\"Parch\", data=k_trn, ax=ax, hue=\"Survived\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Family size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# what if we combine SibSp and Parch to get \"family size\", adding one for the passenger themselves\n",
    "# (adds a column to k_trn in place; re-running this cell gives the same result)\n",
    "k_trn['FamilySize'] = k_trn['Parch'] + k_trn['SibSp'] + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(5, 5))\n",
    "p_ttl = ax.set_title(\"Survival Count by Family Size Onboard\")\n",
    "p_p = sns.countplot(x=\"FamilySize\", data=k_trn, ax=ax, hue=\"Survived\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Other columns: `Cabin`, `Ticket`, `Name`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[\"Cabin\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[\"Ticket\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[\"Name\"].describe()"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}