{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "import math\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt # for plotting \n",
    "import seaborn as sns # for plotting\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.impute import SimpleImputer, MissingIndicator\n",
    "from sklearn.experimental import enable_iterative_imputer\n",
    "from sklearn.impute import IterativeImputer\n",
    "from sklearn.pipeline import FeatureUnion, make_pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up some notebook display defaults\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "%matplotlib inline\n",
    "plt.style.use('default')\n",
    "sns.set()\n",
    "pd.options.display.float_format = '{:,.2f}'.format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# paths to datasets\n",
    "kaggle_trn = \"./data/titanic/train.csv\"\n",
    "kaggle_tst = \"./data/titanic/test.csv\"\n",
    "oma_trn_3 = \"./data/titanic/oma_trn_3.csv\"\n",
    "oma_tst_3 = \"./data/titanic/oma_tst_3.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the datasets currently of interest\n",
    "k_trn = pd.read_csv(oma_trn_3)\n",
    "k_tst = pd.read_csv(oma_tst_3)\n",
    "k_all = k_trn.copy()\n",
    "k_all = pd.concat([k_all, k_tst], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's bin the age data and have a look\n",
    "k_all['AgeRng'] = pd.cut(k_all['Age'], bins=range(0, 90, 5))\n",
    "sns.set(rc={'figure.figsize':(12,8)})\n",
    "sns.set(font_scale=1.0)\n",
    "# plt.style.use('seaborn-whitegrid')\n",
    "g = sns.barplot(x='AgeRng', y='Survived', data=k_all)\n",
    "\n",
    "table = pd.crosstab(k_all['AgeRng'], k_all['Survived'])\n",
    "print('\\n', table)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 0-15: survival pretty good\n",
    "- 30-40: looks to be a slight increase in survival rate\n",
    "- 60+: survival rate looks to decline\n",
    "\n",
    "So, looking at using following ranges: ['0-15', '16-29', '30-40', '41-59', '60+']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bin_thresholds = [0, 15, 30, 40, 59, 90]\n",
    "bin_labels = ['0-15', '16-29', '30-40', '41-59', '60+']\n",
    "k_trn['AgeBin'] = pd.cut(k_trn['Age'], bins=bin_thresholds, labels=bin_labels)\n",
    "k_tst['AgeBin'] = pd.cut(k_tst['Age'], bins=bin_thresholds, labels=bin_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_tst.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn.to_csv(oma_trn_3, index=False)\n",
    "k_tst.to_csv(oma_tst_3, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}