{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "import math\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt # for plotting \n",
    "import seaborn as sns # for plotting\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.experimental import enable_iterative_imputer\n",
    "from sklearn.impute import IterativeImputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# notebook-wide display settings\n",
    "%matplotlib inline\n",
    "# echo every bare expression in a cell, not only the last one\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "# show floats with thousands separators and two decimals\n",
    "pd.set_option('display.float_format', '{:,.2f}'.format)\n",
    "# start from matplotlib's default style, then apply seaborn's defaults on top\n",
    "plt.style.use('default')\n",
    "sns.set()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSV locations; every file lives under the same data directory\n",
    "data_dir = \"./data/titanic\"\n",
    "kaggle_trn = f\"{data_dir}/train.csv\"\n",
    "kaggle_tst = f\"{data_dir}/test.csv\"\n",
    "rek_k_tst2 = f\"{data_dir}/rek_test_2.csv\"\n",
    "oma_trn = f\"{data_dir}/oma_trn.csv\"\n",
    "oma_tst = f\"{data_dir}/oma_tst.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read the two datasets used below: the Kaggle training file and the rek_test_2 test file\n",
    "k_trn, k_tst = (pd.read_csv(csv_path) for csv_path in (kaggle_trn, rek_k_tst2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reminder of the training set's columns, dtypes and non-null counts\n",
    "k_trn.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the per-column non-null counts show which test-set columns have missing values\n",
    "k_tst.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training rows with no port of embarkation: show them, then fill them in.\n",
    "no_port = k_trn['Embarked'].isna()\n",
    "display(k_trn[no_port])\n",
    "# According to Encyclopedia Titanica both passengers boarded at Southampton:\n",
    "# https://www.encyclopedia-titanica.org/titanic-survivor/amelia-icard.html\n",
    "# https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html\n",
    "k_trn.loc[no_port, 'Embarked'] = 'S'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the test-set row(s) that have no Fare.\n",
    "display(k_tst[k_tst['Fare'].isna()])\n",
    "# Estimate the fare as the mean training fare of 3rd-class passengers travelling\n",
    "# without SibSp or Parch. Only approximate: fares paid on multi-passenger tickets\n",
    "# are not excluded from the mean.\n",
    "solo_3rd = (k_trn['Pclass'] == 3) & (k_trn['SibSp'] == 0) & (k_trn['Parch'] == 0)\n",
    "avg_fare_3 = k_trn.loc[solo_3rd, 'Fare'].mean()\n",
    "print(avg_fare_3)\n",
    "# the estimate is written to passenger 1044 only\n",
    "mask = k_tst['PassengerId'] == 1044\n",
    "k_tst.loc[mask, 'Fare'] = avg_fare_3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist both lightly-fixed datasets so the corrections above need not be repeated.\n",
    "# Imputed ages are deliberately not saved: several imputation methods may still be tried.\n",
    "k_trn.to_csv(oma_trn, index=False)\n",
    "# the test file is written without its TName column\n",
    "dTN_tst = k_tst.drop(\"TName\", axis=1)\n",
    "dTN_tst.to_csv(oma_tst, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, let's sort the missing age values. There are a few options. So, I am currently thinking of trying a couple of them and seeing how the model used earlier performs with the differing value estimates.\n",
    "\n",
    "But first, let's write a function to do the testing on our various approaches to modifying the datasets for the missing age values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def trn_tst(X_trn, y_trn, X_tst):\n",
    "  \"\"\"Fit a fixed-configuration random forest and predict for the test features.\n",
    "\n",
    "  X_trn, y_trn: training features and labels passed to fit().\n",
    "  X_tst: features to predict for.\n",
    "  Returns the output of predict(X_tst); scoring is left to the caller.\n",
    "  \"\"\"\n",
    "  # the same hyper-parameters and seed are used for every imputation experiment\n",
    "  forest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)\n",
    "  return forest.fit(X_trn, y_trn).predict(X_tst)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# raw feature columns, and the column names expected after one-hot encoding them\n",
    "r_feats = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Age', 'Embarked', 'Fare']\n",
    "d_feats = [\n",
    "  'Pclass', 'SibSp', 'Parch', 'Age', 'Fare',\n",
    "  'Sex_female', 'Sex_male',\n",
    "  'Embarked_C', 'Embarked_Q', 'Embarked_S',\n",
    "]\n",
    "# one-hot encode the selected features of both datasets; missing ages are still NaN here\n",
    "Xd_age = pd.get_dummies(k_trn[r_feats])\n",
    "Xd_t_age = pd.get_dummies(k_tst[r_feats])\n",
    "# training labels\n",
    "y_trn = k_trn['Survived']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# base case: SimpleImputer with its default strategy (column mean) fills the NaNs\n",
    "imputer = SimpleImputer(missing_values=np.nan)\n",
    "# fit on the training features only, then apply the same fill values to the test features\n",
    "X_age = pd.DataFrame(imputer.fit_transform(Xd_age), columns=d_feats)\n",
    "X_t_age = pd.DataFrame(imputer.transform(Xd_t_age), columns=d_feats)\n",
    "t_age_pred = trn_tst(X_age, y_trn, X_t_age)\n",
    "acc_simple = accuracy_score(k_tst[\"Survived\"], t_age_pred)\n",
    "print(f'model accuracy: {acc_simple}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# second approach: IterativeImputer estimates each missing value from the other features\n",
    "imputer_2 = IterativeImputer(missing_values=np.nan, max_iter=25)\n",
    "# Wrap the iterative imputer's OWN output. Wrapping X_age / X_t_age here would\n",
    "# re-use the SimpleImputer frames and make both experiments score identically.\n",
    "X_age_2 = pd.DataFrame(imputer_2.fit_transform(Xd_age), columns=d_feats)\n",
    "X_t_age_2 = pd.DataFrame(imputer_2.transform(Xd_t_age), columns=d_feats)\n",
    "t_age_pred_2 = trn_tst(X_age_2, y_trn, X_t_age_2)\n",
    "print(f'model accuracy: {accuracy_score(k_tst[\"Survived\"], t_age_pred_2)}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "I was surprised that the Simple and Iterative imputers both produced the same result. May have to dig deeper."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# put the two imputed training Age columns side by side\n",
    "imps = X_age_2['Age'].rename('Iterative').to_frame()\n",
    "imps2 = pd.concat([imps, X_age['Age']], axis=1)\n",
    "ages_match = imps2['Iterative'] == imps2['Age']\n",
    "# number of rows where both imputers agree, followed by the rows where they differ\n",
    "imps2[ages_match].count()\n",
    "imps2[~ages_match]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, let's look at imputing with our own code and approach."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# correlation heatmap of Age against the candidate grouping features\n",
    "sns.set(font_scale=1.5)\n",
    "feature_list = ['Pclass', 'Age','Sex_female','Sex_male', 'SibSp', 'Parch']\n",
    "age_corr = Xd_age[feature_list].corr()\n",
    "# create the figure first so the heatmap is drawn onto it\n",
    "hm = plt.figure(figsize=(10, 6))\n",
    "g = sns.heatmap(age_corr, annot=True, fmt='.2f', cmap='coolwarm', square=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and later\n",
    "# dropped the old names, so use whichever spelling this installation provides.\n",
    "grid_styles = ('seaborn-whitegrid', 'seaborn-v0_8-whitegrid')\n",
    "plt.style.use(next((s for s in grid_styles if s in plt.style.available), 'default'))\n",
    "\n",
    "# Age distribution against each candidate grouping feature, one panel per feature\n",
    "fig = plt.figure(figsize=(14, 5))\n",
    "fig.subplots_adjust(wspace=0.3, hspace=0.5)\n",
    "for panel, feature in enumerate(['Pclass', 'Parch', 'SibSp'], start=1):\n",
    "  ax = fig.add_subplot(1, 3, panel)\n",
    "  ax = sns.boxplot(x=feature, y='Age', data=k_trn, ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# median training age for every (Pclass, Parch, SibSp) combination\n",
    "median_ages = k_trn.groupby(['Pclass', 'Parch', 'SibSp'], as_index=False)['Age'].median()\n",
    "# spot-check the lookup for one combination\n",
    "pc, pa, ss = 3, 2, 3\n",
    "median_ages.query('Pclass == @pc and Parch == @pa and SibSp == @ss')['Age'].item()\n",
    "# combinations whose median is NaN need a fallback age later on\n",
    "print(f\"Number of missing Age values: {median_ages['Age'].isnull().sum()}\")\n",
    "median_ages.query('Pclass == 3 and Parch == 2 and SibSp == 8')['Age'].item()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# show the training rows in the (Pclass=3, Parch=2, SibSp=8) group\n",
    "k_trn[(k_trn['Pclass']==3) & (k_trn['Parch']==2) & (k_trn['SibSp']==8)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fill_age_from_medians(features, medians, keys, fallback):\n",
    "  \"\"\"Return a copy of features whose missing Age values are filled from group medians.\n",
    "\n",
    "  features: DataFrame holding an Age column plus every column named in keys.\n",
    "  medians:  DataFrame with one row per key combination and that group's median Age\n",
    "            (NaN when no member of the group has an age).\n",
    "  keys:     list of column names that identify a group in both frames.\n",
    "  fallback: age used when the combination is absent from medians or its median is NaN.\n",
    "\n",
    "  Estimates are rounded up to whole years. The input frame is never modified, so\n",
    "  every imputation experiment can start from the same un-imputed features.\n",
    "  \"\"\"\n",
    "  # a left merge keeps exactly one row per feature row, in the original order\n",
    "  looked_up = features[keys].merge(medians[keys + ['Age']], on=keys, how='left')['Age']\n",
    "  estimates = np.ceil(looked_up).fillna(fallback).to_numpy()\n",
    "  missing = features['Age'].isnull().to_numpy()\n",
    "  filled = features.copy()\n",
    "  filled.loc[missing, 'Age'] = estimates[missing]\n",
    "  return filled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# third approach: median age of the passenger's (Pclass, Parch, SibSp) group\n",
    "age_keys_3 = ['Pclass', 'Parch', 'SibSp']\n",
    "# fallback age when no usable group median exists: 25 for training rows, 15 for test rows\n",
    "X_age_3 = fill_age_from_medians(Xd_age, median_ages, age_keys_3, fallback=25)\n",
    "X_t_age_3 = fill_age_from_medians(Xd_t_age, median_ages, age_keys_3, fallback=15)\n",
    "\n",
    "print(f\"Number of missing Age training values: {X_age_3['Age'].isnull().sum()}\")\n",
    "print(f\"Number of missing Age test values: {X_t_age_3['Age'].isnull().sum()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t_age_pred_3 = trn_tst(X_age_3, y_trn, X_t_age_3)\n",
    "print(f'model accuracy: {accuracy_score(k_tst[\"Survived\"], t_age_pred_3)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "median_ages_2 = k_trn.groupby(['Pclass', 'SibSp'], as_index=False).Age.median()\n",
    "# median_ages.head()\n",
    "pc = 3\n",
    "ss = 3\n",
    "median_ages_2[(median_ages_2['Pclass']==pc) & (median_ages_2['SibSp']==ss)].Age.item()\n",
    "print(f\"Number of missing Age values: {median_ages_2['Age'].isnull().sum()}\")\n",
    "median_ages_2[(median_ages_2['Pclass']==3) & (median_ages_2['SibSp']==8)].Age.item()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fourth approach: group on (Pclass, SibSp) only, starting again from the un-imputed features\n",
    "age_keys_4 = ['Pclass', 'SibSp']\n",
    "# same fallback ages as the third approach: 25 for training rows, 15 for test rows\n",
    "X_age_4 = fill_age_from_medians(Xd_age, median_ages_2, age_keys_4, fallback=25)\n",
    "X_t_age_4 = fill_age_from_medians(Xd_t_age, median_ages_2, age_keys_4, fallback=15)\n",
    "\n",
    "print(f\"Number of missing Age training values: {X_age_4['Age'].isnull().sum()}\")\n",
    "print(f\"Number of missing Age test values: {X_t_age_4['Age'].isnull().sum()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# score the model trained on the (Pclass, SibSp) median-imputed features\n",
    "t_age_pred_4 = trn_tst(X_age_4, y_trn, X_t_age_4)\n",
    "acc_4 = accuracy_score(k_tst[\"Survived\"], t_age_pred_4)\n",
    "print(f'model accuracy: {acc_4}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}