{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "import math\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt # for plotting \n",
    "import seaborn as sns # for plotting\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.experimental import enable_iterative_imputer\n",
    "from sklearn.impute import IterativeImputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up some notebook display defaults\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "%matplotlib inline\n",
    "plt.style.use('default')\n",
    "sns.set()\n",
    "pd.options.display.float_format = '{:,.2f}'.format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# paths to datasets\n",
    "kaggle_trn = \"./data/titanic/train.csv\"\n",
    "kaggle_tst = \"./data/titanic/test.csv\"\n",
    "rek_k_tst2 = \"./data/titanic/rek_test_2.csv\"\n",
    "oma_trn = \"./data/titanic/oma_trn.csv\"\n",
    "oma_tst = \"./data/titanic/oma_tst.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the datasets currently of interest\n",
    "k_trn = pd.read_csv(oma_trn)\n",
    "k_tst = pd.read_csv(oma_tst)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn['Ticket'] == 'LINE')]\n",
    "k_tst[(k_tst['Ticket'] == 'LINE') | (k_tst['Ticket'] == '3701') | (k_tst['Ticket'] == '392095')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn['Fare'] == 0.0)]\n",
    "k_tst[(k_tst['Fare'] == 0.0)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn[\"Ticket\"].str.contains('2398'))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn[\"Ticket\"].str.contains('11205'))]\n",
    "k_tst[(k_tst[\"Ticket\"].str.contains('11205'))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn[\"Ticket\"].str.contains('1997'))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# that didn't do much, let's try '199'\n",
    "k_trn[(k_trn[\"Ticket\"].str.startswith('199')) & (k_trn[\"Pclass\"] == 1) & (k_trn[\"SibSp\"] == 0) & (k_trn[\"Parch\"] == 0)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's dig a little further into that group with tickets'11205..'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn[\"Cabin\"].str.startswith('B')) & (k_trn[\"Pclass\"] == 1) & (k_trn[\"SibSp\"] == 0) & (k_trn[\"Parch\"] == 0)].sort_values(['Ticket', 'Cabin'])\n",
    "k_tst[(k_tst[\"Cabin\"].str.startswith('B')) & (k_tst[\"Pclass\"] == 1) & (k_tst[\"SibSp\"] == 0) & (k_tst[\"Parch\"] == 0)].sort_values(['Ticket', 'Cabin'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(~k_trn[\"Cabin\"].isnull()) & (k_trn[\"Cabin\"].str.contains(' '))].sort_values(['Ticket','Cabin'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn[\"Pclass\"] == 1) & (k_trn[\"Fare\"] > 0)].sort_values(['Fare'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn[\"Pclass\"] == 1) & (k_trn[\"Fare\"] > 0) & (k_trn[\"Cabin\"].str.startswith('B'))].sort_values(['Fare']).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(~k_trn[\"Cabin\"].isnull()) & (k_trn[\"Cabin\"].str.contains('B9'))].sort_values(['Ticket','Cabin'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# based on discussion in related post, let's update the zero fares\n",
    "# start with the Storey group\n",
    "g1a = list(k_trn[(k_trn['Ticket'] == 'LINE')].index)\n",
    "g1b = list(k_tst[(k_tst['Ticket'] == 'LINE') | (k_tst['Ticket'] == '3701')].index)\n",
    "print(g1a, g1b)\n",
    "#print(k_tst.iloc[g1b[0]])\n",
    "nbr_p = len(g1a) + len(g1b)\n",
    "base_f = 7.25\n",
    "grp_f = base_f * nbr_p\n",
    "def_t = \"370160\"\n",
    "for i in g1a:\n",
    "  k_trn.loc[i, \"Ticket\"] = def_t\n",
    "  k_trn.loc[i, \"Fare\"] = grp_f\n",
    "for i in g1b:\n",
    "  k_tst.loc[i, \"Ticket\"] = def_t\n",
    "  k_tst.loc[i, \"Fare\"] = grp_f\n",
    "k_trn[k_trn[\"Ticket\"] == def_t]\n",
    "k_tst[k_tst[\"Ticket\"] == def_t]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now the group with tickets beginning with 2398\n",
    "g2 = list(k_trn[(k_trn[\"Ticket\"].str.contains('2398')) & (k_trn[\"Fare\"] == 0.0)].index)\n",
    "print(g2)\n",
    "base_f = 13.00\n",
    "g3_f = base_f * 3\n",
    "g3_t = '239853'\n",
    "for i in g2:\n",
    "  if k_trn.iloc[i].Ticket == g3_t:\n",
    "    k_trn.loc[i, \"Fare\"] = g3_f\n",
    "  else:\n",
    "    k_trn.loc[i, \"Fare\"] = base_f\n",
    "k_trn[(k_trn[\"Ticket\"].str.contains('2398'))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now Mr. Reuchlin’s case\n",
    "mask = k_trn['Name'].str.startswith(\"Reuchlin,\")\n",
    "k_trn.loc[mask, 'Fare'] = 30.5\n",
    "k_trn[mask]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# and our last group including the head of the White Star Line\n",
    "boss = k_tst['Name'].str.startswith(\"Ismay,\")\n",
    "valet = k_trn['Name'].str.startswith(\"Fry,\")\n",
    "secretary = k_trn['Ticket'].str.startswith(\"112059\")\n",
    "others1 = list(k_trn[k_trn[\"Ticket\"].isin(['112052', '112050'])].index)\n",
    "others2 = list(k_tst[k_tst[\"Ticket\"] == \"112051\"].index)\n",
    "k_tst.loc[boss, \"Fare\"] = 512.33\n",
    "k_trn.loc[valet, \"Ticket\"] = \"112058A\"\n",
    "k_trn.loc[valet, \"Fare\"] = 30.00\n",
    "k_trn.loc[secretary, \"Fare\"] = 60.00\n",
    "for i in others1:\n",
    "  k_trn.loc[i, \"Fare\"] = 30.00\n",
    "for i in others2:\n",
    "  k_tst.loc[i, \"Fare\"] = 30.00\n",
    "k_trn[(k_trn[\"Ticket\"].str.contains('11205'))]\n",
    "k_tst[(k_tst[\"Ticket\"].str.contains('11205'))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's double check we have no more zero or nan fares\n",
    "k_trn[(k_trn[\"Fare\"].isnull()) | (k_trn[\"Fare\"] == 0.0)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# okay let's save the changes\n",
    "oma_trn_2 = \"./data/titanic/oma_trn_2.csv\"\n",
    "oma_tst_2 = \"./data/titanic/oma_tst_2.csv\"\n",
    "k_trn.to_csv(oma_trn_2, index=False)\n",
    "k_tst.to_csv(oma_tst_2, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's see if adding Fare to our basic set of features improves our score\n",
    "Y = k_trn['Survived']\n",
    "\n",
    "features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']\n",
    "X = pd.get_dummies(k_trn[features])\n",
    "X_test = pd.get_dummies(k_tst[features])\n",
    "model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)\n",
    "model.fit(X, Y)\n",
    "predictions = model.predict(X_test)\n",
    "accuracy_score(k_tst[\"Survived\"], predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}