{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Titanic: repairing zero and missing fares\n",
    "\n",
    "Looks at passengers recorded with a fare of zero, assigns them replacement fares, saves the updated train/test files, and then checks how a basic random forest scores once `Fare` is added to the feature set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "import math\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt # for plotting \n",
    "import seaborn as sns # for plotting\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.experimental import enable_iterative_imputer\n",
    "from sklearn.impute import IterativeImputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up some notebook display defaults\n",
    "# \"all\" makes every bare expression in a cell display, not just the last one\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "%matplotlib inline\n",
    "plt.style.use('default')\n",
    "sns.set()\n",
    "pd.options.display.float_format = '{:,.2f}'.format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# paths to datasets\n",
    "kaggle_trn = \"./data/titanic/train.csv\"\n",
    "kaggle_tst = \"./data/titanic/test.csv\"\n",
    "rek_k_tst2 = \"./data/titanic/rek_test_2.csv\"\n",
    "oma_trn = \"./data/titanic/oma_trn.csv\"\n",
    "oma_tst = \"./data/titanic/oma_tst.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the datasets currently of interest\n",
    "k_trn = pd.read_csv(oma_trn)\n",
    "k_tst = pd.read_csv(oma_tst)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explore the suspicious tickets and fares"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn['Ticket'] == 'LINE')]\n",
    "k_tst[(k_tst['Ticket'] == 'LINE') | (k_tst['Ticket'] == '3701') | (k_tst['Ticket'] == '392095')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn['Fare'] == 0.0)]\n",
    "k_tst[(k_tst['Fare'] == 0.0)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn[\"Ticket\"].str.contains('2398'))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn[\"Ticket\"].str.contains('11205'))]\n",
    "k_tst[(k_tst[\"Ticket\"].str.contains('11205'))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn[\"Ticket\"].str.contains('1997'))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# that didn't do much, let's try '199'\n",
    "k_trn[(k_trn[\"Ticket\"].str.startswith('199')) & (k_trn[\"Pclass\"] == 1) & (k_trn[\"SibSp\"] == 0) & (k_trn[\"Parch\"] == 0)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's dig a little further into that group with tickets '11205..'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# na=False: rows with a missing Cabin count as \"does not start with B\"\n",
    "k_trn[(k_trn[\"Cabin\"].str.startswith('B', na=False)) & (k_trn[\"Pclass\"] == 1) & (k_trn[\"SibSp\"] == 0) & (k_trn[\"Parch\"] == 0)].sort_values(['Ticket', 'Cabin'])\n",
    "k_tst[(k_tst[\"Cabin\"].str.startswith('B', na=False)) & (k_tst[\"Pclass\"] == 1) & (k_tst[\"SibSp\"] == 0) & (k_tst[\"Parch\"] == 0)].sort_values(['Ticket', 'Cabin'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# cabins whose value contains a space; missing cabins are excluded via na=False\n",
    "k_trn[k_trn[\"Cabin\"].str.contains(' ', na=False)].sort_values(['Ticket','Cabin'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn[\"Pclass\"] == 1) & (k_trn[\"Fare\"] > 0)].sort_values(['Fare'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn[\"Pclass\"] == 1) & (k_trn[\"Fare\"] > 0) & (k_trn[\"Cabin\"].str.startswith('B', na=False))].sort_values(['Fare']).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[k_trn[\"Cabin\"].str.contains('B9', na=False)].sort_values(['Ticket','Cabin'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Replace the zero fares"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# based on discussion in related post, let's update the zero fares\n",
    "# start with the Storey group\n",
    "g1a = list(k_trn[(k_trn['Ticket'] == 'LINE')].index)\n",
    "g1b = list(k_tst[(k_tst['Ticket'] == 'LINE') | (k_tst['Ticket'] == '3701')].index)\n",
    "print(g1a, g1b)\n",
    "nbr_p = len(g1a) + len(g1b)\n",
    "base_f = 7.25\n",
    "# every member gets the same ticket and the same fare: base fare times group size\n",
    "grp_f = base_f * nbr_p\n",
    "def_t = \"370160\"\n",
    "# g1a / g1b hold index labels, so they can be handed to .loc in one go\n",
    "k_trn.loc[g1a, \"Ticket\"] = def_t\n",
    "k_trn.loc[g1a, \"Fare\"] = grp_f\n",
    "k_tst.loc[g1b, \"Ticket\"] = def_t\n",
    "k_tst.loc[g1b, \"Fare\"] = grp_f\n",
    "k_trn[k_trn[\"Ticket\"] == def_t]\n",
    "k_tst[k_tst[\"Ticket\"] == def_t]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now the group with tickets beginning with 2398\n",
    "# both masks are computed before any fare is changed, so the two updates below cannot affect each other\n",
    "zero_2398 = (k_trn[\"Ticket\"].str.contains('2398')) & (k_trn[\"Fare\"] == 0.0)\n",
    "g2 = list(k_trn[zero_2398].index)\n",
    "print(g2)\n",
    "base_f = 13.00\n",
    "# NOTE(review): the factor 3 is presumably the number of passengers on ticket 239853 -- confirm\n",
    "g3_f = base_f * 3\n",
    "g3_t = '239853'\n",
    "on_g3_t = k_trn[\"Ticket\"] == g3_t\n",
    "# label-based masks with .loc (not positional .iloc) stay correct whatever the index looks like\n",
    "k_trn.loc[zero_2398 & on_g3_t, \"Fare\"] = g3_f\n",
    "k_trn.loc[zero_2398 & ~on_g3_t, \"Fare\"] = base_f\n",
    "k_trn[(k_trn[\"Ticket\"].str.contains('2398'))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now Mr. Reuchlin’s case\n",
    "mask = k_trn['Name'].str.startswith(\"Reuchlin,\")\n",
    "k_trn.loc[mask, 'Fare'] = 30.5\n",
    "k_trn[mask]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# and our last group including the head of the White Star Line\n",
    "boss = k_tst['Name'].str.startswith(\"Ismay,\")\n",
    "valet = k_trn['Name'].str.startswith(\"Fry,\")\n",
    "secretary = k_trn['Ticket'].str.startswith(\"112059\")\n",
    "others1 = list(k_trn[k_trn[\"Ticket\"].isin(['112052', '112050'])].index)\n",
    "others2 = list(k_tst[k_tst[\"Ticket\"] == \"112051\"].index)\n",
    "k_tst.loc[boss, \"Fare\"] = 512.33\n",
    "k_trn.loc[valet, \"Ticket\"] = \"112058A\"\n",
    "k_trn.loc[valet, \"Fare\"] = 30.00\n",
    "k_trn.loc[secretary, \"Fare\"] = 60.00\n",
    "# others1 / others2 are lists of index labels, so one .loc assignment covers each list\n",
    "k_trn.loc[others1, \"Fare\"] = 30.00\n",
    "k_tst.loc[others2, \"Fare\"] = 30.00\n",
    "k_trn[(k_trn[\"Ticket\"].str.contains('11205'))]\n",
    "k_tst[(k_tst[\"Ticket\"].str.contains('11205'))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's double check we have no more zero or nan fares\n",
    "# both frames are checked: the test set is also fed to the model further down\n",
    "k_trn[(k_trn[\"Fare\"].isnull()) | (k_trn[\"Fare\"] == 0.0)]\n",
    "k_tst[(k_tst[\"Fare\"].isnull()) | (k_tst[\"Fare\"] == 0.0)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the updated datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# okay let's save the changes\n",
    "oma_trn_2 = \"./data/titanic/oma_trn_2.csv\"\n",
    "oma_tst_2 = \"./data/titanic/oma_tst_2.csv\"\n",
    "k_trn.to_csv(oma_trn_2, index=False)\n",
    "k_tst.to_csv(oma_tst_2, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Quick model check with `Fare` added"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's see if adding Fare to our basic set of features improves our score\n",
    "Y = k_trn['Survived']\n",
    "\n",
    "features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']\n",
    "X = pd.get_dummies(k_trn[features])\n",
    "# the test set is encoded on its own, so align it to the training columns:\n",
    "# same features, same order; a dummy column missing from the test set is filled with 0\n",
    "X_test = pd.get_dummies(k_tst[features]).reindex(columns=X.columns, fill_value=0)\n",
    "model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)\n",
    "model.fit(X, Y)\n",
    "predictions = model.predict(X_test)\n",
    "accuracy_score(k_tst[\"Survived\"], predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}