{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "import math\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt # for plotting \n",
    "import seaborn as sns # for plotting\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.impute import SimpleImputer\n",
    "# not used directly, but must be imported before IterativeImputer to enable it\n",
    "from sklearn.experimental import enable_iterative_imputer\n",
    "from sklearn.impute import IterativeImputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up some notebook display defaults\n",
    "# \"all\": every top-level expression in a cell is displayed, not just the last one\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "%matplotlib inline\n",
    "plt.style.use('default')\n",
    "sns.set()\n",
    "pd.options.display.float_format = '{:,.2f}'.format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# paths to datasets\n",
    "kaggle_trn = \"./data/titanic/train.csv\"\n",
    "kaggle_tst = \"./data/titanic/test.csv\"\n",
    "rek_k_tst2 = \"./data/titanic/rek_test_2.csv\"\n",
    "oma_trn = \"./data/titanic/oma_trn.csv\"\n",
    "oma_tst = \"./data/titanic/oma_tst.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the datasets currently of interest\n",
    "k_trn = pd.read_csv(kaggle_trn)\n",
    "k_tst = pd.read_csv(rek_k_tst2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a wee reminder\n",
    "k_trn.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's check test dataset for missing values\n",
    "k_tst.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ok let's first sort out missing embarked values\n",
    "display(k_trn.loc[k_trn['Embarked'].isnull()])\n",
    "# searching suitable sources, turns out both embarked at Southampton\n",
    "# https://www.encyclopedia-titanica.org/titanic-survivor/amelia-icard.html\n",
    "# https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html\n",
    "k_trn.loc[k_trn['Embarked'].isnull(), 'Embarked'] = 'S'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's deal with the missing Fare value in the test set\n",
    "display(k_tst.loc[k_tst['Fare'].isnull()])\n",
    "# will use avg 3 class fare for passenger without any SibSp or Parch,\n",
    "# not quite correct as we are not ignoring fares for multipassenger tickets\n",
    "solo_3rd_class = (k_trn['Pclass'] == 3) & (k_trn['SibSp'] == 0) & (k_trn['Parch'] == 0)\n",
    "avg_fare_3 = k_trn.loc[solo_3rd_class, 'Fare'].mean()\n",
    "print(avg_fare_3)\n",
    "mask = k_tst['PassengerId'] == 1044\n",
    "k_tst.loc[mask, 'Fare'] = avg_fare_3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# I want to save these two dataset to CSVs so don't have to redo these few fixes\n",
    "# Not going to currently save imputed ages, as may use different methods to impute missing ages\n",
    "k_trn.to_csv(oma_trn, index=False)\n",
    "dTN_tst = k_tst.drop(columns=\"TName\")\n",
    "dTN_tst.to_csv(oma_tst, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, let's sort the missing age values. There are a few options. So, I am currently thinking of trying a couple of them and seeing how the model used earlier performs with the differing value estimates.\n",
    "\n",
    "But first, let's write a function to do the testing on our various approaches to modifying the datasets for the missing age values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def trn_tst(X_trn, y_trn, X_tst):\n",
    "    \"\"\"Fit the reference random forest on the training data and predict the test data.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X_trn : features used to fit the model\n",
    "    y_trn : training labels\n",
    "    X_tst : features to predict for\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The model's predictions for X_tst.\n",
    "    \"\"\"\n",
    "    # same hyper-parameters and seed on every call, so only the imputation differs between runs\n",
    "    model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)\n",
    "    model.fit(X_trn, y_trn)\n",
    "    return model.predict(X_tst)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# start by replacing NaN with average passenger age (base case?)\n",
    "r_feats = ['Pclass','Sex','SibSp','Parch','Age','Embarked','Fare']\n",
    "y_trn = k_trn['Survived']\n",
    "Xd_age = pd.get_dummies(k_trn[r_feats])\n",
    "# line the test columns up with the training columns so both frames always share one layout\n",
    "Xd_t_age = pd.get_dummies(k_tst[r_feats]).reindex(columns=Xd_age.columns, fill_value=0)\n",
    "# labels for the plain arrays returned by the imputers; taken from the data rather than hard-coded\n",
    "d_feats = list(Xd_age.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# base case: SimpleImputer with its default strategy (column mean)\n",
    "imputer = SimpleImputer(missing_values=np.nan)\n",
    "X_age = pd.DataFrame(imputer.fit_transform(Xd_age), columns=d_feats)\n",
    "# only transform the test data: the fill values come from the training data\n",
    "X_t_age = pd.DataFrame(imputer.transform(Xd_t_age), columns=d_feats)\n",
    "t_age_pred = trn_tst(X_age, y_trn, X_t_age)\n",
    "print(f'model accuracy: {accuracy_score(k_tst[\"Survived\"], t_age_pred)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "imputer_2 = IterativeImputer(missing_values=np.nan, max_iter=25, random_state=1)\n",
    "# wrap the IterativeImputer output itself; wrapping X_age / X_t_age here would just\n",
    "# repeat the SimpleImputer experiment\n",
    "X_age_2 = pd.DataFrame(imputer_2.fit_transform(Xd_age), columns=d_feats)\n",
    "X_t_age_2 = pd.DataFrame(imputer_2.transform(Xd_t_age), columns=d_feats)\n",
    "t_age_pred_2 = trn_tst(X_age_2, y_trn, X_t_age_2)\n",
    "print(f'model accuracy: {accuracy_score(k_tst[\"Survived\"], t_age_pred_2)}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Surprised Simple and Iterative both produced the same result. May have to dig deeper.\n",
    "\n",
    "**Update:** the identical scores were caused by a copy/paste bug: the iterative cell wrapped the `SimpleImputer` arrays (`X_age`, `X_t_age`) instead of its own output. That is fixed above, so the comparison below now shows the rows where the two imputations really differ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# side-by-side view of the Age column from the two imputers\n",
    "imps2 = pd.DataFrame({'Iterative': X_age_2['Age'], 'Age': X_age['Age']})\n",
    "same_age = imps2['Iterative'] == imps2['Age']\n",
    "imps2[same_age].count()\n",
    "imps2[~same_age]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, let's look at imputing with our own code and approach."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set(font_scale=1.5)\n",
    "feature_list = ['Pclass', 'Age','Sex_female','Sex_male', 'SibSp', 'Parch']\n",
    "hm = plt.figure(figsize=(10,6))\n",
    "g = sns.heatmap(Xd_age[feature_list].corr(), square=True, annot=True, cmap='coolwarm', fmt='.2f')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# matplotlib 3.6 renamed 'seaborn-whitegrid' to 'seaborn-v0_8-whitegrid'; use whichever is installed\n",
    "whitegrid = 'seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available else 'seaborn-whitegrid'\n",
    "plt.style.use(whitegrid)\n",
    "\n",
    "# one Age boxplot per candidate grouping feature\n",
    "fig, axes = plt.subplots(1, 3, figsize=(14, 5))\n",
    "fig.subplots_adjust(wspace=0.3, hspace=0.5)\n",
    "for ax, col in zip(axes, ['Pclass', 'Parch', 'SibSp']):\n",
    "    sns.boxplot(x=col, y='Age', data=k_trn, ax=ax)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# median Age per (Pclass, Parch, SibSp) group; a group where every Age is missing gets NaN\n",
    "median_ages = k_trn.groupby(['Pclass', 'Parch', 'SibSp'], as_index=False).Age.median()\n",
    "pc = 3\n",
    "pa = 2\n",
    "ss = 3\n",
    "median_ages[(median_ages['Pclass']==pc) & (median_ages['Parch']==pa) & (median_ages['SibSp']==ss)].Age.item()\n",
    "print(f\"Number of missing Age values: {median_ages['Age'].isnull().sum()}\")\n",
    "median_ages[(median_ages['Pclass']==3) & (median_ages['Parch']==2) & (median_ages['SibSp']==8)].Age.item()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_trn[(k_trn['Pclass']==3) & (k_trn['Parch']==2) & (k_trn['SibSp']==8)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def impute_age_by_group(X, medians, keys, fallback):\n",
    "    \"\"\"Return a copy of X whose missing Age values are filled from per-group medians.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X : DataFrame with an 'Age' column and every column named in keys\n",
    "    medians : DataFrame with one row per group: the key columns plus the median 'Age'\n",
    "    keys : list of column names identifying a group\n",
    "    fallback : age to use when the group is not in medians or its median is NaN\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    A new DataFrame. X is not modified, so every imputation variant can start\n",
    "    from the same un-imputed data.\n",
    "    \"\"\"\n",
    "    X_out = X.copy()\n",
    "    missing = X_out['Age'].isnull()\n",
    "    # a left merge keeps the order of the rows being filled; groups without a match come back as NaN\n",
    "    est = X_out.loc[missing, keys].merge(medians[keys + ['Age']], on=keys, how='left')['Age']\n",
    "    # round the median up to a whole year, then fall back where no median is available\n",
    "    X_out.loc[missing, 'Age'] = np.ceil(est).fillna(fallback).to_numpy()\n",
    "    return X_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fallback ages are the ones used from the start of this experiment\n",
    "# NOTE(review): training falls back to 25 but test to 15 - confirm the difference is intended\n",
    "X_age_3 = impute_age_by_group(Xd_age, median_ages, ['Pclass', 'Parch', 'SibSp'], fallback=25)\n",
    "X_t_age_3 = impute_age_by_group(Xd_t_age, median_ages, ['Pclass', 'Parch', 'SibSp'], fallback=15)\n",
    "\n",
    "print(f\"Number of missing Age training values: {X_age_3['Age'].isnull().sum()}\")\n",
    "print(f\"Number of missing Age test values: {X_t_age_3['Age'].isnull().sum()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t_age_pred_3 = trn_tst(X_age_3, y_trn, X_t_age_3)\n",
    "print(f'model accuracy: {accuracy_score(k_tst[\"Survived\"], t_age_pred_3)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# coarser grouping: median Age per (Pclass, SibSp) only\n",
    "median_ages_2 = k_trn.groupby(['Pclass', 'SibSp'], as_index=False).Age.median()\n",
    "pc = 3\n",
    "ss = 3\n",
    "median_ages_2[(median_ages_2['Pclass']==pc) & (median_ages_2['SibSp']==ss)].Age.item()\n",
    "print(f\"Number of missing Age values: {median_ages_2['Age'].isnull().sum()}\")\n",
    "median_ages_2[(median_ages_2['Pclass']==3) & (median_ages_2['SibSp']==8)].Age.item()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# median_ages_2 has no Parch column, so the lookup uses Pclass and SibSp only\n",
    "X_age_4 = impute_age_by_group(Xd_age, median_ages_2, ['Pclass', 'SibSp'], fallback=25)\n",
    "X_t_age_4 = impute_age_by_group(Xd_t_age, median_ages_2, ['Pclass', 'SibSp'], fallback=15)\n",
    "\n",
    "print(f\"Number of missing Age training values: {X_age_4['Age'].isnull().sum()}\")\n",
    "print(f\"Number of missing Age test values: {X_t_age_4['Age'].isnull().sum()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t_age_pred_4 = trn_tst(X_age_4, y_trn, X_t_age_4)\n",
    "print(f'model accuracy: {accuracy_score(k_tst[\"Survived\"], t_age_pred_4)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}