{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.core.interactiveshell import InteractiveShell\n", "import math\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt # for plotting \n", "import seaborn as sns # for plotting\n", "\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.impute import SimpleImputer, MissingIndicator\n", "from sklearn.experimental import enable_iterative_imputer\n", "from sklearn.impute import IterativeImputer\n", "from sklearn.pipeline import FeatureUnion, make_pipeline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# notebook display defaults: show every expression result, inline plots, 2-decimal floats\n", "InteractiveShell.ast_node_interactivity = \"all\"\n", "%matplotlib inline\n", "plt.style.use('default')\n", "sns.set()\n", "pd.set_option('display.float_format', '{:,.2f}'.format)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# CSV locations: the Kaggle files and my own (oma) versions of them\n", "kaggle_trn = \"./data/titanic/train.csv\"\n", "kaggle_tst = \"./data/titanic/test.csv\"\n", "oma_trn_3 = \"./data/titanic/oma_trn_3.csv\"\n", "oma_tst_3 = \"./data/titanic/oma_tst_3.csv\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# read my current train/test CSVs and also stack them into a single frame\n", "k_trn = pd.read_csv(oma_trn_3)\n", "k_tst = pd.read_csv(oma_tst_3)\n", "k_all = pd.concat([k_trn, k_tst], ignore_index=True)\n", "\n", "# the first age-imputation pass was already run and saved to CSV;\n", "# leave this False so the cells guarded by it are skipped\n", "do_cell = False" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# IterativeImputer (inside a FeatureUnion) fills in the missing ages\n", "# KNNImputer is an alternative to try in future\n", "transformer = FeatureUnion(\n", " transformer_list=[\n", " ('features', 
IterativeImputer(max_iter=10, random_state=0)),\n", " ('indicators', MissingIndicator())])\n", "clf = make_pipeline(transformer, RandomForestClassifier())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# target column plus the feature columns handed to the imputer;\n", "# get_dummies one-hot encodes any non-numeric feature columns\n", "y_trn = k_trn['Survived']\n", "\n", "features = ['PassengerId', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Age']\n", "X_trn = pd.get_dummies(k_trn[features])\n", "X_tst = pd.get_dummies(k_tst[features])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if do_cell:\n", "    # fit on the training features only, then apply the fitted imputer to the test features.\n", "    # The union returns a bare array, so rebuild DataFrames with the indicator column named AgeMissing.\n", "    # NOTE: assumes Age is the only feature with missing values, i.e. exactly one indicator column\n", "    trn_cols = X_trn.columns.tolist()\n", "    trn_cols.append(\"AgeMissing\")\n", "    X_trn_trans = transformer.fit_transform(X_trn, y_trn)\n", "    X_trn_trans = pd.DataFrame(X_trn_trans, columns=trn_cols)\n", "    tst_cols = X_tst.columns.tolist()\n", "    tst_cols.append(\"AgeMissing\")\n", "    X_tst_trans = transformer.transform(X_tst)\n", "    X_tst_trans = pd.DataFrame(X_tst_trans, columns=tst_cols)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if do_cell:\n", "    X_trn_trans.tail()\n", "    X_tst_trans.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if do_cell:\n", "    k_trn.head(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if do_cell:\n", "    # copying Age back relies on the transformed rows lining up with k_trn,\n", "    # so compare PassengerId on every row rather than just the first one\n", "    print((X_trn_trans[\"PassengerId\"] == k_trn[\"PassengerId\"]).all())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if do_cell:\n", "    k_trn_2 = k_trn.copy()\n", "    # assign the column directly: writing through the temporary k_trn_2[:] slice only reaches\n", "    # k_trn_2 when pandas happens to share memory, and is a silent no-op under copy-on-write\n", "    k_trn_2[\"Age\"] = X_trn_trans[\"Age\"]\n", "    k_trn_2 = pd.concat([k_trn_2, X_trn_trans[\"AgeMissing\"]], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if do_cell:\n", "    k_tst_2 = k_tst.copy()\n", "    # same direct column assignment as for the training frame\n", "    k_tst_2[\"Age\"] = X_tst_trans[\"Age\"]\n", "    k_tst_2 = pd.concat([k_tst_2, X_tst_trans[\"AgeMissing\"]], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": 
{}, "outputs": [], "source": [ "if do_cell:\n", " k_trn_2.to_csv(oma_trn_3, index=False)\n", " k_tst_2.to_csv(oma_tst_3, index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if do_cell:\n", " # reload the updated datasets and see if any missing Age data\n", " k_trn = pd.read_csv(oma_trn_3)\n", " k_tst = pd.read_csv(oma_tst_3)\n", " k_trn.info()\n", " k_tst.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I discovered a bit of a problem with 'Age' while working on a future post/notebook: there were negative ages!\n", "\n", "Apparently that is possible with the IterativeImputer, which by default puts no lower or upper bound on the values it imputes.\n", "\n", "Let's have a look." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "k_trn['Age'].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sure enough. I should have checked that when I was originally working on this post/notebook; it would have saved me some grief.\n", "\n", "I was just going to restore a backup of the CSV files and redo the post/notebook, hopefully with the correct result. But I decided to fix things without using the backup, working only from the current CSVs and the original Kaggle CSVs. I expect it might get messy." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# start fresh: rebuild the feature frames from the current CSVs\n", "features = ['PassengerId', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Age']\n", "X_trn = pd.get_dummies(k_trn[features])\n", "X_tst = pd.get_dummies(k_tst[features])\n", "# load the Kaggle datasets so we can get the original Age data\n", "kg_trn = pd.read_csv(kaggle_trn)\n", "kg_tst = pd.read_csv(kaggle_tst)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# replace the Age data in my version of the training set with that from the Kaggle dataset;\n", "# the previously imputed values are kept as iiAge until they have been compared\n", "# NOTE: assumes the Kaggle rows are in the same order as the rows of my CSV\n", "X_trn = X_trn.rename(columns={\"Age\": \"iiAge\"})\n", "X_trn[\"Age\"] = kg_trn[\"Age\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_trn.head()\n", "X_trn.tail()\n", "X_trn[\"Age\"].describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's check things out before deleting the old column:\n", "# list the rows where a known Kaggle age differs from the old column\n", "X_trn.loc[(X_trn['iiAge'].ne(X_trn['Age'])) & (X_trn['Age'].notna())]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# get rid of iiAge column in X_trn\n", "# drop() returns a new frame, so the result has to be assigned back; otherwise the old\n", "# imputed ages (negative values included) stay in X_trn and get fed to the new imputer\n", "X_trn = X_trn.drop(columns='iiAge')\n", "X_trn.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's do same for test data set\n", "X_tst = X_tst.rename(columns={\"Age\": \"iiAge\"})\n", "X_tst[\"Age\"] = kg_tst[\"Age\"]\n", "X_tst.loc[(X_tst['iiAge'].ne(X_tst['Age'])) & (X_tst['Age'].notna())]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# assign the result here as well so iiAge is really removed from X_tst\n", "X_tst = X_tst.drop(columns='iiAge')\n", "X_tst.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# now see if we can fix that imputer\n", "min_age = min(X_trn[\"Age\"].min(), X_tst[\"Age\"].min())\n", "max_age = max(X_trn[\"Age\"].max(), 
X_tst[\"Age\"].max())\n", "print(min_age, max_age)\n", "# bound the imputed values by the youngest and oldest ages actually observed,\n", "# so the imputer can no longer produce negative ages\n", "transformer = FeatureUnion(\n", "    transformer_list=[\n", "        ('features', IterativeImputer(max_iter=10, min_value=min_age, max_value=max_age, random_state=0)),\n", "        ('indicators', MissingIndicator())])\n", "clf = make_pipeline(transformer, RandomForestClassifier())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's train, and transform, our imputer on X_trn, and have a look\n", "# the union returns a bare array, so rebuild a DataFrame with the indicator column named AgeMissing\n", "# NOTE: assumes Age is the only feature with missing values, i.e. exactly one indicator column\n", "trn_cols = X_trn.columns.tolist()\n", "trn_cols.append(\"AgeMissing\")\n", "X_trn_trans = transformer.fit_transform(X_trn, y_trn)\n", "X_trn_trans = pd.DataFrame(X_trn_trans, columns=trn_cols)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_trn_trans.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# looks better, do the same for X_tst (transform only; the imputer stays fitted on X_trn)\n", "tst_cols = X_tst.columns.tolist()\n", "tst_cols.append(\"AgeMissing\")\n", "X_tst_trans = transformer.transform(X_tst)\n", "X_tst_trans = pd.DataFrame(X_tst_trans, columns=tst_cols)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_tst_trans.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# new updated training dataset dataframe\n", "# Age is assigned as a column: writing through the temporary k_trn_2[:] slice only reaches\n", "# k_trn_2 when pandas happens to share memory, and is a silent no-op under copy-on-write\n", "k_trn_2 = k_trn.copy()\n", "k_trn_2 = k_trn_2.drop(\"AgeMissing\", axis=1)\n", "k_trn_2[\"Age\"] = X_trn_trans[\"Age\"]\n", "k_trn_2 = pd.concat([k_trn_2, X_trn_trans[\"AgeMissing\"]], axis=1)\n", "# new updated testing dataset dataframe\n", "k_tst_2 = k_tst.copy()\n", "k_tst_2 = k_tst_2.drop(\"AgeMissing\", axis=1)\n", "k_tst_2[\"Age\"] = X_tst_trans[\"Age\"]\n", "k_tst_2 = pd.concat([k_tst_2, X_tst_trans[\"AgeMissing\"]], axis=1)\n", "# guard against repeating the original mistake: every age must be present and within the observed range\n", "assert k_trn_2[\"Age\"].between(min_age, max_age).all()\n", "assert k_tst_2[\"Age\"].between(min_age, max_age).all()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ "k_trn_2.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "k_tst_2.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# overwrite my working CSVs with the re-imputed Age and AgeMissing columns\n", "k_trn_2.to_csv(oma_trn_3, index=False)\n", "k_tst_2.to_csv(oma_tst_3, index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85" }, "kernelspec": { "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }