{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.core.interactiveshell import InteractiveShell\n", "import math\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt # for plotting \n", "import seaborn as sns # for plotting\n", "\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.impute import SimpleImputer, MissingIndicator, KNNImputer\n", "from sklearn.experimental import enable_iterative_imputer\n", "from sklearn.impute import IterativeImputer\n", "from sklearn.pipeline import FeatureUnion, make_pipeline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# set up some notebook display defaults\n", "InteractiveShell.ast_node_interactivity = \"all\"\n", "%matplotlib inline\n", "plt.style.use('default')\n", "sns.set()\n", "pd.options.display.float_format = '{:,.2f}'.format" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# paths to datasets\n", "kaggle_trn = \"./data/titanic/train.csv\"\n", "kaggle_tst = \"./data/titanic/test.csv\"\n", "oma_trn_3 = \"./data/titanic/oma_trn_3.csv\"\n", "oma_tst_3 = \"./data/titanic/oma_tst_3.csv\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# load the datasets currently of interest\n", "k_trn = pd.read_csv(oma_trn_3)\n", "k_tst = pd.read_csv(oma_tst_3)\n", "k_all = k_trn\n", "k_all = pd.concat([k_all, k_tst], ignore_index=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# will use iterativeimputer in pipeline to fill in missing ages\n", "# may also try KNNImputer in future\n", "transformer = FeatureUnion(\n", " transformer_list=[\n", " ('features', IterativeImputer(max_iter=10, random_state=0)),\n", " ('indicators', MissingIndicator())])\n", "clf = make_pipeline(transformer, RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_trn = k_trn['Survived']\n", "\n", "features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Age']\n", "X_trn = pd.get_dummies(k_trn[features])\n", "X_test = pd.get_dummies(k_tst[features])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trn_cols = X_trn.columns.tolist()\n", "trn_cols.append(\"AgeMissing\")\n", "tst_cols = X_test.columns.tolist()\n", "tst_cols.append(\"AgeMissing\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "clf = clf.fit(X_trn, y_trn)\n", "preds = clf.predict(X_test)\n", "accuracy_score(k_tst[\"Survived\"], preds)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# want to have a look at the intermediate/transformed data for training and prediction\n", "X_trn.tail()\n", "x_intermediate = X_trn\n", "for step in clf.steps[:-1]:\n", " x_intermediate = step[1].transform(x_intermediate)\n", " x_int_trn_trans = pd.DataFrame(x_intermediate, columns=trn_cols)\n", " x_int_trn_trans.tail()\n", "x_tst_int = X_test\n", "for step in clf.steps[:-1]:\n", " x_int = step[1].transform(x_tst_int)\n", " x_int_tst_trans = pd.DataFrame(x_int, columns=tst_cols)\n", " x_int_tst_trans.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's check the pipeline 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# double-check the pipeline result: run the transform step on its own so the next cell\n", "# can fit the classifier on the already-transformed data\n", "trn_cols = X_trn.columns.tolist()\n", "trn_cols.append(\"AgeMissing\")\n", "X_trn_trans = transformer.fit_transform(X_trn, y_trn)\n", "X_trn_trans = pd.DataFrame(X_trn_trans, columns=trn_cols)\n", "tst_cols = X_test.columns.tolist()\n", "tst_cols.append(\"AgeMissing\")\n", "X_tst_trans = transformer.transform(X_test)\n", "X_tst_trans = pd.DataFrame(X_tst_trans, columns=tst_cols)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# fit the same classifier on the pre-transformed data; the score should match the pipeline's\n", "model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)\n", "model.fit(X_trn_trans, y_trn)\n", "predictions = model.predict(X_tst_trans)\n", "accuracy_score(k_tst[\"Survived\"], predictions)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's try the KNNImputer\n", "transformer_2 = FeatureUnion(\n", "    transformer_list=[\n", "        ('features', KNNImputer(n_neighbors=5)),\n", "        ('indicators', MissingIndicator())])\n", "clf_2 = make_pipeline(transformer_2, RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# fit and score the KNN-based pipeline\n", "clf_2 = clf_2.fit(X_trn, y_trn)\n", "preds_2 = clf_2.predict(X_test)\n", "accuracy_score(k_tst[\"Survived\"], preds_2)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's try a few values for n_neighbors\n", "for kn in [2, 4, 6, 8, 10, 12]:\n", "    transformer_3 = FeatureUnion(\n", "        transformer_list=[\n", "            ('features', KNNImputer(n_neighbors=kn)),\n", "            ('indicators', MissingIndicator())])\n", "    clf_3 = make_pipeline(transformer_3, RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1))\n", "    clf_3 = clf_3.fit(X_trn, y_trn)\n", "    preds_3 = clf_3.predict(X_test)\n", "    score = accuracy_score(k_tst[\"Survived\"], preds_3)\n", "    print(f\"n_neighbors={kn} -> {score}\")" ] }
], "metadata": { "interpreter": { "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85" }, "kernelspec": { "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }