{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classifier runs over the `dst.rdc_ds` feature subsets\n",
    "\n",
    "Loads the train/test CSVs named in `rek_ml.dsets_t`, calls `rml.run_classifiers_test` with `rml.classifier_list` once per column subset in `dst.rdc_ds`, and tabulates the returned values next to hard-coded `PCA(12)` figures."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "from IPython.display import display\n",
    "import math\n",
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt # for plotting\n",
    "import seaborn as sns # for plotting\n",
    "\n",
    "# Feature and Model Selection:\n",
    "import lightgbm as lgb\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.linear_model import LogisticRegression, SGDClassifier # linear classifiers\n",
    "from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier\n",
    "from sklearn.model_selection import StratifiedKFold # train/test splitting tool for cross-validation\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc # scoring metrics\n",
    "from sklearn.impute import SimpleImputer, MissingIndicator\n",
    "from sklearn.experimental import enable_iterative_imputer\n",
    "from sklearn.impute import IterativeImputer\n",
    "from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler\n",
    "from sklearn.compose import ColumnTransformer, make_column_transformer\n",
    "from sklearn.pipeline import FeatureUnion, make_pipeline, Pipeline\n",
    "\n",
    "# My own packages/modules\n",
    "import rek_ml.dsets_t as dst\n",
    "import rek_ml.ml_misc as rml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up some notebook display defaults\n",
    "# \"all\": display the value of every top-level expression in a cell, not only the last one\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "%matplotlib inline\n",
    "plt.style.use('default')\n",
    "sns.set()\n",
    "# notebook-wide float rendering: thousands separator, two decimals\n",
    "pd.options.display.float_format = '{:,.2f}'.format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the datasets currently of interest\n",
    "k_trn = pd.read_csv(dst.oma_trn_5, dtype=dst.d_types_5)\n",
    "# pop() returns the target column and removes it from the feature frame in one step\n",
    "y_trn = k_trn.pop(\"Survived\")\n",
    "# _ = k_trn.drop(\"Cabin\", axis=1, inplace=True)\n",
    "\n",
    "k_tst = pd.read_csv(dst.oma_tst_5, dtype=dst.d_types_5)\n",
    "y_tst = k_tst.pop(\"Survived\")\n",
    "# _ = k_tst.drop(\"Cabin\", axis=1, inplace=True)\n",
    "\n",
    "# train and test features stacked row-wise with a fresh 0..n-1 index\n",
    "k_all = pd.concat([k_trn, k_tst], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's try the full feature set\n",
    "X_trn = k_trn[dst.full_trn_features]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_trn.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each named column subset in dst.rdc_ds, run the classifier list on the\n",
    "# matching train/test columns and keep the returned values as a list.\n",
    "# NOTE(review): presumably one score per classifier -- confirm against rml.run_classifiers_test\n",
    "clf_preds = {}\n",
    "for ds_nm, ds_cl in dst.rdc_ds.items():\n",
    "    X_dtrn = k_trn[ds_cl]\n",
    "    X_dtst = k_tst[ds_cl]\n",
    "    tmp_pred = rml.run_classifiers_test(rml.classifier_list, X_dtrn, y_trn, X_dtst, y_tst)\n",
    "    clf_preds[ds_nm] = list(tmp_pred.values())\n",
    "print(clf_preds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# hard-coded figures for the 'PCA(12)' row; they are not recomputed in this notebook\n",
    "tst_acc = {\n",
    "    'PCA(12)': [0.7751196172248804, 0.7751196172248804, 0.7559808612440191, 0.7416267942583732, 0.6961722488038278, 0.7464114832535885]\n",
    "}\n",
    "\n",
    "# merge into a new dict so this cell never alters clf_preds built by the cell above\n",
    "all_preds = {**clf_preds, **tst_acc}\n",
    "# NOTE(review): column order assumes it matches the value order returned by\n",
    "# rml.run_classifiers_test -- confirm against rml.classifier_list\n",
    "df_cols = ['LogRegression', 'SGD', 'GBoosting', 'RandomForest', 'DecisionTree', 'ExtraTrees']\n",
    "df_preds = pd.DataFrame.from_dict(all_preds, orient='index', columns=df_cols)\n",
    "# five decimals for this table only; the notebook-wide '{:,.2f}' default stays untouched\n",
    "with pd.option_context('display.float_format', '{:,.5f}'.format):\n",
    "    display(df_preds)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}