{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.core.interactiveshell import InteractiveShell\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt # for plotting \n", "import seaborn as sns # for plotting\n", "from sklearn import datasets\n", "from sklearn import preprocessing\n", "from sklearn import linear_model\n", "from sklearn import model_selection as ms\n", "from sklearn.svm import SVC\n", "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# set up some notebook display defaults\n", "InteractiveShell.ast_node_interactivity = \"all\"\n", "%matplotlib inline\n", "plt.style.use('default')\n", "sns.set()\n", "pd.options.display.float_format = '{:,.2f}'.format" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# load data\n", "c_data = datasets.load_breast_cancer(as_frame=True)\n", "y = c_data.target # Training labels ('malignant = 0', 'benign = 1')\n", "X = c_data.data # 30 attributes; https://scikit-learn.org/stable/datasets/index.html#breast-cancer-dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's split into training and test datasets\n", "X_trn, X_tst, y_trn, y_tst = ms.train_test_split(X, y, test_size=.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's instantiate our standard scalar, and scale the training and test datasets\n", "std_scl = preprocessing.StandardScaler()\n", "X_trn_std = std_scl.fit_transform(X_trn)\n", "X_tst_std = std_scl.transform(X_tst)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's see how long a single cross val, k=5, takes\n", "svm = SVC(C=0.5, random_state=0)\n", "svm_scr = ms.cross_val_score(svm, X_trn_std, y_trn, cv=5, scoring=\"accuracy\")\n", "print(svm_scr)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's start with the C hyperparameter for svm.SVC\n", "cv_scores_1 = {}\n", "reg_cs_1 = [i/1000 for i in range(1, 10, 2)]\n", "reg_cs_2 = [i/1000 for i in range(10, 100, 20)]\n", "reg_cs_3 = [i/1000 for i in range(100, 1000, 200)]\n", "reg_cs_4 = [i/1000 for i in range(1000, 10000, 2000)]\n", "#print(reg_cs_1, \"\\n\", reg_cs_2, \"\\n\", reg_cs_3, \"\\n\", reg_cs_4)\n", "reg_cs = np.concatenate([reg_cs_1, reg_cs_2, reg_cs_3, reg_cs_4])\n", "#print(reg_cs)\n", "\n", "for cr in reg_cs:\n", " svm = SVC(C=cr, random_state=0)\n", " svm_scr = ms.cross_val_score(svm, X_trn_std, y_trn, cv=5, scoring=\"accuracy\")\n", " cv_scores_1[str(cr)] = svm_scr" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for lbl, scrs in cv_scores_1.items():\n", " print(f\"{lbl}: {[round(scr,5) for scr in scrs]} -> {scrs.mean()}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# looks like it just keeps getting better, let's try a few more values, 10 and up\n", "reg_cs_5 = [i for i in range(10, 20, 2)]\n", "reg_cs_2 = np.concatenate([reg_cs, reg_cs_5])\n", "cv_scores_2 = {}\n", "\n", "for cr in reg_cs_2:\n", " svm = SVC(C=cr, random_state=0)\n", " svm_scr = ms.cross_val_score(svm, X_trn_std, y_trn, cv=5, scoring=\"accuracy\")\n", " cv_scores_2[str(cr)] = svm_scr" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "for lbl, scrs in cv_scores_2.items():\n", " print(f\"{lbl}: {[round(scr,5) for scr in scrs]} -> {scrs.mean()}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sv_means = [scrs.mean() for _, scrs in cv_scores_2.items()]\n", "ax = plt.axes()\n", "ax.plot(reg_cs_2, sv_means, color='k', linestyle='-')\n", "ax.scatter(reg_cs_2, sv_means, color='blue', linestyle='-')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# looks like C>=7 and C<=9 gives us the best results\n", "# C=1 is the default, so let's test using C=1 & C=7\n", "c_tst = [1.0, 7.0]\n", "ct_scr = {}\n", "for ct in c_tst:\n", " svm = SVC(C=ct, random_state=0)\n", " t_f = svm.fit(X_trn_std, y_trn)\n", " t_preds = svm.predict(X_tst_std)\n", " ct_scr[str(ct)] = accuracy_score(y_tst, t_preds)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for lbl, scr in ct_scr.items():\n", " print(f\"{lbl}: {scr}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looks like our tuning attempt failed. More research needed. But it is possible our train/test split could be an issue.\n", "\n", "Okay, let's do the same for `SGDClassifier`. We will start with the regularization parameter, `alpha`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cv_scores_3 = {}\n", "reg_alphas = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]\n", "reg_alphas = 10.0**-np.arange(1,7)\n", "\n", "for alpha in reg_alphas:\n", " sgd = linear_model.SGDClassifier(alpha=alpha, max_iter=1000, random_state=0)\n", " sgd_scr = ms.cross_val_score(sgd, X_trn_std, y_trn, cv=5, scoring=\"accuracy\")\n", " cv_scores_3[str(alpha)] = sgd_scr" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for lbl, scrs in cv_scores_3.items():\n", " print(f\"{lbl}: {[round(scr,5) for scr in scrs]} -> {scrs.mean()}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "gd_means = [scrs.mean() for _, scrs in cv_scores_3.items()]\n", "ax = plt.axes()\n", "ax.plot(reg_alphas, gd_means, color='k', linestyle='-')\n", "ax.scatter(reg_alphas, gd_means, color='blue', linestyle='-')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ok, let's fit the training dataset and measure performance against the test dataset." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sgd = linear_model.SGDClassifier(alpha=0.010, max_iter=1000, random_state=0)\n", "sg_f = sgd.fit(X_trn_std, y_trn)\n", "sg_preds = sgd.predict(X_tst_std)\n", "print(f'SGD, alpha=0.010 -> {accuracy_score(y_tst, sg_preds)}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "That score pretty much the same as the **SVM** model with `C=1`." 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's try one of sklearn's tuning methods\n", "from scipy.stats import uniform as sp_rand\n", "from sklearn.model_selection import RandomizedSearchCV\n", "\n", "param_dist = {'alpha': sp_rand()}\n", "sgd = linear_model.SGDClassifier(max_iter=1000, random_state=0)\n", "\n", "rsearch = RandomizedSearchCV(estimator=sgd, param_distributions=param_dist, n_iter=100, random_state=0)\n", "# fit the search on the training set (not the test set) to avoid leaking test data\n", "rsearch.fit(X_trn_std, y_trn)\n", "rs_tbl = pd.DataFrame(rsearch.cv_results_)\n", "display(rs_tbl)\n", "# summarize the results of the random parameter search\n", "print(rsearch.best_score_)\n", "print(rsearch.best_estimator_.alpha)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# and test the suggested value\n", "sgd_sr = linear_model.SGDClassifier(alpha=rsearch.best_estimator_.alpha, max_iter=1000, random_state=0)\n", "sgd_sr.fit(X_trn_std, y_trn)\n", "sgsr_preds = sgd_sr.predict(X_tst_std)\n", "print(f'SGD, alpha={rsearch.best_estimator_.alpha:.4f} -> {accuracy_score(y_tst, sgsr_preds)}')" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Pretty much in line with the number we got using regular cross-validation on a selection of potential values.\n", "\n", "`sklearn.model_selection.RandomizedSearchCV` may have taken a little longer to run, but it tested many more candidate values." ] }
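, { "cell_type": "markdown", "metadata": {}, "source": [ "One possible refinement (a sketch, not something run above): `alpha` matters on a log scale, so a log-uniform prior may suit the random search better than the flat uniform used above. The sketch below assumes `scipy.stats.loguniform` is available (scipy >= 1.4)." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: randomized search with a log-uniform prior over alpha\n", "# (illustrative only; the results above used a flat uniform prior)\n", "from scipy.stats import loguniform\n", "\n", "rsearch_log = RandomizedSearchCV(\n", "    estimator=linear_model.SGDClassifier(max_iter=1000, random_state=0),\n", "    param_distributions={'alpha': loguniform(1e-6, 1e0)},\n", "    n_iter=100, random_state=0)\n", "rsearch_log.fit(X_trn_std, y_trn)\n", "print(rsearch_log.best_score_)\n", "print(rsearch_log.best_estimator_.alpha)" ] }
], "metadata": { "interpreter": { "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85" }, "kernelspec": { "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }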