{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.core.interactiveshell import InteractiveShell\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt # for plotting \n", "import seaborn as sns # for plotting\n", "from sklearn import datasets\n", "from sklearn import preprocessing\n", "from sklearn import linear_model\n", "from sklearn import model_selection as ms\n", "from sklearn.svm import SVC\n", "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# set up some notebook display defaults\n", "InteractiveShell.ast_node_interactivity = \"all\"\n", "%matplotlib inline\n", "plt.style.use('default')\n", "sns.set()\n", "pd.options.display.float_format = '{:,.2f}'.format" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# load data\n", "c_data = datasets.load_breast_cancer(as_frame=True)\n", "y = c_data.target # Training labels ('malignant = 0', 'benign = 1')\n", "X = c_data.data # 30 attributes; https://scikit-learn.org/stable/datasets/index.html#breast-cancer-dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's split into training and test datasets\n", "X_trn, X_tst, y_trn, y_tst = ms.train_test_split(X, y, test_size=.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's instantiate our standard scalar, and scale the training and test datasets\n", "std_scl = preprocessing.StandardScaler()\n", "X_trn_std = std_scl.fit_transform(X_trn)\n", "X_tst_std = std_scl.transform(X_tst)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's see how long a single cross val, k=5, takes\n", "svm = SVC(C=0.5, random_state=0)\n", "svm_scr = ms.cross_val_score(svm, X_trn_std, y_trn, cv=5, scoring=\"accuracy\")\n", "print(svm_scr)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's start with the C hyperparameter for svm.SVC\n", "cv_scores_1 = {}\n", "reg_cs_1 = [i/1000 for i in range(1, 10, 2)]\n", "reg_cs_2 = [i/1000 for i in range(10, 100, 20)]\n", "reg_cs_3 = [i/1000 for i in range(100, 1000, 200)]\n", "reg_cs_4 = [i/1000 for i in range(1000, 10000, 2000)]\n", "#print(reg_cs_1, \"\\n\", reg_cs_2, \"\\n\", reg_cs_3, \"\\n\", reg_cs_4)\n", "reg_cs = np.concatenate([reg_cs_1, reg_cs_2, reg_cs_3, reg_cs_4])\n", "#print(reg_cs)\n", "\n", "for cr in reg_cs:\n", " svm = SVC(C=cr, random_state=0)\n", " svm_scr = ms.cross_val_score(svm, X_trn_std, y_trn, cv=5, scoring=\"accuracy\")\n", " cv_scores_1[str(cr)] = svm_scr" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for lbl, scrs in cv_scores_1.items():\n", " print(f\"{lbl}: {[round(scr,5) for scr in scrs]} -> {scrs.mean()}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# looks like it just keeps getting better, let's try a few more values, 10 and up\n", "reg_cs_5 = [i for i in range(10, 20, 2)]\n", "reg_cs_2 = np.concatenate([reg_cs, reg_cs_5])\n", "cv_scores_2 = {}\n", "\n", "for cr in reg_cs_2:\n", " svm = SVC(C=cr, random_state=0)\n", " svm_scr = ms.cross_val_score(svm, X_trn_std, y_trn, cv=5, scoring=\"accuracy\")\n", " cv_scores_2[str(cr)] = svm_scr" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "for lbl, scrs in cv_scores_2.items():\n", " print(f\"{lbl}: {[round(scr,5) for scr in scrs]} -> {scrs.mean()}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sv_means = [scrs.mean() for _, scrs in cv_scores_2.items()]\n", "ax = plt.axes()\n", "ax.plot(reg_cs_2, sv_means, color='k', linestyle='-')\n", "ax.scatter(reg_cs_2, sv_means, color='blue', linestyle='-')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# looks like C>=7 and C<=9 gives us the best results\n", "# C=1 is the default, so let's test using C=1 & C=7\n", "c_tst = [1.0, 7.0]\n", "ct_scr = {}\n", "for ct in c_tst:\n", " svm = SVC(C=ct, random_state=0)\n", " t_f = svm.fit(X_trn_std, y_trn)\n", " t_preds = svm.predict(X_tst_std)\n", " ct_scr[str(ct)] = accuracy_score(y_tst, t_preds)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for lbl, scr in ct_scr.items():\n", " print(f\"{lbl}: {scr}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looks like our tuning attempt failed. More research needed. But it is possible our train/test split could be an issue.\n", "\n", "Okay, let's do the same for `SGDClassifier`. We will start with the regularization parameter, `alpha`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cv_scores_3 = {}\n", "reg_alphas = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]\n", "reg_alphas = 10.0**-np.arange(1,7)\n", "\n", "for alpha in reg_alphas:\n", " sgd = linear_model.SGDClassifier(alpha=alpha, max_iter=1000, random_state=0)\n", " sgd_scr = ms.cross_val_score(sgd, X_trn_std, y_trn, cv=5, scoring=\"accuracy\")\n", " cv_scores_3[str(alpha)] = sgd_scr" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for lbl, scrs in cv_scores_3.items():\n", " print(f\"{lbl}: {[round(scr,5) for scr in scrs]} -> {scrs.mean()}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "gd_means = [scrs.mean() for _, scrs in cv_scores_3.items()]\n", "ax = plt.axes()\n", "ax.plot(reg_alphas, gd_means, color='k', linestyle='-')\n", "ax.scatter(reg_alphas, gd_means, color='blue', linestyle='-')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ok, let's fit the training dataset and measure performance against the test dataset." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sgd = linear_model.SGDClassifier(alpha=0.010, max_iter=1000, random_state=0)\n", "sg_f = sgd.fit(X_trn_std, y_trn)\n", "sg_preds = sgd.predict(X_tst_std)\n", "print(f'SGD, alpha=0.010 -> {accuracy_score(y_tst, sg_preds)}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "That score pretty much the same as the **SVM** model with `C=1`." 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let's try one of sklearn's tuning methods\n", "from scipy.stats import uniform as sp_rand\n", "from sklearn.model_selection import RandomizedSearchCV\n", "\n", "param_dist = {'alpha': sp_rand()}\n", "sgd = linear_model.SGDClassifier(max_iter=1000, random_state=0)\n", "\n", "rsearch = RandomizedSearchCV(estimator=sgd, param_distributions=param_dist, n_iter=100, random_state=0)\n", "# fit the search on the training set (not the test set) to avoid leaking test data\n", "rsearch.fit(X_trn_std, y_trn)\n", "rs_tbl = pd.DataFrame(rsearch.cv_results_)\n", "display(rs_tbl)\n", "# summarize the results of the random parameter search\n", "print(rsearch.best_score_)\n", "print(rsearch.best_estimator_.alpha)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# and test the suggested value\n", "sgd_sr = linear_model.SGDClassifier(alpha=rsearch.best_estimator_.alpha, max_iter=1000, random_state=0)\n", "sgd_sr.fit(X_trn_std, y_trn)\n", "sgsr_preds = sgd_sr.predict(X_tst_std)\n", "print(f'SGD, alpha={rsearch.best_estimator_.alpha:.4f} -> {accuracy_score(y_tst, sgsr_preds)}')" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Pretty much in line with the number we got using regular cross-validation on a selection of potential values.\n", "\n", "`sklearn.model_selection.RandomizedSearchCV` may have taken a little longer to run, but it tested many more candidate values." ] }
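, { "cell_type": "markdown", "metadata": {}, "source": [ "One possible refinement (a sketch, not something run above): `alpha` matters on a log scale, so a log-uniform prior may suit the random search better than the flat uniform used above. The sketch below assumes `scipy.stats.loguniform` is available (scipy >= 1.4)." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: randomized search with a log-uniform prior over alpha\n", "# (illustrative only; the results above used a flat uniform prior)\n", "from scipy.stats import loguniform\n", "\n", "rsearch_log = RandomizedSearchCV(\n", "    estimator=linear_model.SGDClassifier(max_iter=1000, random_state=0),\n", "    param_distributions={'alpha': loguniform(1e-6, 1e0)},\n", "    n_iter=100, random_state=0)\n", "rsearch_log.fit(X_trn_std, y_trn)\n", "print(rsearch_log.best_score_)\n", "print(rsearch_log.best_estimator_.alpha)" ] }
], "metadata": { "interpreter": { "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85" }, "kernelspec": { "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }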