{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\r\n",
    "import numpy as np\r\n",
    "import pandas as pd\r\n",
    "import matplotlib.pyplot as plt # for plotting \r\n",
    "import seaborn as sns # for plotting\r\n",
    "from sklearn import datasets\r\n",
    "from sklearn import preprocessing\r\n",
    "from sklearn import linear_model\r\n",
    "from sklearn import model_selection as ms\r\n",
    "from sklearn.svm import SVC\r\n",
    "from sklearn.metrics import accuracy_score"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Notebook-wide display configuration.\r\n",
    "# Echo every bare expression in a cell, not only the final one.\r\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\r\n",
    "# Show floats with thousands separators and two decimals.\r\n",
    "pd.set_option('display.float_format', '{:,.2f}'.format)\r\n",
    "\r\n",
    "# Plot setup: inline backend, matplotlib defaults, then seaborn's settings applied last.\r\n",
    "%matplotlib inline\r\n",
    "plt.style.use('default')\r\n",
    "sns.set()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Load the breast-cancer dataset as pandas objects.\r\n",
    "# Docs: https://scikit-learn.org/stable/datasets/index.html#breast-cancer-dataset\r\n",
    "c_data = datasets.load_breast_cancer(as_frame=True)\r\n",
    "X = c_data[\"data\"]    # 30 attributes\r\n",
    "y = c_data[\"target\"]  # labels: malignant = 0, benign = 1"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Peek at the first five labels, then the first five feature rows.\r\n",
    "y.head(n=5)\r\n",
    "X.head(n=5)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# How many samples fall in each class? (0 = malignant, 1 = benign)\r\n",
    "target_cnt = y.value_counts(sort=True)  # most frequent class first\r\n",
    "print(target_cnt)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Bar chart of the class counts, with each count written inside its bar.\r\n",
    "# Return values are bound to names because semi-colons weren't preventing\r\n",
    "# the echo of the returned matplotlib objects.\r\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\r\n",
    "ax = sns.countplot(x=y, order=[0, 1], ax=ax)\r\n",
    "pt = ax.set_title('Distribution of Outcomes')\r\n",
    "px = ax.set_xlabel('Is Benign (1 = True)')\r\n",
    "py = ax.set_ylabel('Count')\r\n",
    "\r\n",
    "for p in ax.patches:\r\n",
    "    # Anchor at the bar's horizontal centre (independent of bar width); the leading\r\n",
    "    # newline together with va='top' drops the text just below the bar's top edge.\r\n",
    "    # ':.0f' keeps the label an integer even if the patch height is a float.\r\n",
    "    pv = ax.annotate(f'\\n{p.get_height():.0f}',\r\n",
    "                     (p.get_x() + p.get_width() / 2, p.get_height()),\r\n",
    "                     ha='center', va='top', color='white', size=14)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# a deeper look at the attributes: per-column summary of the feature frame\r\n",
    "X.info()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Split the full dataset in two, holding out the tail as a final test set:\r\n",
    "# the first N_TRAIN rows are used for training/cross-validation and the remaining\r\n",
    "# rows (99 for the 569-sample dataset, roughly the last 1/6) for the final test.\r\n",
    "# Since the data looks to be unordered, a plain positional split is used (no shuffling).\r\n",
    "N_TRAIN = 470\r\n",
    "X_trn, X_tst = X.iloc[:N_TRAIN], X.iloc[N_TRAIN:]\r\n",
    "y_trn, y_tst = y.iloc[:N_TRAIN], y.iloc[N_TRAIN:]\r\n",
    "print(len(y_trn), len(y_tst), len(X_trn), len(X_tst))"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# First pass: cross-validate a Perceptron on the training portion.\r\n",
    "# 5 folds are used so that each validation fold holds 94 samples; 10 folds would\r\n",
    "# leave only 47 per fold (the author was not sure this reasoning is correct).\r\n",
    "alf = linear_model.Perceptron(tol=1e-3, random_state=0)\r\n",
    "scores = ms.cross_val_score(estimator=alf, X=X_trn, y=y_trn, scoring=\"accuracy\", cv=5)\r\n",
    "score = np.mean(scores)\r\n",
    "print(f\"Perceptron: min = {min(scores)}, max = {max(scores)}, mean = {score}\")"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Next: a support-vector classifier with default settings, same 5-fold protocol.\r\n",
    "svm = SVC()\r\n",
    "scores1 = ms.cross_val_score(estimator=svm, X=X_trn, y=y_trn, scoring=\"accuracy\", cv=5)\r\n",
    "score1 = np.mean(scores1)\r\n",
    "print(f\"SVM: min = {min(scores1)}, max = {max(scores1)}, mean = {score1}\")"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# and finally let's look at SGD, using defaults for most parameters\r\n",
    "# random_state is pinned (as for the Perceptron) because SGD shuffles the training data\r\n",
    "# each epoch; without a seed the scores change from run to run.\r\n",
    "sgm = linear_model.SGDClassifier(max_iter=25, random_state=0)\r\n",
    "scores2 = ms.cross_val_score(sgm, X_trn, y_trn, cv=5, scoring=\"accuracy\")\r\n",
    "score2 = scores2.mean()\r\n",
    "print(f\"SGM: min = {min(scores2)}, max = {max(scores2)}, mean = {score2}\")"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# let's increase max_iter\r\n",
    "# random_state is pinned so that any change in the scores comes from max_iter,\r\n",
    "# not from a different random shuffling of the training data.\r\n",
    "sgm = linear_model.SGDClassifier(max_iter=75, random_state=0)\r\n",
    "scores3 = ms.cross_val_score(sgm, X_trn, y_trn, cv=5, scoring=\"accuracy\")\r\n",
    "score3 = scores3.mean()\r\n",
    "print(f\"SGM: min = {min(scores3)}, max = {max(scores3)}, mean = {score3}\")"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# let's train the three models on our training dataset and try them against the test dataset\r\n",
    "# fit() returns the estimator itself; assigning it back keeps the estimator repr from being\r\n",
    "# echoed, and each accuracy is printed with a label so the three numbers can be told apart\r\n",
    "# without relying on the notebook's display settings.\r\n",
    "alf = alf.fit(X_trn, y_trn)\r\n",
    "prcp_preds = alf.predict(X_tst)\r\n",
    "print(f\"Perceptron test accuracy: {accuracy_score(y_tst, prcp_preds)}\")\r\n",
    "\r\n",
    "svm = svm.fit(X_trn, y_trn)\r\n",
    "svm_preds = svm.predict(X_tst)\r\n",
    "print(f\"SVM test accuracy: {accuracy_score(y_tst, svm_preds)}\")\r\n",
    "\r\n",
    "# sgm is whichever SGDClassifier was defined last above; it is seeded here so that\r\n",
    "# this test score is reproducible.\r\n",
    "sgm = sgm.set_params(random_state=0).fit(X_trn, y_trn)\r\n",
    "sgm_preds = sgm.predict(X_tst)\r\n",
    "print(f\"SGD test accuracy: {accuracy_score(y_tst, sgm_preds)}\")"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "And with the exception of the Perceptron model, the test scores look similar to the cross_validation scores.\r\n",
    "\r\n",
    "Wondering about the Perceptron result. From the documentation, it looks like the Perceptron `fit()` method uses stochastic gradient descent when generating the model. Quoting: \"Perceptron() is equivalent to SGDClassifier(loss=\"perceptron\", eta0=1, learning_rate=\"constant\", penalty=None).\" For the SGD model fitting above we are using the default `loss=\"hinge\"`."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.9.2",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)"
  },
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}