{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Breast cancer classification: Perceptron vs. SVM vs. SGD\r\n",
    "\r\n",
    "Loads scikit-learn's breast cancer dataset, holds out the trailing rows as a final test set, compares a `Perceptron`, an `SVC` and an `SGDClassifier` with 5-fold cross-validation on the remaining rows, and finally scores all three models on the held-out rows."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\r\n",
    "import numpy as np\r\n",
    "import pandas as pd\r\n",
    "import matplotlib.pyplot as plt # for plotting\r\n",
    "import seaborn as sns # for plotting\r\n",
    "from sklearn import datasets\r\n",
    "from sklearn import preprocessing\r\n",
    "from sklearn import linear_model\r\n",
    "from sklearn import model_selection as ms\r\n",
    "from sklearn.svm import SVC\r\n",
    "from sklearn.metrics import accuracy_score"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# display the value of every top-level expression in a cell, not only the last one\r\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\r\n",
    "%matplotlib inline\r\n",
    "plt.style.use('default')\r\n",
    "sns.set()\r\n",
    "pd.options.display.float_format = '{:,.2f}'.format\r\n",
    "\r\n",
    "# analysis parameters, kept in one place so they are easy to tune\r\n",
    "SEED = 0       # random_state shared by the stochastic estimators, so a re-run reproduces the same scores\r\n",
    "N_TRAIN = 470  # leading rows used for training / cross-validation; the remaining rows form the final test set\r\n",
    "CV_FOLDS = 5   # number of folds used by every cross-validation run"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "c_data = datasets.load_breast_cancer(as_frame=True)\r\n",
    "y = c_data.target # Training labels ('malignant = 0', 'benign = 1')\r\n",
    "X = c_data.data # 30 attributes; https://scikit-learn.org/stable/datasets/index.html#breast-cancer-dataset"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "y.head()\r\n",
    "X.head()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# let's get a bit more info on the labels\r\n",
    "# note: 1 = benign, 0 = malignant\r\n",
    "target_cnt = y.value_counts()\r\n",
    "print(target_cnt)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# or we could have used a countplot\r\n",
    "# with ast_node_interactivity = \"all\" a trailing semicolon does not silence these calls,\r\n",
    "# so every return value is assigned to a variable to keep the reprs out of the output\r\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\r\n",
    "ax = sns.countplot(x=y, order=[0, 1], ax=ax)\r\n",
    "pt = ax.set_title('Distribution of Outcomes')\r\n",
    "px = ax.set_xlabel('Is Benign (1 = True)')\r\n",
    "py = ax.set_ylabel('Count')\r\n",
    "\r\n",
    "# write each count inside the top of its bar: anchor at the bar's horizontal centre,\r\n",
    "# and let the leading newline plus va='top' push the text just below the top edge\r\n",
    "for p in ax.patches:\r\n",
    "    pv = ax.annotate(f'\\n{p.get_height():.0f}', (p.get_x() + p.get_width() / 2, p.get_height()), ha='center', va='top', color='white', size=14)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# a deeper look at the attributes\r\n",
    "X.info()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# I am going to split the full dataset in two, keeping the last len(X) - N_TRAIN rows\r\n",
    "# (99 of the 569 samples, roughly 1/6) for a final test case\r\n",
    "# NOTE(review): a positional split only gives a fair test set if the rows are not ordered by class; that is\r\n",
    "# assumed here, not checked -- ms.train_test_split(X, y, stratify=y, random_state=SEED) would shuffle the rows\r\n",
    "# and keep the class ratio equal in both parts\r\n",
    "y_trn, y_tst = y.iloc[:N_TRAIN], y.iloc[N_TRAIN:]\r\n",
    "X_trn, X_tst = X.iloc[:N_TRAIN], X.iloc[N_TRAIN:]\r\n",
    "\r\n",
    "# the two parts must line up with each other and together cover every row exactly once\r\n",
    "assert len(X_trn) == len(y_trn) and len(X_tst) == len(y_tst)\r\n",
    "assert len(X_trn) + len(X_tst) == len(X)\r\n",
    "print(len(y_trn), len(y_tst), len(X_trn), len(X_tst))"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "def cv_report(label, model, cv=CV_FOLDS):\r\n",
    "    \"\"\"Cross-validate `model` on (X_trn, y_trn) and print the min / max / mean fold accuracy.\r\n",
    "\r\n",
    "    label -- text printed in front of the summary line\r\n",
    "    model -- scikit-learn classifier to evaluate\r\n",
    "    cv    -- number of folds handed to cross_val_score\r\n",
    "\r\n",
    "    Returns the array of per-fold accuracy scores.\r\n",
    "    \"\"\"\r\n",
    "    fold_scores = ms.cross_val_score(model, X_trn, y_trn, cv=cv, scoring=\"accuracy\")\r\n",
    "    print(f\"{label}: min = {fold_scores.min()}, max = {fold_scores.max()}, mean = {fold_scores.mean()}\")\r\n",
    "    return fold_scores"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# let's start with a quick look at training/validating the perceptron algorithm on this data set\r\n",
    "alf = linear_model.Perceptron(tol=1e-3, random_state=SEED)\r\n",
    "# using 5-fold, as that would put 94 samples in each fold, 10-fold would only leave a 47 sample test on each iteration\r\n",
    "# and, no, I don't know if that is a correct assessment of the situation\r\n",
    "scores = cv_report(\"Perceptron\", alf)\r\n",
    "score = scores.mean()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# let's look at basic SVM\r\n",
    "svm = SVC()\r\n",
    "scores1 = cv_report(\"SVM\", svm)\r\n",
    "score1 = scores1.mean()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# and finally let's look at SGD, using defaults for most parameters\r\n",
    "# random_state is fixed so the shuffling done by SGD, and therefore the scores, are the same on every run\r\n",
    "sgm = linear_model.SGDClassifier(max_iter=25, random_state=SEED)\r\n",
    "scores2 = cv_report(\"SGD (max_iter=25)\", sgm)\r\n",
    "score2 = scores2.mean()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# let's increase max_iter\r\n",
    "sgm = linear_model.SGDClassifier(max_iter=75, random_state=SEED)\r\n",
    "scores3 = cv_report(\"SGD (max_iter=75)\", sgm)\r\n",
    "score3 = scores3.mean()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# let's train the three models on our training dataset and try them against the test dataset\r\n",
    "# (sgm is the max_iter=75 model from the previous cell)\r\n",
    "# fit() hands back the estimator, so fitting and predicting are chained; that also keeps the\r\n",
    "# estimator reprs out of the cell output, which the trailing semicolons did not achieve\r\n",
    "prcp_preds = alf.fit(X_trn, y_trn).predict(X_tst)\r\n",
    "svm_preds = svm.fit(X_trn, y_trn).predict(X_tst)\r\n",
    "sgm_preds = sgm.fit(X_trn, y_trn).predict(X_tst)\r\n",
    "\r\n",
    "# label each score so the three numbers cannot be mixed up\r\n",
    "for label, preds in [(\"Perceptron\", prcp_preds), (\"SVM\", svm_preds), (\"SGD\", sgm_preds)]:\r\n",
    "    print(f\"{label}: test accuracy = {accuracy_score(y_tst, preds)}\")"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "And with the exception of the Perceptron model, the test scores look similar to the cross-validation scores.\r\n",
    "\r\n",
    "I am wondering about the Perceptron result. As per the documentation, it looks like the Perceptron `fit()` method uses stochastic gradient descent when generating the model. Quoting: \"Perceptron() is equivalent to SGDClassifier(loss=\"perceptron\", eta0=1, learning_rate=\"constant\", penalty=None).\" For the SGD model fitting above we are using the default `loss=\"hinge\"`."
   ],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.9.2",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)"
  },
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}