In [None]:
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting 
import seaborn as sns # for plotting
from sklearn import datasets
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import model_selection as ms
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
# Notebook-wide display configuration.
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Reset matplotlib to its default style first, then apply seaborn's default theme;
# the later call wins wherever the two overlap, so the order matters.
plt.style.use('default')
sns.set()
# Show floats in pandas output with a thousands separator and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Load the breast-cancer dataset as pandas objects.
# Reference: https://scikit-learn.org/stable/datasets/index.html#breast-cancer-dataset
c_data = datasets.load_breast_cancer(as_frame=True)
# X: feature frame with 30 attributes
# y: training labels ('malignant = 0', 'benign = 1')
X, y = c_data.data, c_data.target

In [None]:
# Peek at the first rows of the labels and of the feature frame.
# Both bare expressions are displayed because ast_node_interactivity is set to "all".
y.head()
X.head()

In [None]:
# Class balance of the labels: how many samples fall in each class.
# note: 1 = benign, 0 = malignant
target_cnt = y.value_counts()
print(target_cnt)

In [None]:
# The same class balance as a bar chart.
# Every top-level call result is bound to a name on purpose: this notebook echoes all
# bare expressions, and trailing semicolons were not suppressing that output.
fig, ax = plt.subplots(figsize=(8, 6))
ax = sns.countplot(x=y, order=[0, 1], ax=ax)
pt = ax.set_title('Distribution of Outcomes')
px = ax.set_xlabel('Is Benign (1 = True)')
py = ax.set_ylabel('Count')

# Write each bar's count in white just inside the top of the bar.
for p in ax.patches:
    # Anchor on the bar's real horizontal centre (left edge + half its width) so that
    # ha='center' actually centres the label, whatever the bar width is.
    bar_mid = p.get_x() + p.get_width() / 2
    # get_height() is a float; ':.0f' shows the count as a whole number ("357", not "357.0").
    # The leading newline, combined with va='top', drops the text below the bar's top edge.
    pv = ax.annotate(f'\n{p.get_height():.0f}', (bar_mid, p.get_height()),
                     ha='center', va='top', color='white', size=14)

In [None]:
# A deeper look at the attributes: column names, non-null counts and dtypes.
X.info()

In [None]:
# Split the full dataset in two: the first N_TRAIN rows are used for training and
# cross-validation, and everything after them is kept aside as a final test set.
# With the dataset's 569 rows this holds out 99 rows (roughly the last 1/6).
# The rows are taken in file order with no shuffling, on the assumption that the data is
# unordered. NOTE(review): a shuffled, stratified split (ms.train_test_split with
# stratify=y) would not depend on that assumption.
N_TRAIN = 470

# .iloc makes it explicit that the cut is by row position, not by index label.
y_trn, y_tst = y.iloc[:N_TRAIN], y.iloc[N_TRAIN:]
X_trn, X_tst = X.iloc[:N_TRAIN], X.iloc[N_TRAIN:]
print(len(y_trn), len(y_tst), len(X_trn), len(X_tst))

In [None]:
# First candidate: a perceptron, trained and validated by cross-validation on the
# training split only.
# Fold count: cv=5 puts 94 samples in each fold, whereas cv=10 would leave only a
# 47-sample test on each iteration. (The author notes they are not sure this is the
# right way to reason about the choice.)
alf = linear_model.Perceptron(random_state=0, tol=1e-3)
scores = ms.cross_val_score(alf, X_trn, y_trn, scoring="accuracy", cv=5)
score = scores.mean()
print(f"Perceptron: min = {scores.min()}, max = {scores.max()}, mean = {score}")

In [None]:
# Second candidate: a support vector classifier with all-default settings,
# scored with the same 5-fold accuracy cross-validation.
svm = SVC()
scores1 = ms.cross_val_score(svm, X_trn, y_trn, scoring="accuracy", cv=5)
score1 = scores1.mean()
print(f"SVM: min = {scores1.min()}, max = {scores1.max()}, mean = {score1}")

In [None]:
# Third candidate: SGD, using defaults for most parameters (so the default hinge loss).
# random_state is pinned, as it is for the Perceptron: SGD shuffles the training data
# each epoch, so without a seed these scores would change on every run.
sgm = linear_model.SGDClassifier(max_iter=25, random_state=0)
scores2 = ms.cross_val_score(sgm, X_trn, y_trn, cv=5, scoring="accuracy")
score2 = scores2.mean()
print(f"SGM: min = {min(scores2)}, max = {max(scores2)}, mean = {score2}")

In [None]:
# Same SGD model with a larger iteration budget (max_iter=75).
# random_state is pinned so that any change in the scores comes from max_iter rather
# than from a different random shuffle of the training data.
# Note: this rebinds `sgm`, so later cells use this max_iter=75 model.
sgm = linear_model.SGDClassifier(max_iter=75, random_state=0)
scores3 = ms.cross_val_score(sgm, X_trn, y_trn, cv=5, scoring="accuracy")
score3 = scores3.mean()
print(f"SGM: min = {min(scores3)}, max = {max(scores3)}, mean = {score3}")

In [None]:
# Final check: refit each of the three models on the whole training split and score it
# once against the held-out test split.
# fit() returns the estimator itself, so it is chained straight into predict(); this
# avoids a bare fit() expression, whose repr the notebook would otherwise echo.
# Each accuracy is printed with a label so the three numbers can be told apart.
prcp_preds = alf.fit(X_trn, y_trn).predict(X_tst)
print(f"Perceptron test accuracy: {accuracy_score(y_tst, prcp_preds)}")

svm_preds = svm.fit(X_trn, y_trn).predict(X_tst)
print(f"SVM test accuracy: {accuracy_score(y_tst, svm_preds)}")

# `sgm` is whichever SGDClassifier was bound last (the max_iter=75 one when the
# notebook is run top to bottom).
sgm_preds = sgm.fit(X_trn, y_trn).predict(X_tst)
print(f"SGM test accuracy: {accuracy_score(y_tst, sgm_preds)}")

With the exception of the Perceptron model, the test scores look similar to the cross-validation scores.

I am wondering about the Perceptron result. From the documentation, it looks like the Perceptron `fit()` method uses stochastic gradient descent when generating the model. Quoting: "Perceptron() is equivalent to SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", penalty=None)." For the SGD model fitting above we are using the default `loss="hinge"`.