In [None]:
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting 
import seaborn as sns # for plotting
from sklearn import datasets
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import model_selection as ms
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
# set up some notebook display defaults
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
plt.style.use('default')
sns.set()
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Load the breast-cancer dataset as pandas objects.
# Dataset description:
# https://scikit-learn.org/stable/datasets/index.html#breast-cancer-dataset
c_data = datasets.load_breast_cancer(as_frame=True)
X, y = c_data.data, c_data.target  # X: 30 attributes; y: labels (malignant = 0, benign = 1)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
X_trn, X_tst, y_trn, y_tst = ms.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only,
# and that same fitted transform is then applied to both splits so no
# test-set statistics enter the preprocessing.
std_scl = preprocessing.StandardScaler().fit(X_trn)
X_trn_std = std_scl.transform(X_trn)
X_tst_std = std_scl.transform(X_tst)

In [None]:
# One 5-fold cross-validation pass with an SVC at C=0.5, to get a feel for
# the cost of a single run before sweeping over many C values.
svm = SVC(C=0.5, random_state=0)
svm_scr = ms.cross_val_score(
    estimator=svm,
    X=X_trn_std,
    y=y_trn,
    scoring="accuracy",
    cv=5,
)
print(svm_scr)

In [None]:
# First hyperparameter to tune for svm.SVC: the regularisation parameter C.
# Build five candidates per decade for four decades:
#   0.001..0.009, 0.01..0.09, 0.1..0.9 and 1..9.
reg_cs_1, reg_cs_2, reg_cs_3, reg_cs_4 = (
    [i / 1000 for i in range(start, 10 * start, 2 * start)]
    for start in (1, 10, 100, 1000)
)
reg_cs = np.concatenate([reg_cs_1, reg_cs_2, reg_cs_3, reg_cs_4])

# Map str(C) -> array of the five per-fold accuracies.
cv_scores_1 = {}
for cr in reg_cs:
    svm = SVC(C=cr, random_state=0)
    svm_scr = ms.cross_val_score(
        svm, X_trn_std, y_trn, scoring="accuracy", cv=5
    )
    cv_scores_1[str(cr)] = svm_scr

In [None]:
# One line per C value: the per-fold accuracies (rounded to 5 places)
# followed by their unrounded mean.
for c_val, fold_scores in cv_scores_1.items():
    rounded = [round(s, 5) for s in fold_scores]
    print(f"{c_val}: {rounded} -> {fold_scores.mean()}")

In [None]:
# Accuracy looked like it was still improving at the top of the first grid,
# so extend it with C = 10, 12, ..., 18 and rerun the whole sweep.
reg_cs_5 = list(range(10, 20, 2))
reg_cs_2 = np.concatenate([reg_cs, reg_cs_5])

# Map str(C) -> array of the five per-fold accuracies for the extended grid.
cv_scores_2 = {}
for cr in reg_cs_2:
    svm = SVC(C=cr, random_state=0)
    svm_scr = ms.cross_val_score(
        svm, X_trn_std, y_trn, scoring="accuracy", cv=5
    )
    cv_scores_2[str(cr)] = svm_scr

In [None]:
# Report the extended sweep: per-fold accuracies (rounded to 5 places)
# and their unrounded mean for every C value.
for c_val in cv_scores_2:
    fold_scores = cv_scores_2[c_val]
    print(f"{c_val}: {[round(s, 5) for s in fold_scores]} -> {fold_scores.mean()}")

In [None]:
# Mean CV accuracy per C value. cv_scores_2 was filled by iterating over
# reg_cs_2 and dicts preserve insertion order, so sv_means lines up with
# reg_cs_2 element for element.
sv_means = [scrs.mean() for scrs in cv_scores_2.values()]

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.plot(reg_cs_2, sv_means, color='k', linestyle='-')
ax.scatter(reg_cs_2, sv_means, color='blue')
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel("C (SVC regularisation parameter)")
ax.set_ylabel("mean 5-fold CV accuracy")
ax.set_title("SVC: cross-validated accuracy vs. C")
plt.show()

In [None]:
# Cross-validation points at 7 <= C <= 9 as the best region, while C=1 is the
# library default. Fit one SVC per candidate on the training split and
# record its accuracy on the held-out test split, keyed by str(C).
c_tst = [1.0, 7.0]
ct_scr = {}
for ct in c_tst:
    svm = SVC(C=ct, random_state=0)
    t_f = svm.fit(X_trn_std, y_trn)  # fit() returns the fitted estimator itself
    t_preds = t_f.predict(X_tst_std)
    ct_scr[str(ct)] = accuracy_score(y_true=y_tst, y_pred=t_preds)

In [None]:
# Test-set accuracy for each C candidate.
for c_val in ct_scr:
    print(f"{c_val}: {ct_scr[c_val]}")

It looks like our tuning attempt failed, so more research is needed. It is also possible that our particular train/test split is part of the problem.

Okay, let's do the same for `SGDClassifier`. We will start with the regularization parameter, `alpha`.

In [None]:
# Sweep SGDClassifier's regularisation strength `alpha` with 5-fold CV.
# The grid is 1e-1, 1e-2, ..., 1e-6. (An earlier hand-written list spanning
# 1e-4..1e3 was overwritten before use, so it has been removed to avoid
# suggesting that those values were ever evaluated.)
reg_alphas = 10.0**-np.arange(1,7)

# Map str(alpha) -> array of the five per-fold accuracies.
cv_scores_3 = {}
for alpha in reg_alphas:
    sgd = linear_model.SGDClassifier(alpha=alpha, max_iter=1000, random_state=0)
    sgd_scr = ms.cross_val_score(sgd, X_trn_std, y_trn, cv=5, scoring="accuracy")
    cv_scores_3[str(alpha)] = sgd_scr

In [None]:
# One line per alpha: per-fold accuracies (rounded to 5 places) and their
# unrounded mean.
for alpha_lbl, fold_scores in cv_scores_3.items():
    rounded = [round(s, 5) for s in fold_scores]
    print(f"{alpha_lbl}: {rounded} -> {fold_scores.mean()}")

In [None]:
# Mean CV accuracy per alpha. cv_scores_3 was filled by iterating over
# reg_alphas and dicts preserve insertion order, so gd_means lines up with
# reg_alphas element for element.
gd_means = [scrs.mean() for scrs in cv_scores_3.values()]

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.plot(reg_alphas, gd_means, color='k', linestyle='-')
ax.scatter(reg_alphas, gd_means, color='blue')
# The alphas are powers of ten (1e-6 .. 1e-1); on a linear axis all but the
# largest collapse onto zero, so use a log-scaled x-axis.
ax.set_xscale("log")
ax.set_xlabel("alpha (SGDClassifier regularisation strength)")
ax.set_ylabel("mean 5-fold CV accuracy")
ax.set_title("SGDClassifier: cross-validated accuracy vs. alpha")
plt.show()

Ok, let's fit the model on the training dataset and measure its performance against the test dataset.

In [None]:
# Train an SGDClassifier with alpha=0.010 on the training split and report
# its accuracy on the held-out test split.
sgd = linear_model.SGDClassifier(alpha=0.010, max_iter=1000, random_state=0)
sg_f = sgd.fit(X_trn_std, y_trn)  # fit() returns the fitted estimator itself
sg_preds = sg_f.predict(X_tst_std)
sgd_acc = accuracy_score(y_true=y_tst, y_pred=sg_preds)
print(f'SGD, alpha=0.010 -> {sgd_acc}')

That score is pretty much the same as the **SVM** model with `C=1`.

In [None]:
# Let one of sklearn's tuning helpers pick alpha for us: RandomizedSearchCV
# draws n_iter candidate values from a distribution and cross-validates each.
from scipy.stats import uniform as sp_rand
from sklearn.model_selection import RandomizedSearchCV

# sp_rand() with no arguments is the uniform distribution on [0, 1].
param_grid = {'alpha': sp_rand()}
sgd = linear_model.SGDClassifier(max_iter=1000, random_state=0)

rsearch = RandomizedSearchCV(
    estimator=sgd,
    param_distributions=param_grid,
    n_iter=100,
    scoring="accuracy",  # same metric and fold count as the manual sweeps
    cv=5,
    random_state=0,      # makes the sampled alphas reproducible across runs
)
# Tune on the TRAINING split only. Searching on the test split would leak
# test information into the chosen alpha and make any later test-set score
# optimistic.
rsearch.fit(X_trn_std, y_trn)

# Keep the full results table in rs_tbl, but only show the ten best-ranked
# candidates rather than dumping all 100 rows.
rs_tbl = pd.DataFrame(rsearch.cv_results_)
display(rs_tbl.sort_values("rank_test_score").head(10))

# Summarize the results of the random parameter search: best mean CV
# accuracy and the alpha that achieved it.
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

In [None]:
# Test the alpha suggested by the random search: train a fresh SGDClassifier
# with it on the training split and score it on the held-out test split.
best_alpha = rsearch.best_estimator_.alpha
sgd_sr = linear_model.SGDClassifier(alpha=best_alpha, max_iter=1000, random_state=0)
# Fit and predict with sgd_sr (the model carrying the tuned alpha), and score
# its own predictions. Previously the untuned `sgd` was fitted and the stale
# `sg_preds` from the alpha=0.010 model were scored, so the printed accuracy
# never reflected the tuned alpha.
sgsr_f = sgd_sr.fit(X_trn_std, y_trn)
sgsr_preds = sgd_sr.predict(X_tst_std)
print(f'SGD, alpha={best_alpha:.4f} -> {accuracy_score(y_tst, sgsr_preds)}')

Pretty much in line with the number we got using regular cross-validation on a selection of potential values.

`sklearn.model_selection.RandomizedSearchCV` may have taken a little longer to run, but it looks to have tested a lot more values.