In [None]:
from IPython.core.interactiveshell import InteractiveShell
import math
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting 
import seaborn as sns # for plotting
 
# Feature and Model Selection:
import lightgbm as lgb
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier # linear classifiers
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold # train/test splitting tool for cross-validation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc # scoring metrics
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import FeatureUnion, make_pipeline, Pipeline

# My own packages/modules
import rek_ml.dsets_t as dst
import rek_ml.ml_misc as rml

In [None]:
# set up some notebook display defaults
# Show the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inside the notebook output area.
%matplotlib inline
# Start from matplotlib's default style; the seaborn call that follows then
# applies seaborn's own defaults on top, so the order of these two lines matters.
plt.style.use('default')
sns.set()
# Display floats with a thousands separator and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Read the train and test CSVs named in the dsets_t module, then split the
# "Survived" column off each frame so that only the remaining columns are left
# in k_trn / k_tst.
k_trn = pd.read_csv(dst.oma_trn_5, dtype=dst.d_types_5)
y_trn = k_trn.pop("Survived")  # pop removes the column in place and returns it
# _ = k_trn.drop("Cabin", axis=1, inplace=True)

k_tst = pd.read_csv(dst.oma_tst_5, dtype=dst.d_types_5)
y_tst = k_tst.pop("Survived")
# _ = k_tst.drop("Cabin", axis=1, inplace=True)

# Train rows followed by test rows, re-indexed from 0.
k_all = pd.concat([k_trn, k_tst], ignore_index=True)

In [None]:
# let's try the full feature set
# Index the training frame with dst.full_trn_features
# (presumably a list of column labels -- confirm in rek_ml.dsets_t).
X_trn = k_trn[dst.full_trn_features]

In [None]:
# Quick visual check of the first rows of the selected training features.
X_trn.head()

In [None]:
# Call rml.run_classifiers_test once for every column subset in dst.rdc_ds.
# clf_preds maps each subset's name to the list of values the helper returned
# (presumably one score per classifier in rml.classifier_list -- confirm in rek_ml.ml_misc).
clf_preds = {}
for ds_nm, ds_cl in dst.rdc_ds.items():
    # The same column subset is taken from the train and the test frame.
    X_dtrn = k_trn[ds_cl]
    X_dtst = k_tst[ds_cl]
    tmp_pred = rml.run_classifiers_test(
        rml.classifier_list,
        X_dtrn, y_trn,
        X_dtst, y_tst,
    )
    # Keep only the values; their order is whatever order the helper's dict has.
    clf_preds[ds_nm] = [*tmp_pred.values()]
print(clf_preds)

In [None]:
# Hand-entered values for the PCA(12) variant
# (presumably test accuracies copied from a separate run -- TODO confirm provenance).
tst_acc = {
    'PCA(12)': [
        0.7751196172248804, 0.7751196172248804, 0.7559808612440191,
        0.7416267942583732, 0.6961722488038278, 0.7464114832535885,
    ],
}

# Add the PCA(12) row to the collected results; the list object is shared with tst_acc.
clf_preds.update(tst_acc)

# One column per value in each clf_preds row; the labels are hard-coded here, so
# they must line up with the order of the values stored in clf_preds.
df_cols = ['LogRegression', 'SGD', 'GBoosting', 'RandomForest', 'DecisionTree', 'ExtraTrees']
df_preds = pd.DataFrame.from_dict(clf_preds, orient='index', columns=df_cols)

# Switch to five decimal places so that close scores can be told apart.
pd.set_option('display.float_format', '{:,.5f}'.format)
df_preds