In [None]:
# let's set things up
from IPython.core.interactiveshell import InteractiveShell
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# show the result of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"
# render matplotlib figures inside the notebook
%matplotlib inline
# start from matplotlib's default style, then apply seaborn's default theme on top of it
plt.style.use('default')
sns.set()
# display floats with a thousands separator and two decimal places
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# original data source: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt
# local copy of that file, read as tab-separated text with the column names in the first row
db_loc = "./data/diabetes.tab.txt"
db_data = pd.read_csv(
    db_loc,
    sep='\t',
    header=0,
)

In [None]:
# let's add categorical column based on TSH level
def lbl_tsh(row):
    """Return the category label for the row's 'S4' value.

    Bands:
        S4 <  0.5  -> "low"
        S4 <= 2.0  -> "ref low"
        S4 <= 4.0  -> "ref high"
        S4 <= 6.0  -> "high"
        otherwise  -> "very high"

    `row` only needs to support `row['S4']` (a DataFrame row or a plain dict).

    NOTE(review): the notebook treats S4 as a TSH level; published descriptions
    of this diabetes dataset list the fourth serum column as the total
    cholesterol / HDL ratio -- confirm which measurement S4 really is.
    """
    level = row['S4']
    if level < 0.5:
        return "low"
    # upper bounds are inclusive and checked in ascending order
    for upper, label in ((2.0, "ref low"), (4.0, "ref high"), (6.0, "high")):
        if level <= upper:
            return label
    return "very high"

# axis=1 passes each row to lbl_tsh; the returned labels become the new 'tsh' column
db_data['tsh'] = db_data.apply(lbl_tsh, axis=1)

In [None]:
# let's also classify/categorize BMI

def lbl_bmi(row):
    """Return the weight class for the row's 'BMI' value.

    Bands (upper bounds exclusive):
        BMI < 18.5 -> "underweight"
        BMI < 25.0 -> "normal"
        BMI < 30.0 -> "overweight"
        otherwise  -> "obese"

    `row` only needs to support `row['BMI']` (a DataFrame row or a plain dict).
    """
    bmi = row['BMI']
    bands = (
        (18.5, "underweight"),
        (25.0, "normal"),
        (30.0, "overweight"),
    )
    # first band whose upper bound is above the value wins; nothing matching means "obese"
    return next((label for upper, label in bands if bmi < upper), "obese")

# axis=1 passes each row to lbl_bmi; the returned labels become the new 'bmi_class' column
db_data['bmi_class'] = db_data.apply(lbl_bmi, axis=1)

In [None]:
# for reference: the first rows of the frame, including the added 'tsh' and 'bmi_class' columns
db_data.head()

In [None]:
# let's start with something simple
# I am going to show both regplot and lmplot; since regplot's features are a subset of those of lmplot,
# I will use lmplot from here on (and possibly other Seaborn plots)
# Note the trailing semicolons: they keep the plot functions' return values out of the cell output
sns.regplot(x='BMI', y="Y", data=db_data);
sns.lmplot(x='BMI', y="Y", data=db_data);

Other than the shape of the plot area, you will note that they are identical.

Now what about our crazily distributed S4 attribute?

In [None]:
# note the use of jitter, try it without to see what happens
# (per the seaborn docs, x_jitter noise is added after the fit and only changes how the scatter looks)
sns.lmplot(x='S4', y="Y", data=db_data, x_jitter=.075);

In [None]:
# before bringing in an estimator, let's first display lmplot split by sex for comparison
# (one column per SEX value, coloured by SEX, jittered on x)
sns.lmplot(x='S4', y="Y", data=db_data, col='SEX', hue='SEX', x_jitter=.075);

In [None]:
# now let's split by sex and use an estimator: np.mean is applied to Y at each unique S4 value
sns.lmplot(x='S4', y="Y", data=db_data, x_estimator=np.mean, col='SEX', hue='SEX');

In the last plot, you can see where some of the data was collapsed into a mean along with a confidence interval.

In this case a goodly number of means are along the fitted regression line. Not quite sure how to interpret that information, but it likely means something. Especially at higher values.

I did look at trying to reduce the extra points, but didn't like any of the results. I don't know enough to understand the consequences of the parameters I tried (*x_bins* and *x_ci*). But here's a look using *x_bins*, first with a bin count and then with explicit bin centres.

In [None]:
# just so we can see something different, let's try 40 bins for each
# (an integer x_bins asks seaborn to group S4 into that many bins before applying the estimator)
sns.lmplot(x='S4', y="Y", data=db_data, x_estimator=np.mean, col='SEX', hue='SEX', x_bins=40);

In [None]:
# now let's try explicit bin centres instead of a bin count:
# 16 centres from 2.0 to 9.5 in steps of 0.5 (a list passed to x_bins is read as bin-centre positions)
b_cntr = [2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5]
sns.lmplot(x='S4', y="Y", data=db_data, x_estimator=np.mean, col='SEX', hue='SEX', x_bins=b_cntr);

Well, that's better. Though still not sure exactly what it is telling us.

In [None]:
# let's draw lmplot for each continuous variable with respect to sex
# I'll do this in batches of three column names for easier viewing;
# the plotting cells further down iterate over these lists
batch1 = ['AGE', 'BMI', 'BP']
batch2 = ['S1', 'S2', 'S3']
batch3 = ['S4', 'S5', 'S6']

In [None]:
# couldn't get it to work with a facetgrid or with matplotlib subplots,
# so each variable gets its own lmplot figure with one panel per SEX value
for var in batch1:
 sns.lmplot(data=db_data, x=var, y='Y', col='SEX', hue='SEX', palette="Set2", height=4, aspect=1.3);

In [None]:
# one lmplot figure per variable in the second batch, one panel per SEX value
for var in batch2:
 sns.lmplot(data=db_data, x=var, y='Y', col='SEX', hue='SEX', palette="Set2", height=4, aspect=1.3);

In [None]:
# one lmplot figure per variable in the third batch, one panel per SEX value
for var in batch3:
 sns.lmplot(data=db_data, x=var, y='Y', col='SEX', hue='SEX', palette="Set2", height=4, aspect=1.3);

In [None]:
# took a bit of research, but sorted something:
# one figure for the whole batch, one row per variable in batch1, one column per sex

# Set the label/title sizes BEFORE creating the axes: an axis label takes its
# font size from 'axes.labelsize' when the Axes is created, so updating
# rcParams afterwards is too late for this figure. The update is global on
# purpose, so later figures pick up the same sizes.
parameters = {'axes.labelsize': 18,
              'axes.titlesize': 22}
plt.rcParams.update(parameters)

# squeeze=False keeps axs two-dimensional even if the batch holds a single variable
fig, axs = plt.subplots(len(batch1), 2, figsize=(24,20), sharey=True, squeeze=False)

# Y split by sex; kept as notebook-level names because later cells reuse them
y_s1 = db_data.loc[db_data['SEX'] == 1, "Y"]
y_s2 = db_data.loc[db_data['SEX'] == 2, "Y"]

for i, var in enumerate(batch1):
    # left column: SEX == 1 in green, right column: SEX == 2 in orange
    sns.regplot(x=db_data.loc[db_data['SEX'] == 1, var], y=y_s1, color='g', ax=axs[i,0])
    sns.regplot(x=db_data.loc[db_data['SEX'] == 2, var], y=y_s2, color='orange', ax=axs[i,1])
    # horizontal "Y" label on the left-hand panel of every row
    axs[i,0].set_ylabel("Y ", rotation="horizontal")

# column headers go on the top row only
axs[0,0].set_title('SEX = 1');
axs[0,1].set_title('SEX = 2');

In [None]:

# regplot grid for the second batch: one row per variable, one column per sex
# The split by sex is done right here so the cell does not depend on names
# left behind by an earlier plotting cell.
is_s1 = db_data['SEX'] == 1
is_s2 = db_data['SEX'] == 2

# squeeze=False keeps axs1 two-dimensional even if the batch holds a single variable
fig1, axs1 = plt.subplots(len(batch2), 2, figsize=(24,20), sharey=True, squeeze=False)

for i, var in enumerate(batch2):
    # left column: SEX == 1 in green, right column: SEX == 2 in orange
    sns.regplot(x=db_data.loc[is_s1, var], y=db_data.loc[is_s1, "Y"], color='g', ax=axs1[i,0])
    sns.regplot(x=db_data.loc[is_s2, var], y=db_data.loc[is_s2, "Y"], color='orange', ax=axs1[i,1])
    # label inside the loop so every row is covered whatever the batch length
    axs1[i,0].set_ylabel("Y ", rotation="horizontal")

# column headers go on the top row only
axs1[0,0].set_title('SEX = 1');
axs1[0,1].set_title('SEX = 2');

In [None]:
# regplot grid for the third batch: one row per variable, one column per sex
# The split by sex is done right here so the cell does not depend on names
# left behind by an earlier plotting cell.
is_s1 = db_data['SEX'] == 1
is_s2 = db_data['SEX'] == 2

# rows are sized from batch3, the list actually plotted below;
# squeeze=False keeps axs2 two-dimensional even if the batch holds a single variable
fig2, axs2 = plt.subplots(len(batch3), 2, figsize=(24,20), sharey=True, squeeze=False)

for i, var in enumerate(batch3):
    # left column: SEX == 1 in green, right column: SEX == 2 in orange
    sns.regplot(x=db_data.loc[is_s1, var], y=db_data.loc[is_s1, "Y"], color='g', ax=axs2[i,0])
    sns.regplot(x=db_data.loc[is_s2, var], y=db_data.loc[is_s2, "Y"], color='orange', ax=axs2[i,1])
    axs2[i,0].set_ylabel("Y ", rotation="horizontal")

# column headers go on the top row only
axs2[0,0].set_title('SEX = 1');
axs2[0,1].set_title('SEX = 2');


Well those plots all look a lot alike to me. Not sure I am getting a lot of info from them.

But let's try something else. Mostly as an example of what can be done.

In [None]:
# re-apply the seaborn theme with a larger font scale for this grid
sns.set(font_scale=1.4)
# BP vs Y: one column per BMI class in the order given by col_order, one row (and colour) per SEX value;
# margin_titles is passed through to the underlying facet grid
g = sns.lmplot(data=db_data, x="BP", y="Y", col="bmi_class", col_order=['underweight', 'normal', 'overweight', 'obese'], row="SEX", hue="SEX", facet_kws={'margin_titles':True})

In [None]:
# let's try that using a facetgrid and regplot()
# back to font scale 1 for this grid
sns.set(font_scale=1)
# one column per 'tsh' label, one row (and colour) per SEX value
# NOTE(review): col_order leaves out the "low" label that lbl_tsh can produce --
# presumably no rows fall in that band; confirm
g = sns.FacetGrid(db_data, col="tsh", col_order=['ref low', 'ref high', 'high', 'very high'], row="SEX", hue="SEX", margin_titles=True)
# draw a BMI-vs-Y regplot in every facet
g.map_dataframe(sns.regplot, x="BMI", y="Y");

Find some of those charts interesting, but I really don't know what they are trying to tell me.

So, I think I am going to leave it here for now.

Started *Machine Learning with Python-From Linear Models to Deep Learning* (MITx) last week. The math is driving me nuts and taking a lot of time. Have to admit I was not really well focused on the subject at hand in this notebook. Sorry.

Am concerned I'll be missing a post or two as a result.