In [None]:
# let's set things up
from IPython.core.interactiveshell import InteractiveShell
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Echo the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Reset matplotlib to its default style first ...
plt.style.use('default')
# ... then apply seaborn's defaults; this runs second, so its settings win.
sns.set()
# Show floats with a thousands separator and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# original data source: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt
# The file is read as tab-delimited text, taking column names from its first row.
db_loc = "./data/diabetes.tab.txt"
db_data = pd.read_csv(db_loc, header=0, delimiter="\t")

In [None]:
# Three-column frame (AGE, BMI, S3) built from the loaded data.
# NOTE(review): `boxp` is not referenced by any cell visible in this notebook —
# confirm whether it is still needed.
boxp = pd.DataFrame(db_data, columns=['AGE', 'BMI', 'S3'])

# One vertical violin of Y per SEX value; hue repeats SEX so each violin gets
# its own colour from the palette.
sns.violinplot(
    data=db_data,
    x="SEX",
    y="Y",
    hue='SEX',
    palette="Set3",
    orient='v',
);

In [None]:
# let's look at some variations
# Same Y-by-SEX violins, with quartile lines drawn inside each violin.
ax = sns.violinplot(
    data=db_data,
    x="SEX",
    y="Y",
    inner='quartile',
)
ax.set_title('Distribution of disease progression', fontsize=16);

We can clearly see that the medians are almost equal, as are the first quartiles. The upper quartile is a little higher for sex 2 than for sex 1, implying slightly more dispersed results for sex 2. Overall, the distribution is very similar for both.

Unfortunately, this kind of assessment only works when categorical features are present in the dataset. It would be nice if we had more than one, so let's add another.

In [None]:
# let's add categorical column based on TSH level
def lbl_tsh(row):
    """Return the TSH class label for one record, based on its ``S4`` value.

    Parameters
    ----------
    row : mapping or pandas.Series
        A single record; only ``row['S4']`` is read.

    Returns
    -------
    str or None
        ``"low"`` when S4 < 0.5, ``"ref low"`` when 0.5 <= S4 <= 2.0,
        ``"ref high"`` when 2.0 < S4 <= 4.0, ``"high"`` when 4.0 < S4 <= 6.0,
        and ``"very high"`` above 6.0.  ``None`` when S4 is missing.
    """
    # NOTE(review): this notebook treats S4 as a TSH level. The published
    # description of this diabetes dataset appears to list S4 as "tch"
    # (total cholesterol / HDL ratio) — confirm before reading these classes
    # as thyroid reference ranges.
    level = row['S4']

    # Every comparison against NaN is False, so without this guard a missing
    # value would fall through all branches and be labelled "very high".
    if pd.isna(level):
        return None

    if level < 0.5:
        return "low"
    if level <= 2.0:
        return "ref low"
    if level <= 4.0:
        return "ref high"
    if level <= 6.0:
        return "high"
    return "very high"

# Label every record row by row and store the result in a new 'tsh' column.
db_data['tsh'] = db_data.apply(lbl_tsh, axis="columns")

In [None]:
# Split violins of Y per TSH class, one half per SEX value.
# "low" is not listed in `order`, so rows carrying that label are not drawn.
tsh_order = ['ref low', 'ref high', 'high', 'very high']
ax = sns.violinplot(
    data=db_data,
    x="tsh",
    y="Y",
    hue="SEX",
    split=True,
    order=tsh_order,
)
ax.set_title('Distribution of progression by TSH level', fontsize=16)
ax.legend(loc='lower right');

Okay. The median of disease progression appears to increase with TSH level, but given the overlapping distributions this hardly seems conclusive.

The distributions for the two sexes are similar in the three higher classifications, but the distribution for sex 2 is significantly different for cases with TSH in the bottom half of the reference range. This seems to imply that, for sex 2 cases, TSH in the *ref low* class has no connection with disease progression.

In [None]:
# let's try classifying BMI

def lbl_bmi(row):
    """Return the BMI class label for one record, based on its ``BMI`` value.

    Parameters
    ----------
    row : mapping or pandas.Series
        A single record; only ``row['BMI']`` is read.

    Returns
    -------
    str or None
        ``"underweight"`` when BMI < 18.5, ``"normal"`` when
        18.5 <= BMI < 25.0, ``"overweight"`` when 25.0 <= BMI < 30.0,
        and ``"obese"`` from 30.0 upwards.  ``None`` when BMI is missing.
    """
    bmi = row['BMI']

    # Every comparison against NaN is False, so without this guard a missing
    # value would fall through all branches and be labelled "obese".
    if pd.isna(bmi):
        return None

    if bmi < 18.5:
        return "underweight"
    if bmi < 25.0:
        return "normal"
    if bmi < 30.0:
        return "overweight"
    return "obese"

# Label every record row by row and store the result in a new 'bmi_class' column.
db_data['bmi_class'] = db_data.apply(lbl_bmi, axis="columns")

In [None]:
# Split violins of Y per BMI class, one half per SEX value, lowest class first.
bmi_order = ['underweight', 'normal', 'overweight', 'obese']
ax = sns.violinplot(
    data=db_data,
    x="bmi_class",
    y="Y",
    hue="SEX",
    split=True,
    order=bmi_order,
)
ax.set_title('Distribution of progression by BMI level', fontsize=16)
ax.legend(loc='upper left');

Wow, there are no cases of underweight sex 1 individuals. Again, median disease progression appears to increase in the overweight and obese classes. The median is identical for the two lower classifications (underweight and normal).

There also look to be slightly different distributions for the two sexes in the two higher classes. However, in the three higher classes, it would appear that the peaks (whether uni- or multi-modal) go up with the classification.

That was fun, but I think I will leave it there. Lots more to learn; but, perhaps, a step in the right direction.