import pandas as pd # for reading and maniuplating dataframes
import numpy as np #  general math calulations
import matplotlib.pyplot as plt # for visualization
import seaborn as sns # visualizations, some analysis
from scipy import stats

''' XGBoost provides the basis for our model.
    We use an XGBClassifier for the classification model.
    plot_importance is a quick way to plot feature importances.
'''
from xgboost import XGBClassifier, plot_importance

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import SelectFromModel #For testing the removal of low importance features
from sklearn.model_selection import KFold,  cross_val_score # for validation of the model

import phik # used for calulating the correlation matrix of the binary columns

# Read the chart-review data sheet into a pandas DataFrame.
chart_review_df = pd.read_csv('DY_chart_review.csv')

# Report the table dimensions, then peek at five randomly sampled rows.
n_records, n_columns = chart_review_df.shape
print("Records: {} Columns: {}".format(n_records, n_columns))
chart_review_df.sample(5)

Records: 1773 Columns: 84

# Summarise every column: dtype and number of non-null entries.
chart_review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1773 entries, 0 to 1772
Data columns (total 84 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   patientID              1773 non-null   int64  
 1   age_at_rf_r1_dx        1242 non-null   float64
 2   patient_gender         1773 non-null   object 
 3   current_severity       1766 non-null   object 
 4   severity_at_DY_dx      1747 non-null   object 
 5   age_current            1773 non-null   int64  
 6   DY_dx_age              1773 non-null   int64  
 7   has_had_surgery        1773 non-null   int64  
 8   rf_r1_present          1773 non-null   int64  
 9   rf_r2_present          1773 non-null   int64  
 10  rf_r3_present          1773 non-null   int64  
 11  rf _r4_present         1773 non-null   int64  
 12  rf_r5_present          1773 non-null   int64  
 13  rf_r6_present          127 non-null    float64
 14  had_dx1_at_DY_dx       1773 non-null   int64  
 15  had_dx2_at_DY_dx       1773 non-null   int64  
 16  had_dx3_at_DY_dx       1773 non-null   int64  
 17  had_dx4_at_DY_dx       1773 non-null   int64  
 18  had_dx5_at_DY_dx       1773 non-null   int64  
 19  had_dx6_at_DY_dx       1773 non-null   int64  
 20  had_dx7_at_DY_dx       1773 non-null   int64  
 21  had_dx8_at_DY_dx       1773 non-null   int64  
 22  had_dx9_at_DY_dx       1773 non-null   int64  
 23  had_dx10_at_DY_dx      1773 non-null   int64  
 24  had_dx11_at_DY_dx      1773 non-null   int64  
 25  had_dx12_at_DY_dx      1773 non-null   int64  
 26  had_dx13_at_DY_dx      1773 non-null   int64  
 27  had_dx14_at_DY_dx      1773 non-null   int64  
 28  had_dx15_at_DY_dx      1773 non-null   int64  
 29  had_dx16_at_DY_dx      1773 non-null   int64  
 30  had_dx17_at_DY_dx      1773 non-null   int64  
 31  had_dx18_at_DY_dx      1773 non-null   int64  
 32  had_dx19_at_DY_dx      1773 non-null   int64  
 33  had_dx20_at_DY_dx      1773 non-null   int64  
 34  had_dx21_at_DY_dx      1773 non-null   int64  
 35  currently_has_dx1      1773 non-null   int64  
 36  currently_has_dx2      1773 non-null   int64  
 37  currently_has_dx3      1773 non-null   int64  
 38  currently_has_dx4      1773 non-null   int64  
 39  currently_has_dx5      1773 non-null   int64  
 40  currently_has_dx6      1773 non-null   int64  
 41  currently_has_dx7      1773 non-null   int64  
 42  currently_has_dx8      1773 non-null   int64  
 43  currently_has_dx9      1773 non-null   int64  
 44  currently_has_dx10     1773 non-null   int64  
 45  currently_has_dx11     1773 non-null   int64  
 46  currently_has_dx12     1773 non-null   int64  
 47  currently_has_dx13     1773 non-null   int64  
 48  currently_has_dx14     1773 non-null   int64  
 49  currently_has_dx15     1773 non-null   int64  
 50  currently_has_dx16     1773 non-null   int64  
 51  currently_has_dx17     1773 non-null   int64  
 52  currently_has_dx18     1773 non-null   int64  
 53  currently_has_dx19     1773 non-null   int64  
 54  currently_has_dx20     1773 non-null   int64  
 55  currently_has_dx21     1773 non-null   int64  
 56  has_ever_had_dx22      1773 non-null   int64  
 57  has_ever_had_dx23      1773 non-null   int64  
 58  px_count_for_DY_tried  1773 non-null   int64  
 59  currently_using_px1    1773 non-null   int64  
 60  currently_using_px2    1773 non-null   int64  
 61  currently_using_px3    1773 non-null   int64  
 62  currently_using_px4    1773 non-null   int64  
 63  currently_using_px5    1773 non-null   int64  
 64  currently_using_px6    1773 non-null   int64  
 65  currently_using_px7    1773 non-null   int64  
 66  currently_using_px8    1773 non-null   int64  
 67  currently_using_px9    1773 non-null   int64  
 68  currently_using_px10   1773 non-null   int64  
 69  currently_using_px11   1773 non-null   int64  
 70  currently_using_px12   1773 non-null   int64  
 71  previously_tried_px1   1178 non-null   float64
 72  previously_tried_px2   1178 non-null   float64
 73  previously_tried_px3   1178 non-null   float64
 74  previously_tried_px4   1178 non-null   float64
 75  previously_tried_px5   1178 non-null   float64
 76  previously_tried_px6   1178 non-null   float64
 77  previously_tried_px7   1178 non-null   float64
 78  previously_tried_px8   1178 non-null   float64
 79  previously_tried_px9   1178 non-null   float64
 80  previously_tried_px10  1178 non-null   float64
 81  previously_tried_px11  1178 non-null   float64
 82  previously_tried_px12  1178 non-null   float64
 83  recommend_Product_X    1773 non-null   int64  
dtypes: float64(14), int64(67), object(3)
memory usage: 1.1+ MB

# One header contains a stray space ("rf _r4_present"); normalise it so it
# follows the same pattern as the other rf_r*_present columns.
column_fixes = {'rf _r4_present': 'rf_r4_present'}
chart_review_df.rename(columns=column_fixes, inplace=True)

# patientID is assumed to be a unique identifier; count its distinct values to confirm.
# A result equal to the number of records means no patient appears on more than one row.
chart_review_df['patientID'].nunique()

1773

# Check how balanced the dataset is with respect to the target variable
# "recommend_Product_X": print the class counts and draw them as a bar chart.
target_counts = chart_review_df['recommend_Product_X'].value_counts()
print(target_counts)

sns.catplot(data=chart_review_df, x='recommend_Product_X', kind="count")

0    1035
1     738
Name: recommend_Product_X, dtype: int64

<seaborn.axisgrid.FacetGrid at 0x151a1ad163a0>

# Most columns look binary (0, 1, or null); list the column names so the candidates can be checked.

chart_review_df.columns

Index(['patientID', 'age_at_rf_r1_dx', 'patient_gender', 'current_severity',
       'severity_at_DY_dx', 'age_current', 'DY_dx_age', 'has_had_surgery',
       'rf_r1_present', 'rf_r2_present', 'rf_r3_present', 'rf_r4_present',
       'rf_r5_present', 'rf_r6_present', 'had_dx1_at_DY_dx',
       'had_dx2_at_DY_dx', 'had_dx3_at_DY_dx', 'had_dx4_at_DY_dx',
       'had_dx5_at_DY_dx', 'had_dx6_at_DY_dx', 'had_dx7_at_DY_dx',
       'had_dx8_at_DY_dx', 'had_dx9_at_DY_dx', 'had_dx10_at_DY_dx',
       'had_dx11_at_DY_dx', 'had_dx12_at_DY_dx', 'had_dx13_at_DY_dx',
       'had_dx14_at_DY_dx', 'had_dx15_at_DY_dx', 'had_dx16_at_DY_dx',
       'had_dx17_at_DY_dx', 'had_dx18_at_DY_dx', 'had_dx19_at_DY_dx',
       'had_dx20_at_DY_dx', 'had_dx21_at_DY_dx', 'currently_has_dx1',
       'currently_has_dx2', 'currently_has_dx3', 'currently_has_dx4',
       'currently_has_dx5', 'currently_has_dx6', 'currently_has_dx7',
       'currently_has_dx8', 'currently_has_dx9', 'currently_has_dx10',
       'currently_has_dx11', 'currently_has_dx12', 'currently_has_dx13',
       'currently_has_dx14', 'currently_has_dx15', 'currently_has_dx16',
       'currently_has_dx17', 'currently_has_dx18', 'currently_has_dx19',
       'currently_has_dx20', 'currently_has_dx21', 'has_ever_had_dx22',
       'has_ever_had_dx23', 'px_count_for_DY_tried', 'currently_using_px1',
       'currently_using_px2', 'currently_using_px3', 'currently_using_px4',
       'currently_using_px5', 'currently_using_px6', 'currently_using_px7',
       'currently_using_px8', 'currently_using_px9', 'currently_using_px10',
       'currently_using_px11', 'currently_using_px12', 'previously_tried_px1',
       'previously_tried_px2', 'previously_tried_px3', 'previously_tried_px4',
       'previously_tried_px5', 'previously_tried_px6', 'previously_tried_px7',
       'previously_tried_px8', 'previously_tried_px9', 'previously_tried_px10',
       'previously_tried_px11', 'previously_tried_px12',
       'recommend_Product_X'],
      dtype='object')

# Columns that appear to be binary flags, listed in the same order as they
# occur in the data sheet. 'px_count_for_DY_tried' is included even though it
# is a count (and so probably not binary) so that the check confirms it.
check_binary_cols = (
    ['has_had_surgery']
    + ['rf_r{}_present'.format(i) for i in range(1, 7)]
    + ['had_dx{}_at_DY_dx'.format(i) for i in range(1, 22)]
    + ['currently_has_dx{}'.format(i) for i in range(1, 22)]
    + ['has_ever_had_dx22', 'has_ever_had_dx23', 'px_count_for_DY_tried']
    + ['currently_using_px{}'.format(i) for i in range(1, 13)]
    + ['previously_tried_px{}'.format(i) for i in range(1, 13)]
    + ['recommend_Product_X']
)



for col in check_binary_cols:
    # Distinct values present in the column, with missing entries ignored.
    colvals = chart_review_df[col].dropna().unique()
    # Columns holding exactly the values {0, 1} are binary; report the rest.
    if set(colvals) == {0, 1}:
        continue
    print(col, colvals)

rf_r5_present [1 3 2 4 0]
px_count_for_DY_tried [2 1 4 3 5 0]

# If rf_r5_present were almost entirely 0/1 with only a handful of 2, 3 or 4 values we could
# suspect data-entry errors; count each value to see whether that is the case.
chart_review_df['rf_r5_present'].value_counts()

1    857
2    577
3    183
4    129
0     27
Name: rf_r5_present, dtype: int64

# Cross-tabulate the number of products tried against the target.
# px_count_for_DY_tried alone does not look like a strong indicator: most
# count values show roughly the same 40/60 split as the full sample. The
# exception is a count of 4, where target = 1 is more common, but that
# subsample is small.
chart_review_df.groupby(['px_count_for_DY_tried', 'recommend_Product_X'])['patientID'].count()

px_count_for_DY_tried  recommend_Product_X
0                      0                        9
                       1                        5
1                      0                      390
                       1                      214
2                      0                      372
                       1                      279
3                      0                      204
                       1                      174
4                      0                       32
                       1                       40
5                      0                       28
                       1                       26
Name: patientID, dtype: int64

# Distribution of age at rf_r1 diagnosis, coloured by the target class.
plt.figure(figsize=(6, 4), dpi=150)
sns.histplot(data=chart_review_df, x='age_at_rf_r1_dx', hue='recommend_Product_X')

<AxesSubplot:xlabel='age_at_rf_r1_dx', ylabel='Count'>

# These are pretty much only used for this quick look into the correlation between rf_r6_present and recommend_Product_X
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Restrict to the rows where rf_r6_present was actually recorded.
rf6 = chart_review_df[chart_review_df['rf_r6_present'].notnull()]

# Linear correlation (coefficient, p-value) between the risk factor and the target.
print(stats.pearsonr(rf6['rf_r6_present'], rf6['recommend_Product_X']))

# Treat the risk factor as if it were a "prediction" of the target.
# NOTE: rf_r6_present is passed in the y_true position and the target in the
# y_pred position, so the rows / "support" figures refer to rf_r6_present values.
cm = confusion_matrix(rf6['rf_r6_present'], rf6['recommend_Product_X'])
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

print(classification_report(rf6['rf_r6_present'], rf6['recommend_Product_X']))

(0.576401023357887, 1.3308157521498737e-12)

              precision    recall  f1-score   support

         0.0       0.70      0.54      0.61        13
         1.0       0.95      0.97      0.96       114

    accuracy                           0.93       127
   macro avg       0.82      0.76      0.78       127
weighted avg       0.92      0.93      0.92       127

# Target class balance within the subset of rows that have rf_r6_present recorded.
rf6['recommend_Product_X'].value_counts()

1    117
0     10
Name: recommend_Product_X, dtype: int64

# Although the sample is very small, using this single feature as the
# prediction already agrees with the target most of the time (~93%).
accuracy = accuracy_score(rf6['rf_r6_present'], rf6['recommend_Product_X'])
print("Accuracy: {:.2f}%".format(100.0 * accuracy))

Accuracy: 92.91%

# Compare three age distributions between the whole dataset and the subset of
# rows with rf_r6_present recorded. The panels share one y axis.
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(8, 4), dpi=150)

# Appearance shared by every histogram in the figure.
hist_style = dict(density=True, edgecolor='grey', alpha=0.5)

# Panel 0 carries the legend and sets the (shared) y range.
ax[0].hist(chart_review_df['age_at_rf_r1_dx'], label='Total Dataset', **hist_style)
ax[0].hist(rf6['age_at_rf_r1_dx'], label='rf6 not null', **hist_style)
ax[0].legend(handlelength=1, fontsize=8, loc='upper left')
ax[0].set_ylim(0, 0.04)
ax[0].set_xlabel('age_at_rf_r1_dx')

ax[1].hist(chart_review_df['DY_dx_age'], **hist_style)
ax[1].hist(rf6['DY_dx_age'], **hist_style)
ax[1].set_xlabel('DY_dx_age')

ax[2].hist(chart_review_df['age_current'], **hist_style)
ax[2].hist(rf6['age_current'], **hist_style)
ax[2].set_xlabel('age_current')

Text(0.5, 0, 'age_current')

# Age at DY diagnosis split by the rf_r6_present value. No apparent difference
# between the two groups, though they are very imbalanced in size.
plt.figure(figsize=(3.5, 3), dpi=150)

rf6_absent = rf6[rf6['rf_r6_present'] == 0]
rf6_present = rf6[rf6['rf_r6_present'] == 1]

plt.hist(rf6_absent['DY_dx_age'], density=True, edgecolor='grey', alpha=0.5, label='rf_r6_present = 0')
plt.hist(rf6_present['DY_dx_age'], density=True, edgecolor='grey', alpha=0.5, label='rf_r6_present = 1')
plt.legend()
plt.xlabel('DY_dx_age')

Text(0.5, 0, 'DY_dx_age')

# Engineered features: row-wise totals over each family of indicator columns
# (missing values are skipped), plus the time elapsed since the DY diagnosis.

# Risk factors 1-6.
# NOTE(review): rf_r5_present takes values 0-4 rather than 0/1, so this total
# is not a pure count of risk factors - confirm that is intended.
rf_cols = ['rf_r{}_present'.format(i) for i in range(1, 7)]
chart_review_df['count_rf_present'] = chart_review_df.loc[:, rf_cols].sum(axis=1, skipna=True)

# Diagnoses 1-21 present at the time of the DY diagnosis.
dx_cols = ['had_dx{}_at_DY_dx'.format(i) for i in range(1, 22)]
chart_review_df['count_dx_at_DY_dx'] = chart_review_df.loc[:, dx_cols].sum(axis=1, skipna=True)

# Current diagnoses 1-21 together with the two "has ever had" flags.
dx_cols_current = ['currently_has_dx{}'.format(i) for i in range(1, 22)] + ['has_ever_had_dx22', 'has_ever_had_dx23']
chart_review_df['count_dx_current'] = chart_review_df.loc[:, dx_cols_current].sum(axis=1, skipna=True)

# Difference between current age and age at DY diagnosis.
chart_review_df['time_since_DY_dx'] = chart_review_df['age_current'].sub(chart_review_df['DY_dx_age'])

from sklearn.cluster import AgglomerativeClustering # this is included for clustering of binary data

#NOTE: My version of scikit-learn uses the deprecated 'affinity' argument, which is later renamed to 'metric'

# Derive three 2-cluster features from families of indicator columns, using
# complete-linkage agglomerative clustering on the Hamming distance:
#   rf1_5_cluster    - the first 5 risk factors
#   dx_at_DY_cluster - the diagnoses present at DY diagnosis
#   dx_cluster       - the current diagnoses (plus the two "has ever had" flags)
#
# Each clustering is fitted on the rows that are complete for its columns, and
# the labels are written back through those rows' index. If a row is ever
# dropped for missing values it receives NaN, instead of the labels silently
# shifting against the frame or the assignment failing on a length mismatch.
# With fully populated columns the result is the same integer label column.
#
# `affinity` is kept for compatibility with the scikit-learn version in use;
# newer releases call this argument `metric`.
dx_cols = ['had_dx{}_at_DY_dx'.format(i) for i in range(1, 22)]
dx_cols_current = ['currently_has_dx{}'.format(i) for i in range(1, 22)] + ['has_ever_had_dx22', 'has_ever_had_dx23']

cluster_specs = [
    ('rf1_5_cluster', ['rf_r1_present', 'rf_r2_present', 'rf_r3_present', 'rf_r4_present',
                       'rf_r5_present']),
    ('dx_at_DY_cluster', dx_cols),
    ('dx_cluster', dx_cols_current),
]

for cluster_name, cluster_cols in cluster_specs:
    # Only rows with every clustering column present can be clustered.
    complete_rows = chart_review_df[cluster_cols].dropna()
    clustering = AgglomerativeClustering(n_clusters=2, affinity='hamming', linkage='complete').fit(np.asarray(complete_rows))
    # Index-aligned assignment keeps each label attached to its own row.
    chart_review_df[cluster_name] = pd.Series(clustering.labels_, index=complete_rows.index)

# Final list of binary features: the checked columns plus the three cluster
# labels, minus the two non-binary columns and the target itself.
new_binary_cols = check_binary_cols + ['rf1_5_cluster', 'dx_cluster', 'dx_at_DY_cluster']
remove_cols = ['px_count_for_DY_tried', 'rf_r5_present', 'recommend_Product_X']
binary_cols = [name for name in new_binary_cols if name not in remove_cols]


# Score every binary feature as if it were, on its own, the prediction of the
# target, then plot the distribution of the resulting accuracies.
accuracy_vals = []

for col in binary_cols:
    if col == 'recommend_Product_X':
        continue
    # Only rows where this feature is recorded can be scored.
    col_df = chart_review_df[chart_review_df[col].notnull()]
    accuracy = accuracy_score(col_df[col], col_df['recommend_Product_X'])
    print("{} Accuracy:{:.2f}".format(col, accuracy * 100.0))

    accuracy_vals.append(accuracy * 100)

plt.figure(figsize=(4, 3), dpi=180)
plt.hist(accuracy_vals, edgecolor='grey', alpha=0.5)
plt.xlim(0, 100)
plt.xlabel('Accuracy [%]')

has_had_surgery Accuracy:57.98
rf_r1_present Accuracy:51.78
rf_r2_present Accuracy:55.84
rf_r3_present Accuracy:52.00
rf_r4_present Accuracy:55.56
rf_r6_present Accuracy:92.91
had_dx1_at_DY_dx Accuracy:55.56
had_dx2_at_DY_dx Accuracy:53.98
had_dx3_at_DY_dx Accuracy:48.73
had_dx4_at_DY_dx Accuracy:51.44
had_dx5_at_DY_dx Accuracy:45.52
had_dx6_at_DY_dx Accuracy:48.96
had_dx7_at_DY_dx Accuracy:47.88
had_dx8_at_DY_dx Accuracy:51.72
had_dx9_at_DY_dx Accuracy:56.80
had_dx10_at_DY_dx Accuracy:55.05
had_dx11_at_DY_dx Accuracy:56.91
had_dx12_at_DY_dx Accuracy:50.59
had_dx13_at_DY_dx Accuracy:48.90
had_dx14_at_DY_dx Accuracy:57.98
had_dx15_at_DY_dx Accuracy:51.78
had_dx16_at_DY_dx Accuracy:49.24
had_dx17_at_DY_dx Accuracy:50.03
had_dx18_at_DY_dx Accuracy:53.07
had_dx19_at_DY_dx Accuracy:50.76
had_dx20_at_DY_dx Accuracy:56.35
had_dx21_at_DY_dx Accuracy:56.06
currently_has_dx1 Accuracy:59.22
currently_has_dx2 Accuracy:57.81
currently_has_dx3 Accuracy:52.62
currently_has_dx4 Accuracy:56.46
currently_has_dx5 Accuracy:46.25
currently_has_dx6 Accuracy:51.55
currently_has_dx7 Accuracy:51.33
currently_has_dx8 Accuracy:54.99
currently_has_dx9 Accuracy:58.26
currently_has_dx10 Accuracy:56.85
currently_has_dx11 Accuracy:57.53
currently_has_dx12 Accuracy:53.41
currently_has_dx13 Accuracy:48.67
currently_has_dx14 Accuracy:57.76
currently_has_dx15 Accuracy:53.19
currently_has_dx16 Accuracy:51.21
currently_has_dx17 Accuracy:56.12
currently_has_dx18 Accuracy:56.80
currently_has_dx19 Accuracy:56.01
currently_has_dx20 Accuracy:57.47
currently_has_dx21 Accuracy:58.04
has_ever_had_dx22 Accuracy:58.94
has_ever_had_dx23 Accuracy:60.01
currently_using_px1 Accuracy:47.21
currently_using_px2 Accuracy:52.57
currently_using_px3 Accuracy:53.58
currently_using_px4 Accuracy:58.49
currently_using_px5 Accuracy:57.70
currently_using_px6 Accuracy:57.76
currently_using_px7 Accuracy:58.04
currently_using_px8 Accuracy:57.76
currently_using_px9 Accuracy:58.32
currently_using_px10 Accuracy:58.09
currently_using_px11 Accuracy:56.23
currently_using_px12 Accuracy:56.68
previously_tried_px1 Accuracy:46.01
previously_tried_px2 Accuracy:51.19
previously_tried_px3 Accuracy:51.10
previously_tried_px4 Accuracy:59.34
previously_tried_px5 Accuracy:56.03
previously_tried_px6 Accuracy:52.63
previously_tried_px7 Accuracy:56.28
previously_tried_px8 Accuracy:53.90
previously_tried_px9 Accuracy:53.31
previously_tried_px10 Accuracy:54.16
previously_tried_px11 Accuracy:52.80
previously_tried_px12 Accuracy:54.16
rf1_5_cluster Accuracy:51.16
dx_cluster Accuracy:60.46
dx_at_DY_cluster Accuracy:56.63

Text(0.5, 0, 'Accuracy [%]')

## Check correlations

# phi_k correlation matrix over the binary features together with the target.
corr_columns = binary_cols + ['recommend_Product_X']
corr = chart_review_df[corr_columns].phik_matrix()

# Boolean mask hiding the upper triangle (diagonal included), since the
# matrix is symmetric.
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(24, 20))

# Draw the masked heatmap with square cells.
# vmin = 0.5 so that only the most strongly correlated pairs stand out.
sns.heatmap(
    corr,
    mask=mask,
    cmap='Blues',
    vmin=0.5,
    vmax=1,
    square=True,
    linewidths=.5,
)

interval columns not set, guessing: ['has_had_surgery', 'rf_r1_present', 'rf_r2_present', 'rf_r3_present', 'rf_r4_present', 'rf_r6_present', 'had_dx1_at_DY_dx', 'had_dx2_at_DY_dx', 'had_dx3_at_DY_dx', 'had_dx4_at_DY_dx', 'had_dx5_at_DY_dx', 'had_dx6_at_DY_dx', 'had_dx7_at_DY_dx', 'had_dx8_at_DY_dx', 'had_dx9_at_DY_dx', 'had_dx10_at_DY_dx', 'had_dx11_at_DY_dx', 'had_dx12_at_DY_dx', 'had_dx13_at_DY_dx', 'had_dx14_at_DY_dx', 'had_dx15_at_DY_dx', 'had_dx16_at_DY_dx', 'had_dx17_at_DY_dx', 'had_dx18_at_DY_dx', 'had_dx19_at_DY_dx', 'had_dx20_at_DY_dx', 'had_dx21_at_DY_dx', 'currently_has_dx1', 'currently_has_dx2', 'currently_has_dx3', 'currently_has_dx4', 'currently_has_dx5', 'currently_has_dx6', 'currently_has_dx7', 'currently_has_dx8', 'currently_has_dx9', 'currently_has_dx10', 'currently_has_dx11', 'currently_has_dx12', 'currently_has_dx13', 'currently_has_dx14', 'currently_has_dx15', 'currently_has_dx16', 'currently_has_dx17', 'currently_has_dx18', 'currently_has_dx19', 'currently_has_dx20', 'currently_has_dx21', 'has_ever_had_dx22', 'has_ever_had_dx23', 'currently_using_px1', 'currently_using_px2', 'currently_using_px3', 'currently_using_px4', 'currently_using_px5', 'currently_using_px6', 'currently_using_px7', 'currently_using_px8', 'currently_using_px9', 'currently_using_px10', 'currently_using_px11', 'currently_using_px12', 'previously_tried_px1', 'previously_tried_px2', 'previously_tried_px3', 'previously_tried_px4', 'previously_tried_px5', 'previously_tried_px6', 'previously_tried_px7', 'previously_tried_px8', 'previously_tried_px9', 'previously_tried_px10', 'previously_tried_px11', 'previously_tried_px12', 'rf1_5_cluster', 'dx_cluster', 'dx_at_DY_cluster', 'recommend_Product_X']

<AxesSubplot:>

# phik significance matrix for the rf_r6_present / target pair.
chart_review_df[['rf_r6_present','recommend_Product_X']].significance_matrix()

interval columns not set, guessing: ['rf_r6_present', 'recommend_Product_X']

# To get a better sense of the correlations between parameters, plot a
# histogram of the phi_k values. Every pair is counted twice because both
# triangles of the symmetric matrix are used. Entries equal to 1 (the diagonal,
# and any perfectly correlated pair) are left out, as are undefined (NaN) entries.
fig = plt.figure(figsize=(3.5, 3), dpi=150)
density = True

corr_vals = np.asarray(corr, dtype=float).flatten()
# Drop the excluded entries explicitly rather than passing NaNs to plt.hist.
corr_vals = corr_vals[(corr_vals != 1) & ~np.isnan(corr_vals)]

#plt.semilogy()
plt.hist(corr_vals, edgecolor='grey', density=density, alpha=0.5, bins=np.linspace(0, 1, 11))
# Raw string: "\p" is not a valid escape sequence in a normal string literal.
plt.xlabel(r'$\phi_k$')
if density:
    plt.ylabel('Normalized Frequency')
else:
    plt.ylabel('Count')

## Label Encoder
# Classes for current severity are 'Moderate' and 'Severe' (a few values are missing)
from sklearn.preprocessing import LabelEncoder
# Single encoder instance, re-fitted for each severity column further down.
le = LabelEncoder()

# Double check that 'M' and 'F' are the only options before converting gender to binary
chart_review_df['patient_gender'].unique()

array(['F', 'M'], dtype=object)

# Binary gender flag: 1 for 'M', 0 otherwise; then confirm only 0 and 1 occur.
is_male = chart_review_df['patient_gender'].eq('M')
chart_review_df['patient_gender_enc'] = is_male.astype(int)
chart_review_df['patient_gender_enc'].unique()

array([0, 1])

# Class counts for the current severity, followed by the number of missing values.
severity_counts = chart_review_df['current_severity'].value_counts()
print(severity_counts)
print(chart_review_df['current_severity'].isnull().sum())

Moderate    1582
Severe       184
Name: current_severity, dtype: int64
7

# Integer-encode the two severity columns, re-fitting the same encoder for each.
# After this cell `le` holds the classes of 'current_severity' only.
# NOTE(review): both columns contain missing values (see the info() summary).
# How LabelEncoder treats them depends on the scikit-learn version - confirm
# they are not silently encoded as an ordinary class, which would hide the
# missingness from the model.
le.fit(chart_review_df['severity_at_DY_dx'])
chart_review_df['severity_at_DY_dx_enc'] = le.transform(chart_review_df['severity_at_DY_dx'])
le.fit(chart_review_df['current_severity'])
chart_review_df['current_severity_enc'] = le.transform(chart_review_df['current_severity'])

# Columns excluded from the feature matrix: patientID is uninformative,
# recommend_Product_X is the target, and the remaining three have been
# replaced by encoded versions.
drop_cols = ['patientID', 'current_severity', 'severity_at_DY_dx', 'patient_gender', 'recommend_Product_X']

fit_df = chart_review_df

# Target vector and feature matrix.
y = fit_df['recommend_Product_X']
X = fit_df.drop(columns=drop_cols)

# 70/30 split with a fixed seed; evalset lets XGBoost track both partitions
# (train first, test second) while fitting.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
evalset = [(X_train, y_train), (X_test, y_test)]

# Discard any classifier left over from an earlier run of this cell so that a
# fresh one is always fitted. Only the "name is not defined yet" case is
# expected here, so catch exactly that instead of swallowing every exception.
try:
    del DisYBoost
except NameError:
    print("DisYBoost does not exist, will instantiate")

# First baseline version of the model: mostly default parameters, but with
# n_estimators increased to get more learning-curve coverage.
n_estimators = 150
seed = 0

DisYBoost = XGBClassifier(n_estimators=n_estimators, seed=seed, eval_metric='logloss')

# eval_set records the logloss on both partitions after every boosting round.
DisYBoost.fit(X_train, y_train, eval_set=evalset, verbose=False)


# make predictions for test data
y_pred = DisYBoost.predict(X_test)

# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Train-Test-Split Accuracy: %.2f%%" % (accuracy * 100.0))

DisYBoost does not exist, will instantiate
Train-Test-Split Accuracy: 82.89%

# Also check the baseline with 10-fold cross validation over the full dataset
# (shuffled, fixed seed); report the mean accuracy and its spread across folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
results = cross_val_score(DisYBoost, X, y, cv=kfold)
print("K-fold Accuracy: {:.2f}% ({:.2f}%)".format(100 * results.mean(), 100 * results.std()))

K-fold Accuracy: 85.84% (2.54%)

# Learning curves for the baseline model: logloss per boosting round.
# 'validation_0' / 'validation_1' follow the order of the eval_set passed to
# fit(), i.e. the training partition first and the test partition second.
results_pre_opt = DisYBoost.evals_result()

plt.figure(figsize=(3.5, 3), dpi=150)
train_logloss = results_pre_opt['validation_0']['logloss']
test_logloss = results_pre_opt['validation_1']['logloss']
plt.plot(train_logloss, label='Train')
plt.plot(test_logloss, label='Test')
plt.ylabel('logloss')
plt.legend()

<matplotlib.legend.Legend at 0x151a18d260d0>

# Quick look at the ten most important features using XGBoost's built-in plot.
fig, ax = plt.subplots(1, 1, figsize=(8, 10))
plot_importance(DisYBoost, max_num_features=10, ax=ax, ylabel=None)

# Same label size for major and minor ticks on both axes.
for tick_kind in ('major', 'minor'):
    ax.tick_params(axis='both', which=tick_kind, labelsize=13)

plt.tight_layout()
#plt.savefig('feat_import_F.png', facecolor='white', transparent=False)

# Pair each training column with the classifier's default feature importance
# (gain, per the original author) and keep them in a dict ordered from most
# to least important.
feature_names = X_train.columns.tolist()
feat_imp_dict = dict(zip(feature_names, DisYBoost.feature_importances_))
feat_imp_dict = dict(sorted(feat_imp_dict.items(), key=lambda item: item[1], reverse=True))

# Horizontal bar chart of the leading features.
fig, ax = plt.subplots(1, 1, figsize=(8, 10))

# Number of features shown (top 10).
limit = 10
top_names = list(feat_imp_dict.keys())[:limit]
top_values = list(feat_imp_dict.values())[:limit]

ax.grid(zorder=0)
ax.barh(top_names, top_values, height=0.3)
# Largest importance at the top of the chart.
ax.invert_yaxis()
ax.set_xlabel('gain')
#ax.set_ylabel('Features')
for tick_kind in ('major', 'minor'):
    ax.tick_params(axis='both', which=tick_kind, labelsize=13)

plt.title('Feature Importance')
plt.tight_layout()
#plt.savefig('feat_import_gain.png', facecolor='white', transparent=False)

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

# Search space handed to hyperopt; each entry is keyed by the XGBClassifier
# argument it tunes.
hyp_params = {
    # Tree depth: one of the integers 5..14.
    'max_depth': hp.choice('max_depth', np.arange(5, 15, dtype=int)),
    # Learning rate drawn log-uniformly, i.e. between exp(-5) and exp(-2).
    'learning_rate': hp.loguniform('learning_rate', -5, -2),
    # Fraction of rows sampled for each tree.
    'subsample': hp.uniform('subsample', 0.25, 1),
    #'gamma': hp.uniform ('gamma', 0,2),
    # Fraction of columns sampled for each tree.
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
    # Uniform draw over 0..10, quantised to steps of 1.
    'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
    # Extra weight on the positive class, between 1 and 2.
    'scale_pos_weight' : hp.uniform('scale_pos_weight', 1,2),
}

# An objective function is needed for the hyperopt tuning
def objective(hyp_params):
    """Score one hyperparameter candidate for hyperopt.

    The candidate is evaluated with 5-fold cross-validation on the training
    split only. Scoring candidates on the held-out test split would leak that
    split into model selection and make the final test accuracy optimistic.

    Reads the notebook globals ``n_estimators``, ``seed``, ``X_train`` and
    ``y_train``.

    Args:
        hyp_params: dict of XGBClassifier keyword arguments sampled from the
            search space.

    Returns:
        dict with ``'loss'`` (negative mean cross-validated accuracy, because
        hyperopt minimises) and ``'status'`` set to ``STATUS_OK``.
    """
    clf = XGBClassifier(n_estimators=n_estimators, seed=seed, eval_metric='logloss', **hyp_params)

    # Fixed folds so that every candidate is compared on the same partitions.
    cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    fold_scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')

    accuracy_clf = float(fold_scores.mean())
    print("SCORE: {:.3f}".format(accuracy_clf))
    return {'loss': -accuracy_clf, 'status': STATUS_OK}

# Local import: space_eval is only needed to decode the result below.
from hyperopt import space_eval

# Run 100 TPE-guided evaluations of the objective, recording every trial.
trials = Trials()
best_params = fmin(objective, hyp_params, algo=tpe.suggest, max_evals=100, trials=trials)

# For hp.choice parameters fmin returns the *index* of the chosen option, so
# best_params['max_depth'] is a position in the option list, not a depth.
# best_params is left as returned; the decoded values are what get reported.
best_param_values = space_eval(hyp_params, best_params)
print("Best set of hyperparameters: ", best_param_values)

SCORE: 0.793                                           
SCORE: 0.763                                                                     
SCORE: 0.827                                                                     
SCORE: 0.778                                                                      
SCORE: 0.769                                                                      
SCORE: 0.776                                                                      
SCORE: 0.773                                                                      
SCORE: 0.786                                                                      
SCORE: 0.754                                                                      
SCORE: 0.793                                                                      
SCORE: 0.722                                                                       
SCORE: 0.821                                                                       
SCORE: 0.733                                                                       
SCORE: 0.823                                                                       
SCORE: 0.746                                                                       
SCORE: 0.758                                                                       
SCORE: 0.803                                                                       
SCORE: 0.793                                                                       
SCORE: 0.806                                                                       
SCORE: 0.769                                                                       
SCORE: 0.855                                                                       
SCORE: 0.850                                                                       
SCORE: 0.842                                                                       
SCORE: 0.850                                                                       
SCORE: 0.842                                                                       
SCORE: 0.825                                                                       
SCORE: 0.835                                                                       
SCORE: 0.863                                                                       
SCORE: 0.820                                                                       
SCORE: 0.816                                                                       
SCORE: 0.808                                                                       
SCORE: 0.846                                                                       
SCORE: 0.803                                                                       
SCORE: 0.827                                                                       
SCORE: 0.844                                                                       
SCORE: 0.786                                                                       
SCORE: 0.810                                                                       
SCORE: 0.812                                                                       
SCORE: 0.850                                                                       
SCORE: 0.831                                                                       
SCORE: 0.821                                                                       
SCORE: 0.799                                                                       
SCORE: 0.814                                                                       
SCORE: 0.782                                                                       
SCORE: 0.791                                                                       
SCORE: 0.806                                                                       
SCORE: 0.846                                                                       
SCORE: 0.797                                                                       
SCORE: 0.727                                                                       
SCORE: 0.821                                                                       
SCORE: 0.812                                                                       
SCORE: 0.821                                                                       
SCORE: 0.795                                                                       
SCORE: 0.836                                                                       
SCORE: 0.759                                                                       
SCORE: 0.814                                                                       
SCORE: 0.838                                                                       
SCORE: 0.782                                                                       
SCORE: 0.801                                                                       
SCORE: 0.855                                                                       
SCORE: 0.801                                                                       
SCORE: 0.821                                                                       
SCORE: 0.797                                                                       
SCORE: 0.784                                                                       
SCORE: 0.810                                                                       
SCORE: 0.833                                                                       
SCORE: 0.844                                                                       
SCORE: 0.855                                                                       
SCORE: 0.846                                                                       
SCORE: 0.852                                                                       
SCORE: 0.838                                                                       
SCORE: 0.821                                                                       
SCORE: 0.836                                                                       
SCORE: 0.844                                                                       
SCORE: 0.797                                                                       
SCORE: 0.808                                                                       
SCORE: 0.846                                                                       
SCORE: 0.812                                                                       
SCORE: 0.840                                                                       
SCORE: 0.840                                                                       
SCORE: 0.836                                                                       
SCORE: 0.853                                                                       
SCORE: 0.788                                                                       
SCORE: 0.827                                                                       
SCORE: 0.806                                                                       
SCORE: 0.835                                                                       
SCORE: 0.799                                                                       
SCORE: 0.789                                                                       
SCORE: 0.835                                                                       
SCORE: 0.789                                                                       
SCORE: 0.840                                                                       
SCORE: 0.835                                                                       
SCORE: 0.821                                                                       
SCORE: 0.810                                                                       
SCORE: 0.842                                                                       
SCORE: 0.855                                                                       
SCORE: 0.780                                                                       
SCORE: 0.831                                                                       
SCORE: 0.793                                                                       
SCORE: 0.836                                                                       
100%|██████████| 100/100 [02:49<00:00,  1.69s/trial, best loss: -0.8627819548872181]
Best set of hyperparameters:  {'colsample_bytree': 0.7821259274531891, 'learning_rate': 0.1257993515650235, 'max_depth': 7, 'min_child_weight': 1.0, 'scale_pos_weight': 1.5687552539436538, 'subsample': 0.9214027984930189}

# DisYBoost_HT is the hypertuned model: same estimator count, seed and metric,
# with the remaining settings taken from best_params.
DisYBoost_HT = XGBClassifier(
    n_estimators=n_estimators,
    seed=seed,
    eval_metric='logloss',
    **best_params,
)

# Evaluate logloss on both splits during fitting (train first, test second);
# verbose=False keeps the per-round log out of the cell output.
eval_set = [(X_train, y_train), (X_test, y_test)]
DisYBoost_HT.fit(X_train, y_train, eval_set=eval_set, verbose=False)

# Predict the held-out split and report plain accuracy as a percentage.
y_pred = DisYBoost_HT.predict(X_test)
accuracy_HT = accuracy_score(y_test, y_pred)
print("Train-test-split Accuracy: {:.2f}%".format(100.0 * accuracy_HT))

Train-test-split Accuracy: 83.46%

# 10-fold cross-validation of the tuned model on the full X, y
# (shuffled folds, fixed random_state so the split is repeatable).
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
results_HT = cross_val_score(DisYBoost_HT, X, y, cv=kfold)
print(
    "Accuracy: {:.2f}% ({:.2f}%)".format(
        100 * results_HT.mean(),
        100 * results_HT.std(),
    )
)

# Basically the same as before tuning, with a slightly lower std.
# Recorded result from an earlier run:
# Accuracy: 85.79% (2.27%)

Accuracy: 86.29% (2.11%)

# Drop lower importance features.
# Sweep the selection threshold over every feature importance of the tuned
# model (ascending). For each threshold: keep the features SelectFromModel
# selects, retrain a fresh classifier with the tuned settings, and score it on
# the test split. Each result is stored as [threshold, accuracy in percent].
accuracy_scores = []
thresholds = np.sort(DisYBoost_HT.feature_importances_)

# Convert the splits to plain arrays once, outside the loop. With prefit=True
# the selector itself is never fitted, so it has no recorded feature names;
# passing DataFrames to transform() makes sklearn emit
# "X has feature names, but SelectFromModel was fitted without feature names"
# on every call. transform() returns arrays either way, so the retrained
# models see the same data as before.
X_train_values = np.asarray(X_train)
X_test_values = np.asarray(X_test)

for thresh in thresholds:

    # select features using threshold
    selection = SelectFromModel(DisYBoost_HT, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train_values)

    # train model on the reduced feature set
    selection_model = XGBClassifier(n_estimators=n_estimators, seed=seed, eval_metric='logloss', **best_params)
    selection_model.fit(select_X_train, y_train)

    # eval model on the identically reduced test split
    select_X_test = selection.transform(X_test_values)
    predictions = selection_model.predict(select_X_test)
    accuracy_thresh = accuracy_score(y_test, predictions)

    accuracy_scores.append([thresh, accuracy_thresh*100])
    print("Thresh={:.3f}, num_features={}, Accuracy: {:.2f}%".format(thresh, select_X_train.shape[1], accuracy_thresh*100.0))

/zfs/astrohe/Software/fermipy_source/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SelectFromModel was fitted without feature names
  warnings.warn(
/zfs/astrohe/Software/fermipy_source/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SelectFromModel was fitted without feature names
  warnings.warn(
/zfs/astrohe/Software/fermipy_source/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SelectFromModel was fitted without feature names
  warnings.warn(

Thresh=0.004, num_features=89, Accuracy: 83.46%

/zfs/astrohe/Software/fermipy_source/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SelectFromModel was fitted without feature names
  warnings.warn(
/zfs/astrohe/Software/fermipy_source/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SelectFromModel was fitted without feature names
  warnings.warn(

Thresh=0.004, num_features=88, Accuracy: 84.96%

/zfs/astrohe/Software/fermipy_source/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SelectFromModel was fitted without feature names
  warnings.warn(
/zfs/astrohe/Software/fermipy_source/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SelectFromModel was fitted without feature names
  warnings.warn(

Thresh=0.005, num_features=87, Accuracy: 84.96%

/zfs/astrohe/Software/fermipy_source/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SelectFromModel was fitted without feature names
  warnings.warn(
/zfs/astrohe/Software/fermipy_source/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SelectFromModel was fitted without feature names
  warnings.warn(

Thresh=0.006, num_features=86, Accuracy: 82.52%

/zfs/astrohe/Software/fermipy_source/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SelectFromModel was fitted without feature names
  warnings.warn(
/zfs/astrohe/Software/fermipy_source/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SelectFromModel was fitted without feature names
  warnings.warn(

Thresh=0.006, num_features=85, Accuracy: 84.02%

# Summarize the threshold sweep. After conversion, column 0 holds the
# threshold and column 1 the test accuracy in percent.
accuracy_scores = np.asarray(accuracy_scores)

# Locate the best row once; argmax returns the first (lowest-threshold) maximum.
best_row = np.argmax(accuracy_scores[:, 1])
max_threshold, max_accuracy = accuracy_scores[best_row]

# The old "{:3f}" spec was a field width of 3, not a precision; "{:.6f}"
# prints the same six decimals and states the intent explicitly.
print("Max accuracy: {:.2f}% at thresh {:.6f}".format(max_accuracy, max_threshold))

plt.figure(figsize=(3.5, 3), dpi=150)
plt.plot(accuracy_scores[:, 0], accuracy_scores[:, 1])
plt.xlabel('Gain threshold')
plt.ylabel('Accuracy [%]')
plt.show()
# Not sure why this got worse, but I think it's good motivation to not include
# any feature-importance-based cuts.
# Recorded result from an earlier run:
# Max accuracy: 84.40% at thresh 0.007778

Max accuracy: 85.34% at thresh 0.007127

Text(0, 0.5, 'Accuracy [%]')

# Datasets scored after every boosting round: train first, then test.
evalset = [(X_train, y_train), (X_test, y_test)]

# Fit again with the default verbosity so the per-round logloss is printed,
# then keep the recorded history for the learning-curve plot.
DisYBoost_HT.fit(X=X_train, y=y_train, eval_set=evalset)
results_post_opt = DisYBoost_HT.evals_result()

[0]	validation_0-logloss:0.65711	validation_1-logloss:0.67096
[1]	validation_0-logloss:0.62612	validation_1-logloss:0.64665
[2]	validation_0-logloss:0.60092	validation_1-logloss:0.63228
[3]	validation_0-logloss:0.57169	validation_1-logloss:0.61495
[4]	validation_0-logloss:0.55864	validation_1-logloss:0.60562
[5]	validation_0-logloss:0.53924	validation_1-logloss:0.59362
[6]	validation_0-logloss:0.52709	validation_1-logloss:0.58429
[7]	validation_0-logloss:0.50875	validation_1-logloss:0.57181
[8]	validation_0-logloss:0.48993	validation_1-logloss:0.56152
[9]	validation_0-logloss:0.47315	validation_1-logloss:0.55230
[10]	validation_0-logloss:0.46676	validation_1-logloss:0.54882
[11]	validation_0-logloss:0.44523	validation_1-logloss:0.53968
[12]	validation_0-logloss:0.43366	validation_1-logloss:0.53301
[13]	validation_0-logloss:0.42457	validation_1-logloss:0.52835
[14]	validation_0-logloss:0.41418	validation_1-logloss:0.52188
[15]	validation_0-logloss:0.39584	validation_1-logloss:0.51404
[16]	validation_0-logloss:0.38407	validation_1-logloss:0.50493
[17]	validation_0-logloss:0.37358	validation_1-logloss:0.49766
[18]	validation_0-logloss:0.36456	validation_1-logloss:0.49342
[19]	validation_0-logloss:0.36035	validation_1-logloss:0.48995
[20]	validation_0-logloss:0.34742	validation_1-logloss:0.48414
[21]	validation_0-logloss:0.34293	validation_1-logloss:0.48226
[22]	validation_0-logloss:0.33685	validation_1-logloss:0.48019
[23]	validation_0-logloss:0.32705	validation_1-logloss:0.47670
[24]	validation_0-logloss:0.32140	validation_1-logloss:0.47518
[25]	validation_0-logloss:0.31546	validation_1-logloss:0.47311
[26]	validation_0-logloss:0.30702	validation_1-logloss:0.47053
[27]	validation_0-logloss:0.29889	validation_1-logloss:0.46980
[28]	validation_0-logloss:0.29740	validation_1-logloss:0.46907
[29]	validation_0-logloss:0.29396	validation_1-logloss:0.46840
[30]	validation_0-logloss:0.28705	validation_1-logloss:0.46532
[31]	validation_0-logloss:0.27508	validation_1-logloss:0.46208
[32]	validation_0-logloss:0.26944	validation_1-logloss:0.46107
[33]	validation_0-logloss:0.26425	validation_1-logloss:0.45961
[34]	validation_0-logloss:0.26196	validation_1-logloss:0.45938
[35]	validation_0-logloss:0.25110	validation_1-logloss:0.45535
[36]	validation_0-logloss:0.24956	validation_1-logloss:0.45441
[37]	validation_0-logloss:0.24065	validation_1-logloss:0.45206
[38]	validation_0-logloss:0.23377	validation_1-logloss:0.44762
[39]	validation_0-logloss:0.22859	validation_1-logloss:0.44559
[40]	validation_0-logloss:0.21966	validation_1-logloss:0.44024
[41]	validation_0-logloss:0.21353	validation_1-logloss:0.43910
[42]	validation_0-logloss:0.21001	validation_1-logloss:0.43854
[43]	validation_0-logloss:0.20818	validation_1-logloss:0.43893
[44]	validation_0-logloss:0.20153	validation_1-logloss:0.43622
[45]	validation_0-logloss:0.20052	validation_1-logloss:0.43684
[46]	validation_0-logloss:0.19647	validation_1-logloss:0.43582
[47]	validation_0-logloss:0.19170	validation_1-logloss:0.43525
[48]	validation_0-logloss:0.18741	validation_1-logloss:0.43359
[49]	validation_0-logloss:0.18590	validation_1-logloss:0.43331
[50]	validation_0-logloss:0.18260	validation_1-logloss:0.43339
[51]	validation_0-logloss:0.17967	validation_1-logloss:0.43268
[52]	validation_0-logloss:0.17728	validation_1-logloss:0.43170
[53]	validation_0-logloss:0.17304	validation_1-logloss:0.42919
[54]	validation_0-logloss:0.17093	validation_1-logloss:0.42708
[55]	validation_0-logloss:0.16997	validation_1-logloss:0.42669
[56]	validation_0-logloss:0.16627	validation_1-logloss:0.42628
[57]	validation_0-logloss:0.16167	validation_1-logloss:0.42433
[58]	validation_0-logloss:0.15953	validation_1-logloss:0.42407
[59]	validation_0-logloss:0.15839	validation_1-logloss:0.42354
[60]	validation_0-logloss:0.15225	validation_1-logloss:0.42166
[61]	validation_0-logloss:0.14822	validation_1-logloss:0.42005
[62]	validation_0-logloss:0.14306	validation_1-logloss:0.41702
[63]	validation_0-logloss:0.14114	validation_1-logloss:0.41711
[64]	validation_0-logloss:0.13643	validation_1-logloss:0.41566
[65]	validation_0-logloss:0.13488	validation_1-logloss:0.41436
[66]	validation_0-logloss:0.13163	validation_1-logloss:0.41401
[67]	validation_0-logloss:0.12769	validation_1-logloss:0.41231
[68]	validation_0-logloss:0.12444	validation_1-logloss:0.41163
[69]	validation_0-logloss:0.12116	validation_1-logloss:0.41135
[70]	validation_0-logloss:0.11841	validation_1-logloss:0.41039
[71]	validation_0-logloss:0.11661	validation_1-logloss:0.40825
[72]	validation_0-logloss:0.11402	validation_1-logloss:0.40555
[73]	validation_0-logloss:0.11273	validation_1-logloss:0.40504
[74]	validation_0-logloss:0.11015	validation_1-logloss:0.40420
[75]	validation_0-logloss:0.10808	validation_1-logloss:0.40273
[76]	validation_0-logloss:0.10675	validation_1-logloss:0.40090
[77]	validation_0-logloss:0.10408	validation_1-logloss:0.39964
[78]	validation_0-logloss:0.10105	validation_1-logloss:0.39933
[79]	validation_0-logloss:0.10031	validation_1-logloss:0.39958
[80]	validation_0-logloss:0.09935	validation_1-logloss:0.39885
[81]	validation_0-logloss:0.09682	validation_1-logloss:0.39800
[82]	validation_0-logloss:0.09611	validation_1-logloss:0.39809
[83]	validation_0-logloss:0.09548	validation_1-logloss:0.39779
[84]	validation_0-logloss:0.09372	validation_1-logloss:0.39635
[85]	validation_0-logloss:0.09164	validation_1-logloss:0.39551
[86]	validation_0-logloss:0.09082	validation_1-logloss:0.39545
[87]	validation_0-logloss:0.08954	validation_1-logloss:0.39493
[88]	validation_0-logloss:0.08771	validation_1-logloss:0.39528
[89]	validation_0-logloss:0.08627	validation_1-logloss:0.39376
[90]	validation_0-logloss:0.08397	validation_1-logloss:0.39384
[91]	validation_0-logloss:0.08270	validation_1-logloss:0.39397
[92]	validation_0-logloss:0.08082	validation_1-logloss:0.39216
[93]	validation_0-logloss:0.07998	validation_1-logloss:0.39073
[94]	validation_0-logloss:0.07803	validation_1-logloss:0.38925
[95]	validation_0-logloss:0.07715	validation_1-logloss:0.38916
[96]	validation_0-logloss:0.07553	validation_1-logloss:0.38832
[97]	validation_0-logloss:0.07446	validation_1-logloss:0.38927
[98]	validation_0-logloss:0.07354	validation_1-logloss:0.38776
[99]	validation_0-logloss:0.07191	validation_1-logloss:0.38834
[100]	validation_0-logloss:0.07095	validation_1-logloss:0.38795
[101]	validation_0-logloss:0.06916	validation_1-logloss:0.38644
[102]	validation_0-logloss:0.06773	validation_1-logloss:0.38565
[103]	validation_0-logloss:0.06685	validation_1-logloss:0.38505
[104]	validation_0-logloss:0.06570	validation_1-logloss:0.38438
[105]	validation_0-logloss:0.06532	validation_1-logloss:0.38452
[106]	validation_0-logloss:0.06428	validation_1-logloss:0.38366
[107]	validation_0-logloss:0.06307	validation_1-logloss:0.38355
[108]	validation_0-logloss:0.06243	validation_1-logloss:0.38325
[109]	validation_0-logloss:0.06142	validation_1-logloss:0.38330
[110]	validation_0-logloss:0.06027	validation_1-logloss:0.38229
[111]	validation_0-logloss:0.05919	validation_1-logloss:0.38195
[112]	validation_0-logloss:0.05825	validation_1-logloss:0.38129
[113]	validation_0-logloss:0.05769	validation_1-logloss:0.38154
[114]	validation_0-logloss:0.05687	validation_1-logloss:0.38222
[115]	validation_0-logloss:0.05640	validation_1-logloss:0.38209
[116]	validation_0-logloss:0.05612	validation_1-logloss:0.38255
[117]	validation_0-logloss:0.05566	validation_1-logloss:0.38251
[118]	validation_0-logloss:0.05484	validation_1-logloss:0.38153
[119]	validation_0-logloss:0.05418	validation_1-logloss:0.38229
[120]	validation_0-logloss:0.05339	validation_1-logloss:0.38241
[121]	validation_0-logloss:0.05285	validation_1-logloss:0.38218
[122]	validation_0-logloss:0.05202	validation_1-logloss:0.38114
[123]	validation_0-logloss:0.05165	validation_1-logloss:0.38152
[124]	validation_0-logloss:0.05075	validation_1-logloss:0.38114
[125]	validation_0-logloss:0.04986	validation_1-logloss:0.38046
[126]	validation_0-logloss:0.04948	validation_1-logloss:0.38106
[127]	validation_0-logloss:0.04926	validation_1-logloss:0.38097
[128]	validation_0-logloss:0.04853	validation_1-logloss:0.37955
[129]	validation_0-logloss:0.04834	validation_1-logloss:0.37992
[130]	validation_0-logloss:0.04780	validation_1-logloss:0.38071
[131]	validation_0-logloss:0.04736	validation_1-logloss:0.38046
[132]	validation_0-logloss:0.04672	validation_1-logloss:0.38112
[133]	validation_0-logloss:0.04628	validation_1-logloss:0.38137
[134]	validation_0-logloss:0.04573	validation_1-logloss:0.38141
[135]	validation_0-logloss:0.04524	validation_1-logloss:0.38027
[136]	validation_0-logloss:0.04457	validation_1-logloss:0.37986
[137]	validation_0-logloss:0.04406	validation_1-logloss:0.37980
[138]	validation_0-logloss:0.04359	validation_1-logloss:0.37882
[139]	validation_0-logloss:0.04312	validation_1-logloss:0.37813
[140]	validation_0-logloss:0.04254	validation_1-logloss:0.37860
[141]	validation_0-logloss:0.04222	validation_1-logloss:0.37817
[142]	validation_0-logloss:0.04170	validation_1-logloss:0.37892
[143]	validation_0-logloss:0.04116	validation_1-logloss:0.37927
[144]	validation_0-logloss:0.04075	validation_1-logloss:0.37964
[145]	validation_0-logloss:0.04016	validation_1-logloss:0.37891
[146]	validation_0-logloss:0.03969	validation_1-logloss:0.37917
[147]	validation_0-logloss:0.03929	validation_1-logloss:0.37892
[148]	validation_0-logloss:0.03910	validation_1-logloss:0.37879
[149]	validation_0-logloss:0.03863	validation_1-logloss:0.37894

# Learning curves: logloss per boosting round for the train and test splits,
# after (post HT) and before (pre HT) hyperparameter tuning.
plt.figure(figsize=(3.5, 3), dpi=150)

# (evaluation history, eval-set key, legend label), in plotting order.
curve_specs = (
    (results_post_opt, 'validation_0', 'Train post HT'),
    (results_post_opt, 'validation_1', 'Test post HT'),
    (results_pre_opt, 'validation_0', 'Train pre HT'),
    (results_pre_opt, 'validation_1', 'Test pre HT'),
)
for evals, split_key, curve_label in curve_specs:
    plt.plot(evals[split_key]['logloss'], label=curve_label)

plt.legend()
plt.show()

Predicting Physician Recommendations

Background

Instructions

Data Detail

Load data and EDA

Some initial takeaways:

Quick look at non-binary predictors

Null values

Other null values are in the following features:

Feature Engineering

Check accuracy for individual binary features

Check Correlation Matrix

Label Encoding

Patient gender

Baseline model using XGBoost

Check Learning Curves

Feature importances from the model

Hyperparameter Tuning

	patientID	age_at_rf_r1_dx	patient_gender	current_severity	severity_at_DY_dx	age_current	DY_dx_age	rf_r1_present	...	previously_tried_px4	previously_tried_px5	previously_tried_px6	recommend_Product_X
424	424	23.0	M	Moderate	Moderate	25	24	1	...	0.0	0.0	1.0	0
844	844	23.0	M	Moderate	Moderate	43	31	1	...	1.0	1.0	0.0	1
229	229	30.0	F	Moderate	Mild	39	36	1	...	0.0	0.0	0.0	0
816	816	58.0	M	Moderate	Severe	66	62	1	...	1.0	1.0	0.0	1
1549	1549	NaN	F	Moderate	Moderate	44	43	0	...	0.0	0.0	0.0	1

	rf_r6_present	recommend_Product_X
rf_r6_present	8.717719	4.837962
recommend_Product_X	4.837962	48.717063