from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive

# Data and math operations
import pandas as pd
import numpy as np
from scipy import stats
import copy

# Data visualizationa
import matplotlib.pyplot as plt
import seaborn as sns

# Model building
from sklearn import preprocessing
import xgboost as xgb
from sklearn.linear_model import LogisticRegression


# Model evaluation and interpretation
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,  f1_score, recall_score, precision_score
from sklearn.metrics import roc_curve, precision_recall_curve, auc
from sklearn.model_selection import KFold,  cross_val_score # for validation of the model
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay



!pip install shap
import shap

# Hyperparameter Tuning
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

Collecting shap
  Downloading shap-0.44.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (533 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 533.5/533.5 kB 5.4 MB/s eta 0:00:00
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from shap) (1.23.5)
Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from shap) (1.11.4)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from shap) (1.2.2)
Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from shap) (1.5.3)
Requirement already satisfied: tqdm>=4.27.0 in /usr/local/lib/python3.10/dist-packages (from shap) (4.66.1)
Requirement already satisfied: packaging>20.9 in /usr/local/lib/python3.10/dist-packages (from shap) (23.2)
Collecting slicer==0.0.7 (from shap)
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Requirement already satisfied: numba in /usr/local/lib/python3.10/dist-packages (from shap) (0.58.1)
Requirement already satisfied: cloudpickle in /usr/local/lib/python3.10/dist-packages (from shap) (2.2.1)
Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba->shap) (0.41.1)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->shap) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->shap) (2023.3.post1)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->shap) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->shap) (3.2.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->shap) (1.16.0)
Installing collected packages: slicer, shap
Successfully installed shap-0.44.0 slicer-0.0.7

data_path = '/content/gdrive/MyDrive/0_Career/Purpose Financial/Purpose Financial Data Challenge 2024/'
df = pd.read_csv(data_path + 'train.csv')

df.columns

Index(['id', 'date_of_birth', 'number_dependants', 'credit_utilization',
       'debt_to_income_ratio', 'monthly_income', 'number_open_credit_lines',
       'number_open_loans', 'number_90_days_past_due', 'number_charged_off',
       'score1', 'score2', 'target'],
      dtype='object')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20839 entries, 0 to 20838
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        20839 non-null  int64  
 1   date_of_birth             20839 non-null  object 
 2   number_dependants         20839 non-null  int64  
 3   credit_utilization        20839 non-null  float64
 4   debt_to_income_ratio      20839 non-null  float64
 5   monthly_income            20839 non-null  int64  
 6   number_open_credit_lines  20839 non-null  int64  
 7   number_open_loans         20839 non-null  int64  
 8   number_90_days_past_due   20839 non-null  int64  
 9   number_charged_off        20839 non-null  int64  
 10  score1                    20839 non-null  int64  
 11  score2                    20839 non-null  int64  
 12  target                    20839 non-null  int64  
dtypes: float64(2), int64(10), object(1)
memory usage: 2.1+ MB

df.duplicated().sum()

0

df.describe()

df.head()

#unbalanced dataset
df['target'].value_counts(normalize=True)
df['target'].value_counts()

0    18710
1     2129
Name: target, dtype: int64

# convert DOB -> Age [yrs]
df['age'] = (pd.to_datetime('now',utc=True) - pd.to_datetime(df['date_of_birth'],utc=True)).astype('timedelta64[Y]').astype('int')
df[['date_of_birth', 'age']].sort_values('date_of_birth')

bins = np.linspace(20, 200, 19)

fig, ax = plt.subplots(1,2, figsize = (8,3), dpi=100)

ax[0].hist(df['age'],bins=bins, edgecolor='grey', alpha=0.5)
ax[0].set_xlabel('Age [yrs]')
ax[0].set_ylabel('count')

ax[1].set_yscale('log')
ax[1].hist(df['age'],bins=bins, edgecolor='grey', alpha=0.5)
ax[1].set_xlabel('Age [yrs]')
ax[1].set_ylabel('log(count)')

plt.tight_layout()

age_cut = 80

bins = np.arange(age_cut, 200, 1)
bins2 = np.arange(age_cut, 100, 1)

fig, ax = plt.subplots(1,2, figsize = (8,3), dpi=100)

ax[0].set_yscale('log')
ax[0].hist(df[df['age']>=age_cut]['age'],bins=bins, edgecolor='grey', alpha=0.5)
ax[0].set_xlabel('Age [yrs]')
ax[0].set_ylabel('count')
ax[0].set_ylabel('log(count)')
ax[0].set_ylim(0.8)

ax[1].set_yscale('log')
ax[1].hist(df[df['age']>=age_cut]['age'],bins=bins2, edgecolor='grey', alpha=0.5)
ax[1].set_xlabel('Age [yrs]')
ax[1].set_ylabel('log(count)')
ax[1].set_xlim(age_cut)
ax[1].set_ylim(0.8)

plt.tight_layout()

%%time
sns.pairplot(df.drop(columns = ['id']), hue='target', corner=True)

Output hidden; open in https://colab.research.google.com to view.

plt.scatter(df['score1'][df['target']==0], df['score2'][df['target']==0], s=5, alpha=0.5, label = 'No Default (0)')
plt.scatter(df['score1'][df['target']==1], df['score2'][df['target']==1], s=5, alpha=0.5, label = 'Default (1)')
plt.legend()
plt.xlabel('score1')
plt.ylabel('score2')

Text(0, 0.5, 'score2')

fig, ax = plt.subplots(1,2, figsize=(8,3))
ax[0].scatter(df['monthly_income'][df['target']==0], df['score1'][df['target']==0], s=5, alpha=0.5, label='No Default (0)')
ax[0].scatter(df['monthly_income'][df['target']==1], df['score1'][df['target']==1], s=5, alpha=0.5, label='Default (1)')

ax[0].set_xlabel('monthly_income')
ax[0].set_ylabel('score1')


ax[1].scatter(df['monthly_income'][df['target']==0], df['score2'][df['target']==0], s=5, alpha=0.5, label='No Default (0)')
ax[1].scatter(df['monthly_income'][df['target']==1], df['score2'][df['target']==1], s=5, alpha=0.5, label='Default (1)')
ax[1].set_xlabel('monthly_income')
ax[1].set_ylabel('score2')

Text(0, 0.5, 'score2')

corr = df.corr()

trimask = np.triu(np.ones_like(corr, dtype=bool))

# Create a heatmap to visualize how correlated variables are
plt.figure(figsize=(8, 6))
ax = plt.gca()

sns.heatmap(corr,
    annot=True,
    cmap="crest", mask=trimask,ax=ax)

# this block will only add annotations for correlations with magnitude >0.1
for t in ax.texts:
    if abs(float(t.get_text()))>=0.1:
        t.set_text(t.get_text())
    else:
        t.set_text("")

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

# columns with discrete values
discrete_cols = [ 'number_dependants', 'number_open_credit_lines',
       'number_open_loans', 'number_90_days_past_due', 'number_charged_off']


target_var='target'

annotate=True
data=df
for col in discrete_cols:
    fig, ax = plt.subplots(1, 1, figsize=(8,6))
    ax.set_xscale('log')
    gb = df.groupby([col, target_var])[target_var]
    gb.count().unstack().plot(kind='barh', legend=True, ax=ax)
    for c, container in enumerate(ax.containers):

        for i, p in enumerate(container):
            group_total = ax.containers[0][i].get_width() + ax.containers[1][i].get_width()

            width = p.get_width()
            height = p.get_height()
            x, y = p.get_xy()

            if annotate:
                ax.annotate('{:.1f}%'.format(100*width/group_total), (x +width + 5, i + (c-1)*0.25  + 0.05), ha='left')

    print('\n')

#list of contniuous numerical columns
continuous_cols = [  'age', 'credit_utilization',
       'debt_to_income_ratio', 'monthly_income',
       'score1', 'score2']

target_var='target'


data=df
for col in continuous_cols:
    fig, ax = plt.subplots(1, 1, figsize=(4,3))


    bins = np.linspace(np.nanmin(data[col]), np.nanmax(data[col]), 30)

    ax.hist(data[data[target_var]==0][col], label='No Default (0)', bins=bins, alpha=0.5, edgecolor='grey', density=True)
    ax.hist(data[data[target_var]==1][col], label='Default (1)', bins=bins, alpha=0.5, edgecolor='grey', density=True)
    ax.legend()
    ax.set_xlabel(col)
    ax.set_ylabel('Normalized')

# A simple average of the two score values


data=copy.deepcopy(df) # make a copy so that we dont affect our actual data

data['score_average'] = (data['score1'] + data['score2'])/2
col = 'score_average'
fig, ax = plt.subplots(1, 1, figsize=(4,3))


df1 = data[data[target_var]==1][col]
df0 = data[data[target_var]==0][col]

bins = np.linspace(np.nanmin(data[col]), np.nanmax(data[col]), 30)

hist2 = ax.hist(df0, label='No Default (0)', bins=bins, alpha=0.5, edgecolor='grey', density=True)[0]
hist1 = ax.hist(df1, label='Default (1)', bins=bins, alpha=0.5, edgecolor='grey', density=True)[0]
ax.legend()
ax.set_xlabel(col)
ax.set_ylabel('Normalized')

# compare normalized distributions with Kolmogorov-Smirnov 2 sample test
# Note this is only used for comparing how discriminative differnt feature
# may be relative to one another (e.g. score_average vs score_income_multiplier)
# It is not being used for intepreting the actual staistical significances
stats.ks_2samp(hist1, hist2)

KstestResult(statistic=0.20689655172413793, pvalue=0.5721774392909802, statistic_location=0.000646714231755061, statistic_sign=1)

''' Here we try multiplying the two categories (score and income) that look to potentially
 have some predictive power, and normalize by the mean of each feature.'''

data=copy.deepcopy(df) # make a copy so that we dont affect our actual data

data['score_average'] = (data['score1'] + data['score2'])/2
data['score_income_multiplier'] = data['score_average']/data['score_average'].mean() * data['monthly_income']/data['monthly_income'].mean()

col = 'score_income_multiplier'
fig, ax = plt.subplots(1, 1, figsize=(4,3))


df1 = data[data[target_var]==1][col]
df0 = data[data[target_var]==0][col]


bins = np.linspace(np.nanmin(data[col]), np.nanmax(data[col]), 30)

hist2 = ax.hist(df0, label='No Default (0)', bins=bins, alpha=0.5, edgecolor='grey', density=True)[0]
hist1 = ax.hist(df1, label='Default (1)', bins=bins, alpha=0.5, edgecolor='grey', density=True)[0]
ax.legend()
ax.set_xlabel(col)
ax.set_ylabel('Normalized')


# compare normalized distributions with Kolmogorov-Smirnov 2 sample test
stats.ks_2samp(hist1, hist2)

KstestResult(statistic=0.4827586206896552, pvalue=0.0019790709048350735, statistic_location=0.0, statistic_sign=1)

def data_preprocess(df):

    '''
        Perform data preprocessing on an input dataframe.
        Checks for null and duplicates, adds new features,
        removes records with ages >95 and sets negative_dependants values to 0.
        No columns are removed in this stage (e.g. date_of_birth)
    '''

    # Check nulls and duplicates
    num_nulls = df.isnull().sum().sum()
    num_duplicates = df.duplicated().sum()
    if num_nulls != 0:
        print("Warning: {} null values".format(num_nulls))

    if num_duplicates != 0:
        print("Warning: {} duplicate rows".format(num_duplicates))

    # new features
    df['age'] = (pd.to_datetime('now',utc=True) - pd.to_datetime(df['date_of_birth'],utc=True)).astype('timedelta64[Y]').astype('int')
    df['score_average'] = (df['score1'] + df['score2'])/2
    # product of mean normalized score average and monthly income
    df['score_income_multiplier'] = df['score_average']/df['score_average'].mean() * df['monthly_income']/df['monthly_income'].mean()

    # Remove records with Age above designated upper limit
    if len(df[df['age']>95]) != 0:
        print('{} records exceeds upper age limit'.format(len(df[df['age']>95])))
    df = df[df['age']<=95]

    # we do not have a lower age cutoff, but it would be good to get a warning
    # if we have some anomalous lower age values,
    # esp. e.g. something like 5 years that would not make sense
    if len(df[df['age']<24]) != 0:
        print('{} records exceeds lower age limit (min {})'.format(len(df[df['age']<24]), df[df['age']<24]['age'].min()))

    # Set negative dpendant values to 0
    if len(df[df['number_dependants']<0] != 0):
        print('{} rows with negative dependants, setting equal to 0 (min {})'.format(len(df[df['number_dependants']<0] != 0), df['number_dependants'].min()))
    df.loc[df['number_dependants']==-1, 'number_dependants'] = 0

    return df

df = pd.read_csv(data_path + 'train.csv')

df = data_preprocess(df)

7 records exceeds upper age limit
209 rows with negative dependants, setting equal to 0 (min -1)

new_feats = []

corr = df.corr()

trimask = np.triu(np.ones_like(corr, dtype=bool))

# Create a heatmap to visualize how correlated variables are
plt.figure(figsize=(8, 6))
ax = plt.gca()
sns.heatmap(corr,
    annot=True,
    cmap="crest", mask=trimask,ax=ax)


for t in ax.texts:
    if abs(float(t.get_text()))>=0.1:
        t.set_text(t.get_text()) #if the value is greater than 0.4 then I set the text
    else:
        t.set_text("") # if not it sets an empty text

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

### Train-test-split

def get_train_test_dict(df, target_var,id_col=None, cols=[], drop=True):
    '''
        utility to easily get the train and test datasets stored in a dictionary.
        columns chosen by cols can be kept or dropped. If an id col is included
        the values of that column will also be included in the dictionary
    '''
    fit_df = copy.deepcopy(df)



    if id_col != None:
        try:
            cols.remove(id_col)
            print('remove id_col')
        except:
            print('id col not in cols')


    if drop:
        if target_var not in cols:
            cols.append(target_var)
        X, y = fit_df.drop(columns=drop_train_cols), fit_df[target_var]
    else:
        if id_col !=None:
            X, y = fit_df[cols+[id_col]], fit_df[target_var]
        else:
            X, y = fit_df[cols], fit_df[target_var]


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    if id_col!=None:
        train_id = X_train[id_col].to_list()
        test_id = X_test[id_col].to_list()

        X_train.drop(columns=[id_col], inplace=True)
        X_test.drop(columns=[id_col], inplace=True)
    else:
        train_id = 'No id column provided'
        test_id = 'No id column provided'




    evalset = [(X_train, y_train), (X_test,y_test)]

    tts_dict = {'X':X,
                'y':y,
                'X_train':X_train,
                'y_train':y_train,
                'X_test':X_test,
                'y_test':y_test,
                'eval_set':evalset,
                'train_ids':train_id,
                'test_ids':test_id}

    return tts_dict

seed=0
target_var  =  'target'

# These columns will be dropped from the training data
keep_train_cols = ['monthly_income', 'score_average']



lr_base_dict = get_train_test_dict(df, target_var, id_col = 'id', cols=keep_train_cols, drop=False)
lr_base_dict['X_train']

id col not in cols

lr_base_model = LogisticRegression(random_state=0,  class_weight = 'balanced')

print('features:', lr_base_dict['X_train'].columns.to_list())

# scale training data
scaler = preprocessing.StandardScaler().fit(lr_base_dict['X_train'])

X_train_scaled = pd.DataFrame(scaler.transform(lr_base_dict['X_train']), columns=lr_base_dict['X_train'].columns)
X_test_scaled = pd.DataFrame(scaler.transform(lr_base_dict['X_test']), columns=lr_base_dict['X_test'].columns)


# Fit (scaled) train data
lr_base_model.fit(X_train_scaled, lr_base_dict['y_train'])

# make predictions for (scaled) test data
lr_base_y_pred = lr_base_model.predict(X_test_scaled)
lr_base_y_proba = lr_base_model.predict_proba(X_test_scaled)


# evaluate predictions
pr_fpr, pr_tpr, pr_thresholds = precision_recall_curve(lr_base_dict['y_test'], lr_base_y_proba[:,1])
auc_pr = auc(pr_tpr, pr_fpr)


roc_fpr, roc_tpr, roc_thresholds = roc_curve(lr_base_dict['y_test'], lr_base_y_proba[:,1])
roc_auc = roc_auc_score(lr_base_dict['y_test'], lr_base_y_pred)
f1 = f1_score(lr_base_dict['y_test'], lr_base_y_pred)
recall = recall_score(lr_base_dict['y_test'], lr_base_y_pred)
precision = precision_score(lr_base_dict['y_test'], lr_base_y_pred)


print("roc_auc: %.2f%%" % (roc_auc * 100.0))
print("auc_pr: %.2f%%" % (auc_pr * 100.0))
print("Recall: {:.2f}%".format(recall * 100.0))
print("Precision: {:.2f}%".format(precision * 100.0))
print("F1: {:.2f}%".format(f1 * 100.0))

print(classification_report(lr_base_dict['y_test'], lr_base_y_pred))

features: ['monthly_income', 'score_average']
roc_auc: 68.43%
auc_pr: 23.14%
Recall: 72.05%
Precision: 19.04%
F1: 30.12%
              precision    recall  f1-score   support

           0       0.95      0.65      0.77      5606
           1       0.19      0.72      0.30       644

    accuracy                           0.66      6250
   macro avg       0.57      0.68      0.54      6250
weighted avg       0.87      0.66      0.72      6250

kfold = KFold(n_splits=10, shuffle=True,random_state=seed)
results = cross_val_score(lr_base_model, X_train_scaled, lr_base_dict['y_train'],scoring='f1', cv=kfold)
print("K-fold f1 score: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))

results = cross_val_score(lr_base_model, X_train_scaled, lr_base_dict['y_train'],scoring='recall', cv=kfold)
print("K-fold recall score: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))

results = cross_val_score(lr_base_model, X_train_scaled, lr_base_dict['y_train'],scoring='precision', cv=kfold)
print("K-fold precision score: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))

K-fold f1 score: 30.46% (2.63%)
K-fold recall score: 72.41% (4.20%)
K-fold precision score: 19.32% (1.96%)

idx = np.argmin(abs(pr_thresholds - 0.5))

plt.plot(pr_tpr, pr_fpr)

plt.axvline(pr_tpr[idx], ls=':', color='k', label='Probability Threshold')
plt.axhline(pr_fpr[idx], ls=':', color='k')
plt.xlabel('recall')
plt.ylabel('precision')
plt.title('Precision-Recall Curve')

Text(0.5, 1.0, 'Precision-Recall Curve')

plt.figure(figsize=(4,3), dpi=150)
ax = plt.gca()
# Compute values for confusion matrix
log_cm = confusion_matrix(lr_base_dict['y_test'], lr_base_y_pred)

# Create display of confusion matrix
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm, display_labels=['No Default', 'Default'])
log_disp.plot(cmap='Blues',ax=ax)

ax.set_title('XGBoost (base)')

print(classification_report(lr_base_dict['y_test'], lr_base_y_pred))

              precision    recall  f1-score   support

           0       0.95      0.65      0.77      5606
           1       0.19      0.72      0.30       644

    accuracy                           0.66      6250
   macro avg       0.57      0.68      0.54      6250
weighted avg       0.87      0.66      0.72      6250

coeff_dict = dict(zip(lr_base_dict['X_train'].columns, list((lr_base_model.coef_[0]))))
coeff_dict =  dict(sorted(coeff_dict.items(), key=lambda item: item[1]))

for key in coeff_dict.keys():
    coeff = coeff_dict[key]

    odds_ratio = np.exp(coeff)
    print( '{} coeff: {:.2f} odds ratio: {:.1f}' .format(key, coeff_dict[key], odds_ratio))

monthly_income coeff: -0.97 odds ratio: 0.4
score_average coeff: -0.75 odds ratio: 0.5

explainer = shap.Explainer(lr_base_model, lr_base_dict['X_train'])
shap_values = explainer(lr_base_dict['X_test'])

shap.plots.beeswarm(shap_values, show=False)

plt.title('LogReg')

Text(0.5, 1.0, 'LogReg')

review_test = copy.deepcopy(lr_base_dict['X_test'])

review_test['id'] = lr_base_dict['test_ids']
review_test['prob'] = lr_base_y_proba[:, 1]
review_test['pred'] = lr_base_y_pred

col = 'monthly_income'

df1 = review_test[review_test['pred']==1]
df0 = review_test[review_test['pred']==0]

plt.scatter(df0['monthly_income'], df0['score_average'], s=5, alpha=0.5, label='No Pred. Default (0)')
plt.scatter(df1['monthly_income'], df1['score_average'], s=5, alpha=0.5, label='Pred. Default (1)')



plt.tricontour(review_test['monthly_income'], review_test['score_average'], review_test['prob'], [0,0.5], linewidths=3, colors='k')
plt.plot(np.nan, np.nan, color='k', label='Decision Boundary - Base')
plt.legend()
plt.xlabel('Monthly Income')
plt.ylabel('Average Score')

Text(0, 0.5, 'Average Score')

f1_scores = []
precision_scores = []
recall_scores = []

eta_vals = np.linspace(0.5, 0.95, 20)

for eta in eta_vals:
    print(eta)
    class_weight = {0:1-eta, 1:eta}
    lr_eta_model = LogisticRegression(random_state=0,  class_weight = class_weight)
    # Fit (scaled) train data
    lr_eta_model.fit(X_train_scaled, lr_base_dict['y_train'])

    # make predictions for (scaled) test data
    lr_eta_y_pred = lr_eta_model.predict(X_test_scaled)
    lr_eta_y_proba = lr_eta_model.predict_proba(X_test_scaled)

    # evaluate predictions
    f1 = f1_score(lr_base_dict['y_test'], lr_eta_y_pred)
    recall = recall_score(lr_base_dict['y_test'], lr_eta_y_pred)
    precision = precision_score(lr_base_dict['y_test'], lr_eta_y_pred)


    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)

0.5
0.5236842105263158
0.5473684210526316
0.5710526315789474
0.5947368421052631

Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.

0.618421052631579
0.6421052631578947
0.6657894736842105
0.6894736842105262
0.7131578947368421
0.7368421052631579
0.7605263157894737
0.7842105263157895
0.8078947368421052
0.831578947368421
0.8552631578947367
0.8789473684210526
0.9026315789473685
0.9263157894736842
0.95

plt.plot(eta_vals/(1-eta_vals), f1_scores, label='f1')
plt.plot(eta_vals/(1-eta_vals), precision_scores, label='precision')
plt.plot(eta_vals/(1-eta_vals), recall_scores, label='recall')
plt.axvline(9, ls=":", color='k', label='default or "balanced" value')
plt.legend()
plt.gca().set_yticks(np.linspace(0,1,11))
plt.xlabel('Positive class weight scale')

Text(0.5, 0, 'Positive class weight scale')

drop_train_cols = ['date_of_birth', 'id', 'score1', 'score2', 'target',
       'score_income_multiplier']
lr_feat_dict = get_train_test_dict(df, target_var, id_col = 'id', cols=drop_train_cols, drop=True)
lr_feat_dict['X_train'].columns

remove id_col

Index(['number_dependants', 'credit_utilization', 'debt_to_income_ratio',
       'monthly_income', 'number_open_credit_lines', 'number_open_loans',
       'number_90_days_past_due', 'number_charged_off', 'age',
       'score_average'],
      dtype='object')

lr_feat_model = LogisticRegression(random_state=0,  class_weight = 'balanced')

print('features included:', lr_feat_dict['X_train'].columns.to_list())

# scale training data
scaler = preprocessing.StandardScaler().fit(lr_feat_dict['X_train'])

X_train_scaled = pd.DataFrame(scaler.transform(lr_feat_dict['X_train']), columns=lr_feat_dict['X_train'].columns)
X_test_scaled = pd.DataFrame(scaler.transform(lr_feat_dict['X_test']), columns=lr_feat_dict['X_test'].columns)


# Fit (scaled) train data
lr_feat_model.fit(X_train_scaled, lr_feat_dict['y_train'])

# make predictions for (scaled) test data
lr_feat_y_pred = lr_feat_model.predict(X_test_scaled)
lr_feat_y_proba = lr_feat_model.predict_proba(X_test_scaled)


# evaluate predictions
pr_fpr, pr_tpr, pr_thresholds = precision_recall_curve(lr_feat_dict['y_test'], lr_feat_y_proba[:,1])
auc_pr = auc(pr_tpr, pr_fpr)

roc_fpr, roc_tpr, roc_thresholds = roc_curve(lr_feat_dict['y_test'], lr_feat_y_proba[:,1])
roc_auc = roc_auc_score(lr_feat_dict['y_test'], lr_feat_y_pred)
f1 = f1_score(lr_feat_dict['y_test'], lr_feat_y_pred)
recall = recall_score(lr_feat_dict['y_test'], lr_feat_y_pred)
precision = precision_score(lr_feat_dict['y_test'], lr_feat_y_pred)


print("roc_auc: %.2f%%" % (roc_auc * 100.0))
print("auc_pr: %.2f%%" % (auc_pr * 100.0))
print("Recall: {:.2f}%".format(recall * 100.0))
print("Precision: {:.2f}%".format(precision * 100.0))
print("F1: {:.2f}%".format(f1 * 100.0))

print(classification_report(lr_feat_dict['y_test'], lr_feat_y_pred))

features included: ['number_dependants', 'credit_utilization', 'debt_to_income_ratio', 'monthly_income', 'number_open_credit_lines', 'number_open_loans', 'number_90_days_past_due', 'number_charged_off', 'age', 'score_average']
roc_auc: 68.45%
auc_pr: 23.00%
Recall: 72.05%
Precision: 19.06%
F1: 30.15%
              precision    recall  f1-score   support

           0       0.95      0.65      0.77      5606
           1       0.19      0.72      0.30       644

    accuracy                           0.66      6250
   macro avg       0.57      0.68      0.54      6250
weighted avg       0.87      0.66      0.72      6250

kfold = KFold(n_splits=10, shuffle=True,random_state=seed)
results = cross_val_score(lr_feat_model, X_train_scaled, lr_feat_dict['y_train'],scoring='f1', cv=kfold)
print("K-fold f1 score: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))

results = cross_val_score(lr_feat_model, X_train_scaled, lr_feat_dict['y_train'],scoring='recall', cv=kfold)
print("K-fold recall score: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))

results = cross_val_score(lr_feat_model, X_train_scaled, lr_feat_dict['y_train'],scoring='precision', cv=kfold)
print("K-fold precision score: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))

K-fold f1 score: 30.44% (2.50%)
K-fold recall score: 72.30% (4.36%)
K-fold precision score: 19.32% (1.86%)

coeff_dict = dict(zip(lr_feat_dict['X_train'].columns, list((lr_feat_model.coef_[0]))))
coeff_dict =  dict(sorted(coeff_dict.items(), key=lambda item: item[1]))

for key in coeff_dict.keys():
    coeff = coeff_dict[key]

    #TODO fix Calculation
    odds = np.exp(coeff)
    print( '{} coeff: {:.2f} odds ratio: {:.2f}' .format(key, coeff_dict[key],odds))

monthly_income coeff: -0.98 odds ratio: 0.38
score_average coeff: -0.75 odds ratio: 0.47
credit_utilization coeff: -0.04 odds ratio: 0.96
number_open_loans coeff: -0.03 odds ratio: 0.97
number_charged_off coeff: -0.03 odds ratio: 0.97
number_90_days_past_due coeff: -0.02 odds ratio: 0.98
number_dependants coeff: 0.00 odds ratio: 1.00
age coeff: 0.01 odds ratio: 1.01
debt_to_income_ratio coeff: 0.02 odds ratio: 1.02
number_open_credit_lines coeff: 0.04 odds ratio: 1.04

review_test_feat = copy.deepcopy(lr_feat_dict['X_test'])

review_test_feat['id'] = lr_feat_dict['test_ids']
review_test_feat['prob'] = lr_feat_y_proba[:, 1]
review_test_feat['pred'] = lr_feat_y_pred

col = 'monthly_income'

df1 = review_test_feat[review_test_feat['pred']==1]
df0 = review_test_feat[review_test_feat['pred']==0]



plt.scatter(df0['monthly_income'], df0['score_average'], s=5, alpha=0.5, label='Pred. No Default (0)')
plt.scatter(df1['monthly_income'], df1['score_average'], s=5, alpha=0.5, label='Pred. Default (1)')

plt.tricontour(review_test_feat['monthly_income'], review_test_feat['score_average'], review_test_feat['prob'], [0,0.5], linewidths=1.5, colors='k')
plt.tricontour(review_test['monthly_income'], review_test['score_average'], review_test['prob'], [0,0.5], linewidths=1.5, colors='r')

plt.plot(np.nan, np.nan, color='r', label='Decision Bound - Base')
plt.plot(np.nan, np.nan, color='k', label='Decision Bound - With add. features')

plt.legend()
plt.xlabel('Monthly Income')
plt.ylabel('Average Score')

Text(0, 0.5, 'Average Score')

explainer = shap.Explainer(lr_feat_model, lr_feat_dict['X_train'])
shap_values = explainer(lr_feat_dict['X_test'])

shap.plots.beeswarm(shap_values, show=False)

plt.title('LogReg')

Text(0.5, 1.0, 'LogReg')

seed=0
target_var  =  'target'

drop_train_cols = ['date_of_birth', 'id']
drop_train_cols = ['date_of_birth', 'id', 'score1', 'score2', 'target', 'score_income_multiplier']
xgb_dict = get_train_test_dict(df, target_var, id_col = 'id', cols=drop_train_cols, drop=True)
#keep_train_cols = ['monthly_income', 'score_average']
#xgb_dict = get_train_test_dict(df, target_var, id_col = 'id', cols=keep_train_cols, drop=False)
xgb_dict['X_train']

remove id_col

# Here we will define our first baseline version of the model, using mainly default parameters
# scale_pos_weight is modified to account for class imbalance

scale_pos_weight = 9 #since there is a ~9/1 ratio of majority - minority records
seed = 0

xgb_model = xgb.XGBClassifier(scale_pos_weight = scale_pos_weight, seed=seed,random_state=seed, eval_metric='aucpr')

xgb_model.fit(xgb_dict['X_train'], xgb_dict['y_train'], eval_set=xgb_dict['eval_set'], verbose=False)


# make predictions for test data
xgb_y_pred = xgb_model.predict(xgb_dict['X_test'])


# evaluate predictions
fpr, tpr, thresholds = precision_recall_curve(xgb_dict['y_test'], xgb_y_pred, pos_label=1)
auc_pr = auc(fpr, tpr)
roc_auc = roc_auc_score(xgb_dict['y_test'], xgb_y_pred)
f1= f1_score(xgb_dict['y_test'], xgb_y_pred)
recall= recall_score(xgb_dict['y_test'], xgb_y_pred)
precision= precision_score(xgb_dict['y_test'], xgb_y_pred)

print("roc_auc: %.2f%%" % (roc_auc * 100.0))
print("auc_pr: %.2f%%" % (auc_pr * 100.0))
print("Recall: {:.2f}%".format(recall * 100.0))
print("Precision: {:.2f}%".format(precision * 100.0))
print("F1: {:.2f}%".format(f1 * 100.0))

roc_auc: 60.41%
auc_pr: 22.19%
Recall: 38.82%
Precision: 19.86%
F1: 26.27%

kfold = KFold(n_splits=10, shuffle=True,random_state=0)
results = cross_val_score(xgb_model, xgb_dict['X_train'], xgb_dict['y_train'],scoring='f1', cv=kfold)
print("K-fold f1 score: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))
results = cross_val_score(xgb_model, xgb_dict['X_train'], xgb_dict['y_train'],scoring='recall', cv=kfold)
print("K-fold recall score: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))
results = cross_val_score(xgb_model, xgb_dict['X_train'], xgb_dict['y_train'],scoring='precision', cv=kfold)
print("K-fold precision score: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))

K-fold f1 score: 24.58% (2.15%)
K-fold recall score: 34.82% (3.19%)
K-fold precision score: 19.10% (2.17%)

plt.figure(figsize=(4,3), dpi=150)
ax = plt.gca()
# Compute values for confusion matrix
log_cm = confusion_matrix(xgb_dict['y_test'], xgb_y_pred)

# Create display of confusion matrix
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm, display_labels=['No Default', 'Default'])
log_disp.plot(cmap='Blues',ax=ax)

ax.set_title('XGBoost (base)')

print(classification_report(xgb_dict['y_test'], xgb_y_pred))

              precision    recall  f1-score   support

           0       0.92      0.82      0.87      5606
           1       0.20      0.39      0.26       644

    accuracy                           0.78      6250
   macro avg       0.56      0.60      0.57      6250
weighted avg       0.85      0.78      0.81      6250

#Here we can check learning cruves for our model using aucpr
results_pre_opt = xgb_model.evals_result()

plt.figure(figsize=(3.5, 3), dpi=150)
# plot learning curves
plt.plot(results_pre_opt['validation_0']['aucpr'], label='Train')
plt.plot(results_pre_opt['validation_1']['aucpr'], label='Test')
plt.ylabel('AUCPR')
plt.xlabel('estimators')
plt.legend()
plt.title('XGBoost (base)')

Text(0.5, 1.0, 'XGBoost (base)')

hyp_params = {
    'max_depth': hp.choice('max_depth', np.arange(1, 10, dtype=int)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0.1),
    'n_estimators': hp.choice('n_estimators', np.arange(10, 300, dtype=int)),
    #'scale_pos_weight' : hp.uniform('scale_pos_weight', 5, 12)
}

# An objective function is needed for the hyperopt tuning
def objective(hyp_params):
    clf=xgb.XGBClassifier(scale_pos_weight=9, seed=seed,random_state=seed,eval_metric='aucpr', **hyp_params)

    evaluation = xgb_dict['eval_set']

    clf.fit(xgb_dict['X_train'], xgb_dict['y_train'],
            eval_set=evaluation,  verbose=False)

    pred = clf.predict(xgb_dict['X_test'])

    recall_clf = recall_score(xgb_dict['y_test'], pred)
    print("Recall SCORE: {:.1f}%".format( recall_clf*100))
    print(hyp_params, '\n\n')

    return {'loss': -recall_clf, 'status': STATUS_OK }

#best_params = {'learning_rate': 0.32613322505015735,
# 'max_depth': 1,
# 'n_estimators': 43,
# 'scale_pos_weight': 11.999695710217374}

%%time

print("finding best hyperparams")
trials = Trials()
best_params = fmin(objective, hyp_params, algo=tpe.suggest, max_evals=100, trials=trials, return_argmin=False, rstate = np.random.default_rng(seed))
print("Best set of hyperparameters: ", best_params)

finding best hyperparams
Recall SCORE: 57.8%
{'learning_rate': 0.08954030922241034, 'max_depth': 7, 'n_estimators': 113}
Recall SCORE: 31.8%
{'learning_rate': 0.11704553756518427, 'max_depth': 8, 'n_estimators': 154}
Recall SCORE: 48.0%
{'learning_rate': 0.4398535736694678, 'max_depth': 4, 'n_estimators': 176}
Recall SCORE: 51.6%
{'learning_rate': 0.13215139914816137, 'max_depth': 9, 'n_estimators': 45}
Recall SCORE: 45.7%
{'learning_rate': 0.8800853037538393, 'max_depth': 5, 'n_estimators': 41}
Recall SCORE: 18.0%
{'learning_rate': 0.3432145749034329, 'max_depth': 9, 'n_estimators': 85}
Recall SCORE: 13.7%
{'learning_rate': 1.072460458201217, 'max_depth': 7, 'n_estimators': 272}
Recall SCORE: 77.3%
{'learning_rate': 0.055291387197299015, 'max_depth': 2, 'n_estimators': 99}
Recall SCORE: 75.8%
{'learning_rate': 0.06771540446456838, 'max_depth': 5, 'n_estimators': 12}
Recall SCORE: 59.3%
{'learning_rate': 0.23193114400056974, 'max_depth': 4, 'n_estimators': 199}
Recall SCORE: 58.1%
{'learning_rate': 0.224544326990115, 'max_depth': 4, 'n_estimators': 219}
Recall SCORE: 38.2%
{'learning_rate': 0.27871592970141734, 'max_depth': 6, 'n_estimators': 131}
Recall SCORE: 16.5%
{'learning_rate': 0.8891142388606061, 'max_depth': 6, 'n_estimators': 168}
Recall SCORE: 12.4%
{'learning_rate': 0.6018322496159735, 'max_depth': 8, 'n_estimators': 205}
Recall SCORE: 41.6%
{'learning_rate': 0.9488853525802273, 'max_depth': 6, 'n_estimators': 29}
Recall SCORE: 19.6%
{'learning_rate': 0.3381862694167717, 'max_depth': 6, 'n_estimators': 278}
Recall SCORE: 77.8%
{'learning_rate': 0.1677349201366436, 'max_depth': 1, 'n_estimators': 274}
Recall SCORE: 70.2%
{'learning_rate': 0.1314695048034014, 'max_depth': 3, 'n_estimators': 261}
Recall SCORE: 11.8%
{'learning_rate': 0.6741064829190943, 'max_depth': 8, 'n_estimators': 128}
Recall SCORE: 65.2%
{'learning_rate': 0.23291803661128407, 'max_depth': 5, 'n_estimators': 72}
Recall SCORE: 78.4%
{'learning_rate': 0.050119746310796305, 'max_depth': 1, 'n_estimators': 257}
Recall SCORE: 78.6%
{'learning_rate': 0.1619075519001385, 'max_depth': 1, 'n_estimators': 196}
Recall SCORE: 76.6%
{'learning_rate': 0.06716659012655338, 'max_depth': 1, 'n_estimators': 37}
Recall SCORE: 76.1%
{'learning_rate': 0.09238241043022498, 'max_depth': 1, 'n_estimators': 32}
Recall SCORE: 78.9%
{'learning_rate': 0.05036491174387498, 'max_depth': 1, 'n_estimators': 223}
Recall SCORE: 78.3%
{'learning_rate': 0.1717500799093038, 'max_depth': 1, 'n_estimators': 102}
Recall SCORE: 76.1%
{'learning_rate': 0.08992538634790602, 'max_depth': 1, 'n_estimators': 26}
Recall SCORE: 70.3%
{'learning_rate': 0.1726825199799033, 'max_depth': 3, 'n_estimators': 227}
Recall SCORE: 76.6%
{'learning_rate': 0.07330778286848053, 'max_depth': 2, 'n_estimators': 235}
Recall SCORE: 77.0%
{'learning_rate': 0.4858754678813139, 'max_depth': 1, 'n_estimators': 223}
Recall SCORE: 41.1%
{'learning_rate': 0.10471101305374897, 'max_depth': 7, 'n_estimators': 193}
Recall SCORE: 79.3%
{'learning_rate': 0.05511151048748964, 'max_depth': 1, 'n_estimators': 118}
Recall SCORE: 53.0%
{'learning_rate': 0.056578585666456814, 'max_depth': 9, 'n_estimators': 104}
Recall SCORE: 76.1%
{'learning_rate': 0.07669639530985467, 'max_depth': 1, 'n_estimators': 49}
Recall SCORE: 75.2%
{'learning_rate': 0.11402162206574153, 'max_depth': 3, 'n_estimators': 106}
Recall SCORE: 71.9%
{'learning_rate': 0.057811139410763046, 'max_depth': 7, 'n_estimators': 20}
Recall SCORE: 77.0%
{'learning_rate': 0.05161102657765, 'max_depth': 2, 'n_estimators': 158}
Recall SCORE: 50.6%
{'learning_rate': 0.08287589684378145, 'max_depth': 9, 'n_estimators': 79}
Recall SCORE: 29.0%
{'learning_rate': 0.13850297666382724, 'max_depth': 8, 'n_estimators': 151}
Recall SCORE: 71.9%
{'learning_rate': 0.06250842867256784, 'max_depth': 4, 'n_estimators': 220}
Recall SCORE: 66.9%
{'learning_rate': 0.0992031291882716, 'max_depth': 5, 'n_estimators': 171}
Recall SCORE: 76.1%
{'learning_rate': 0.050702949022295205, 'max_depth': 1, 'n_estimators': 83}
Recall SCORE: 73.8%
{'learning_rate': 0.20018765085349371, 'max_depth': 2, 'n_estimators': 286}
Recall SCORE: 30.4%
{'learning_rate': 0.2894645516816557, 'max_depth': 7, 'n_estimators': 84}
Recall SCORE: 50.2%
{'learning_rate': 0.43947996610723133, 'max_depth': 4, 'n_estimators': 139}
Recall SCORE: 55.6%
{'learning_rate': 0.07930913526234064, 'max_depth': 9, 'n_estimators': 63}
Recall SCORE: 78.6%
{'learning_rate': 0.12362318143924059, 'max_depth': 1, 'n_estimators': 68}
Recall SCORE: 73.6%
{'learning_rate': 0.06790531460938662, 'max_depth': 5, 'n_estimators': 81}
Recall SCORE: 35.1%
{'learning_rate': 0.1402468872109766, 'max_depth': 8, 'n_estimators': 119}
Recall SCORE: 37.9%
{'learning_rate': 0.28200649479546175, 'max_depth': 6, 'n_estimators': 118}
Recall SCORE: 76.2%
{'learning_rate': 0.6511909235909645, 'max_depth': 1, 'n_estimators': 31}
Recall SCORE: 73.8%
{'learning_rate': 0.10323335991998198, 'max_depth': 3, 'n_estimators': 202}
Recall SCORE: 73.3%
{'learning_rate': 0.06031129633120212, 'max_depth': 4, 'n_estimators': 148}
Recall SCORE: 76.7%
{'learning_rate': 0.3905386823644004, 'max_depth': 1, 'n_estimators': 258}
Recall SCORE: 45.7%
{'learning_rate': 0.19939216874207194, 'max_depth': 6, 'n_estimators': 135}
Recall SCORE: 63.7%
{'learning_rate': 1.0241309977768722, 'max_depth': 5, 'n_estimators': 16}
Recall SCORE: 74.5%
{'learning_rate': 0.7866990445958566, 'max_depth': 1, 'n_estimators': 120}
Recall SCORE: 17.9%
{'learning_rate': 0.15336784389784808, 'max_depth': 8, 'n_estimators': 277}
Recall SCORE: 76.4%
{'learning_rate': 0.1122956439248968, 'max_depth': 2, 'n_estimators': 88}
Recall SCORE: 25.3%
{'learning_rate': 0.5349514257887628, 'max_depth': 7, 'n_estimators': 74}
Recall SCORE: 77.3%
{'learning_rate': 0.04983073963724779, 'max_depth': 1, 'n_estimators': 118}
Recall SCORE: 12.4%
{'learning_rate': 0.3250029493640733, 'max_depth': 9, 'n_estimators': 198}
Recall SCORE: 75.3%
{'learning_rate': 0.06666518434734173, 'max_depth': 3, 'n_estimators': 234}
Recall SCORE: 78.3%
{'learning_rate': 0.19376597843051782, 'max_depth': 1, 'n_estimators': 173}
Recall SCORE: 54.5%
{'learning_rate': 0.09066832627478387, 'max_depth': 6, 'n_estimators': 213}
Recall SCORE: 78.3%
{'learning_rate': 0.12370526468338958, 'max_depth': 1, 'n_estimators': 175}
Recall SCORE: 78.4%
{'learning_rate': 0.15611301844422007, 'max_depth': 1, 'n_estimators': 97}
Recall SCORE: 78.4%
{'learning_rate': 0.08522738047580274, 'max_depth': 1, 'n_estimators': 284}
Recall SCORE: 78.3%
{'learning_rate': 0.2427416890676046, 'max_depth': 1, 'n_estimators': 52}
Recall SCORE: 78.4%
{'learning_rate': 0.054888809010760764, 'max_depth': 1, 'n_estimators': 240}
Recall SCORE: 78.3%
{'learning_rate': 0.0714785530949075, 'max_depth': 1, 'n_estimators': 255}
Recall SCORE: 67.4%
{'learning_rate': 0.09457021352386104, 'max_depth': 4, 'n_estimators': 223}
Recall SCORE: 78.3%
{'learning_rate': 0.1258572854464324, 'max_depth': 1, 'n_estimators': 260}
Recall SCORE: 69.1%
{'learning_rate': 0.38365171013359206, 'max_depth': 2, 'n_estimators': 287}
Recall SCORE: 66.1%
{'learning_rate': 0.07748934224987733, 'max_depth': 7, 'n_estimators': 68}
Recall SCORE: 38.8%
{'learning_rate': 0.05374899965792541, 'max_depth': 8, 'n_estimators': 294}
Recall SCORE: 32.1%
{'learning_rate': 0.25757793654314104, 'max_depth': 5, 'n_estimators': 280}
Recall SCORE: 39.0%
{'learning_rate': 0.14151398578206295, 'max_depth': 9, 'n_estimators': 68}
Recall SCORE: 78.3%
{'learning_rate': 0.06360282886170435, 'max_depth': 1, 'n_estimators': 252}
Recall SCORE: 75.9%
{'learning_rate': 0.11223881638752854, 'max_depth': 3, 'n_estimators': 11}
Recall SCORE: 78.1%
{'learning_rate': 0.18395340575328425, 'max_depth': 1, 'n_estimators': 196}
Recall SCORE: 57.8%
{'learning_rate': 0.08394266335435978, 'max_depth': 6, 'n_estimators': 181}
Recall SCORE: 75.0%
{'learning_rate': 0.07084777742969654, 'max_depth': 4, 'n_estimators': 23}
Recall SCORE: 78.3%
{'learning_rate': 0.21180354632910325, 'max_depth': 1, 'n_estimators': 101}
Recall SCORE: 77.0%
{'learning_rate': 0.059614641772817924, 'max_depth': 2, 'n_estimators': 146}
Recall SCORE: 18.8%
{'learning_rate': 0.774463741573799, 'max_depth': 7, 'n_estimators': 67}
Recall SCORE: 45.0%
{'learning_rate': 0.36485223124369115, 'max_depth': 5, 'n_estimators': 115}
Recall SCORE: 64.3%
{'learning_rate': 0.09908138052113484, 'max_depth': 8, 'n_estimators': 46}
Recall SCORE: 19.7%
{'learning_rate': 0.48572788965754154, 'max_depth': 9, 'n_estimators': 43}
Recall SCORE: 78.1%
{'learning_rate': 0.32682849814571396, 'max_depth': 1, 'n_estimators': 161}
Recall SCORE: 69.7%
{'learning_rate': 0.16662663603638786, 'max_depth': 3, 'n_estimators': 238}
Recall SCORE: 77.8%
{'learning_rate': 0.3028944794775205, 'max_depth': 1, 'n_estimators': 201}
Recall SCORE: 79.7%
{'learning_rate': 0.10726980314505748, 'max_depth': 1, 'n_estimators': 71}
Recall SCORE: 63.5%
{'learning_rate': 0.063441233030227, 'max_depth': 6, 'n_estimators': 170}
Recall SCORE: 75.6%
{'learning_rate': 0.050175906096786536, 'max_depth': 4, 'n_estimators': 86}
Recall SCORE: 78.3%
{'learning_rate': 0.10434813269085623, 'max_depth': 1, 'n_estimators': 241}
Recall SCORE: 76.9%
{'learning_rate': 0.07927950607955792, 'max_depth': 2, 'n_estimators': 147}
Recall SCORE: 20.2%
{'learning_rate': 0.14883099232410887, 'max_depth': 8, 'n_estimators': 230}
Recall SCORE: 70.5%
{'learning_rate': 0.11816012455603639, 'max_depth': 7, 'n_estimators': 22}
Recall SCORE: 69.7%
{'learning_rate': 0.05856620034208548, 'max_depth': 5, 'n_estimators': 174}
100%|██████████| 100/100 [03:33<00:00,  2.13s/trial, best loss: -0.796583850931677]
Best set of hyperparameters:  {'learning_rate': 0.10726980314505748, 'max_depth': 1, 'n_estimators': 71}
CPU times: user 4min 17s, sys: 1.2 s, total: 4min 18s
Wall time: 3min 33s

#model_HT is the Hypertuned model

model_HT = xgb.XGBClassifier( scale_pos_weight=9, seed=seed, eval_metric='aucpr',  **best_params)

model_HT.fit(xgb_dict['X_train'], xgb_dict['y_train'], eval_set=xgb_dict['eval_set'], verbose=False)

# make predictions for test data
y_pred = model_HT.predict(xgb_dict['X_test'])

# evaluate predictions
fpr, tpr, thresholds = precision_recall_curve(xgb_dict['y_test'], y_pred, pos_label=1)
auc_pr = auc(fpr, tpr)
roc_auc = roc_auc_score(xgb_dict['y_test'], y_pred)

f1_HT = f1_score(xgb_dict['y_test'], y_pred)
recall_HT = recall_score(xgb_dict['y_test'], y_pred)
precision_HT = precision_score(xgb_dict['y_test'], y_pred)




print("roc_auc: %.2f%%" % (roc_auc * 100.0))
print("auc_pr: %.2f%%" % (auc_pr * 100.0))
print("Recall: {:.2f}%".format(recall_HT * 100.0))
print("Precision: {:.2f}%".format(precision_HT * 100.0))
print("F1: {:.2f}%".format(f1_HT * 100.0))

plt.figure(figsize=(4,3), dpi=150)
ax = plt.gca()

# Compute values for confusion matrix
log_cm = confusion_matrix(xgb_dict['y_test'], y_pred)

# Create display of confusion matrix
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm, display_labels=["No Default", "Default"])

log_disp.plot(cmap='Blues',ax=ax)
ax.set_title('XGBoost')

roc_auc: 68.71%
auc_pr: 39.48%
Recall: 79.66%
Precision: 17.81%
F1: 29.11%

Text(0.5, 1.0, 'XGBoost')

kfold = KFold(n_splits=10, shuffle=True,random_state=0)
results = cross_val_score(model_HT, xgb_dict['X_train'], xgb_dict['y_train'],scoring='f1', cv=kfold)
print("K-fold f1 score: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))

results = cross_val_score(model_HT, xgb_dict['X_train'], xgb_dict['y_train'],scoring='recall', cv=kfold)
print("K-fold recall score: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))

results = cross_val_score(model_HT, xgb_dict['X_train'], xgb_dict['y_train'],scoring='precision', cv=kfold)
print("K-fold precision score: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))

K-fold f1 score: 29.39% (2.45%)
K-fold recall score: 77.63% (3.17%)
K-fold precision score: 18.16% (1.79%)

#Here we can check learning cruves for our model using aucpr
results_pre_opt = model_HT.evals_result()

plt.figure(figsize=(3.5, 3), dpi=150)
# plot learning curves
plt.plot(results_pre_opt['validation_0']['aucpr'], label='Train')
plt.plot(results_pre_opt['validation_1']['aucpr'], label='Test')
plt.ylabel('AUCPR')
plt.xlabel('estimators')
plt.title('XGBoost')
plt.legend()

<matplotlib.legend.Legend at 0x7fa1c9eb5e40>

fig, ax = plt.subplots(1,2,figsize=(18,10))
# plot feature importance
xgb.plot_importance(model_HT, max_num_features=None,ax=ax[0], ylabel=None,title='Feature importance - Gain - XGBoost', grid=False, importance_type='gain', values_format = "{v:.1f}")
ax[0].tick_params(axis='both', which='major', labelsize=10)
ax[0].tick_params(axis='both', which='minor', labelsize=10)
ax[0].set_xlabel('Gain')
ax[0].set_yticklabels(ax[0].get_yticklabels(), rotation=45)
ax[0].spines['top'].set_visible(False)
ax[0].spines['right'].set_visible(False)


xgb.plot_importance(model_HT, ax=ax[1], ylabel=None,title='Feature importance - F score - XGBoost', grid=False,  values_format = "{v:.1f}")
ax[1].tick_params(axis='both', which='major', labelsize=10)
ax[1].tick_params(axis='both', which='minor', labelsize=10)
ax[1].set_yticklabels(ax[1].get_yticklabels(), rotation=45)
ax[1].spines['top'].set_visible(False)
ax[1].spines['right'].set_visible(False)
#plt.savefig('feat_import_F.png', facecolor='white', transparent=False)

explainer = shap.Explainer(model_HT, xgb_dict['X_train'])
shap_values = explainer(xgb_dict['X_test'])

shap.plots.beeswarm(shap_values, show=False)

plt.title('XGBoost')

[21:44:55] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.

Text(0.5, 1.0, 'XGBoost')

test_df = pd.read_csv(data_path + 'test.csv')

test_df = data_preprocess(test_df)
ids = test_df['id'].tolist()

13 rows with negative dependants, setting equal to 0 (min -1)

test_df_lr = test_df[lr_base_dict['X_train'].columns]

test_df_xgb = test_df[xgb_dict['X_train'].columns]

# make predictions for  test data using XGBoost
y_pred_xgb = model_HT.predict(test_df_xgb)
y_proba_xgb = model_HT.predict_proba(test_df_xgb)

# make predictions for (scaled) test data using Logistic Regression
scaler = preprocessing.StandardScaler().fit(lr_base_dict['X_train'])
test_df_lr_scaled = pd.DataFrame(scaler.transform(test_df_lr), columns=test_df_lr.columns)
y_pred_lr = lr_base_model.predict(test_df_lr_scaled)
y_proba_lr = lr_base_model.predict_proba(test_df_lr_scaled)

plt.figure(figsize=(4, 3), dpi=150)
plt.plot([0,1], [0,1], lw=1, color='k', label='1:1')
plt.scatter(y_proba_xgb[:, 1], y_proba_lr[:, 1], s=0.5)



plt.axvline(0.5, lw=1, color='k', ls=':')
plt.axhline(0.5, lw=1, color='k', ls=':')

plt.ylabel('LogReg prob')
plt.xlabel('XGBoost prob')
plt.legend()

<matplotlib.legend.Legend at 0x7fa1c95b1b10>

plt.figure(figsize=(4,3), dpi=150)
ax = plt.gca()

log_cm = confusion_matrix(y_pred_lr, y_pred_xgb)

# Create display of confusion matrix
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm, display_labels=['No Default', 'Default'])
log_disp.plot(cmap='Blues',ax=ax)


ax.set_ylabel('LogReg')
ax.set_xlabel('XGBoost')

Text(0.5, 0, 'XGBoost')

submission_df = pd.DataFrame({'id':ids, 'prediction':y_proba_xgb[:,1], 'outcome':y_pred_xgb})

submission_df.to_csv(data_path + 'submission.csv', index=False)

	id	number_dependants	credit_utilization	debt_to_income_ratio	monthly_income	number_open_credit_lines	number_open_loans	number_90_days_past_due	number_charged_off	score1	score2	target
count	20839.000000	20839.000000	2.083900e+04	20839.000000	20839.000000	20839.000000	20839.000000	20839.000000	20839.000000	20839.000000	20839.00000	20839.000000
mean	10923.946878	0.483421	4.815929e-02	0.333507	2290.772110	4.997745	2.027497	0.098901	0.098901	669.511637	669.29440	0.102164
std	6295.585008	0.719653	4.571744e-02	0.117332	320.090288	2.248021	1.431671	0.314963	0.315420	98.005767	98.60763	0.302871
min	1.000000	-1.000000	3.932283e-07	0.029386	2000.000000	0.000000	0.000000	0.000000	0.000000	500.000000	500.00000	0.000000
25%	5482.000000	0.000000	1.436676e-02	0.247782	2100.000000	3.000000	1.000000	0.000000	0.000000	584.000000	583.00000	0.000000
50%	10929.000000	0.000000	3.478343e-02	0.325432	2200.000000	5.000000	2.000000	0.000000	0.000000	669.000000	670.00000	0.000000
75%	16366.500000	1.000000	6.789546e-02	0.411210	2400.000000	6.000000	3.000000	0.000000	0.000000	755.000000	754.00000	0.000000
max	21839.000000	6.000000	5.387018e-01	0.801838	5000.000000	15.000000	9.000000	3.000000	3.000000	839.000000	839.00000	1.000000

	date_of_birth	age
422	1829-06-03	194
1714	1829-06-09	194
20259	1830-07-05	193
5192	1858-04-07	165
12348	1886-06-30	137
...	...	...
9708	1999-05-16	24
18588	1999-05-17	24
1332	1999-05-18	24
6244	1999-05-20	24
10883	1999-05-22	24

	monthly_income	score_average
6342	2000	709.0
977	2700	556.5
11172	2300	795.5
6934	2100	553.5
8903	2000	777.5
...	...	...
13128	2000	702.0
19654	2000	747.5
9849	2000	762.0
10803	2500	598.0
2734	2600	648.0

	number_dependants	credit_utilization	debt_to_income_ratio	monthly_income	number_open_credit_lines	number_open_loans	number_90_days_past_due	number_charged_off	age	score_average
6342	0	0.046453	0.242645	2000	3	1	0	0	62	709.0
977	0	0.090836	0.243761	2700	9	2	0	1	87	556.5
11172	0	0.067976	0.209619	2300	5	1	0	0	58	795.5
6934	1	0.156632	0.486520	2100	5	1	0	0	60	553.5
8903	0	0.013837	0.417057	2000	5	2	0	0	36	777.5
...	...	...	...	...	...	...	...	...	...	...
13128	0	0.015057	0.230730	2000	4	0	0	0	79	702.0
19654	0	0.107423	0.323088	2000	6	2	0	0	73	747.5
9849	0	0.057702	0.338929	2000	3	3	0	0	89	762.0
10803	0	0.127954	0.115117	2500	2	2	0	0	72	598.0
2734	1	0.038035	0.205415	2600	8	2	0	0	45	648.0

Purpose Financial Data Challenge: Predicting Customer Default¶

Problem:¶

Summary of results¶

Exploratory Data Analysis (EDA)¶

Age of customers¶

Pairplot¶

Correlation Matrix¶

Bars and histograms¶

EDA Summary¶

Data Cleaning, Feature Engineering, and Feature selection¶

Data cleaning:¶

Feature Engineering:¶

Feature Selection¶

Modeling¶

Train-Test-Split¶

Logistic Regression¶

Initial Results¶

Interpretations and model explanations¶

Optimizing class weights¶

Additional features¶

Logistic Regression Summary¶

XGBoost Classifier¶

Initial Results¶

Hyperparameter Tuning¶

Feature importances and SHAP Values¶

XGBoost Summary¶

Modeling Results¶

Testing Data and Submission file¶

Conclusions¶

	id	date_of_birth	number_dependants	credit_utilization	debt_to_income_ratio	monthly_income	number_open_credit_lines	number_open_loans	number_charged_off	score1	score2
0	1	1993-03-14	1	0.036495	0.208897	2400	6	2	0	570	817
1	2	1966-09-27	1	0.023423	0.260438	2200	6	1	0	741	756
2	3	1931-01-11	2	0.027205	0.335922	3000	6	1	1	805	779
3	4	1953-01-07	1	0.009141	0.353779	2100	2	4	0	573	829
4	5	1943-11-17	0	0.068424	0.314966	2500	2	3	0	833	629