The HR department at Salifort Motors wants to take some initiatives to improve employee satisfaction levels at the company. They collected data from employees, but now they don't know what to do with it. They have brought you in as a data analytics professional and asked you to provide data-driven suggestions based on your understanding of the data. Their central question: what's likely to make an employee leave the company?
Your goals in this project are to analyze the data collected by the HR department and to build a model that predicts whether or not an employee will leave the company.
If you can predict employees likely to quit, it might be possible to identify factors that contribute to their leaving. Because it is time-consuming and expensive to find, interview, and hire new employees, increasing employee retention will be beneficial to the company.
The dataset contains 15,000 rows and 10 columns for the variables listed below.
Source on Kaggle.
Variable | Description
---|---
satisfaction_level | Employee-reported job satisfaction level [0–1]
last_evaluation | Score of employee's last performance review [0–1]
number_project | Number of projects employee contributes to
average_monthly_hours | Average number of hours employee worked per month
time_spend_company | How long the employee has been with the company (years)
Work_accident | Whether or not the employee experienced an accident while at work
left | Whether or not the employee left the company
promotion_last_5years | Whether or not the employee was promoted in the last 5 years
Department | The employee's department
salary | The employee's salary level (low, medium, or high)
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, plot_importance
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score # for validation of the model
from sklearn.metrics import classification_report, accuracy_score, precision_score, \
recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
df0 = pd.read_csv("HR_capstone_dataset.csv")
df0.sample(5)
 | satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | left | promotion_last_5years | Department | salary
---|---|---|---|---|---|---|---|---|---|---
11879 | 0.78 | 0.80 | 3 | 256 | 2 | 0 | 0 | 0 | IT | medium |
10740 | 0.97 | 0.83 | 3 | 238 | 2 | 0 | 0 | 0 | support | medium |
6655 | 0.95 | 0.49 | 4 | 178 | 2 | 0 | 0 | 0 | sales | low |
12347 | 0.11 | 0.80 | 6 | 282 | 4 | 0 | 1 | 0 | technical | medium |
2926 | 0.82 | 0.84 | 3 | 139 | 2 | 1 | 0 | 0 | sales | low |
Data Dictionary
satisfaction_level - float64 - The employee's self-reported satisfaction level [0–1]
last_evaluation - float64 - Score of employee's last performance review [0–1]
number_project - int64 - Number of projects employee contributes to
average_monthly_hours - int64 - Average number of hours employee worked per month
time_spend_company - int64 - How long the employee has been with the company (years)
work_accident - int64 - Whether or not the employee experienced an accident while at work
left - int64 - Whether or not the employee left the company
promotion_last_5years - int64 - Whether or not the employee was promoted in the last 5 years
department - str - The employee's department
salary - str - The employee's salary (low, medium, or high)
# Gather basic information about the data
df0.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64
 3   average_montly_hours   14999 non-null  int64
 4   time_spend_company     14999 non-null  int64
 5   Work_accident          14999 non-null  int64
 6   left                   14999 non-null  int64
 7   promotion_last_5years  14999 non-null  int64
 8   Department             14999 non-null  object
 9   salary                 14999 non-null  object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB
df0.columns
Index(['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'left', 'promotion_last_5years', 'Department', 'salary'], dtype='object')
No null values, which makes things easy.
Based on the data dictionary we have:
- ordinal categorical: 'salary' (needs encoding)
- discrete: 'number_project', 'average_montly_hours', 'time_spend_company'
- continuous: 'satisfaction_level' (0–1), 'last_evaluation' (0–1)
- binary: 'promotion_last_5years', 'Work_accident', 'left' (target)
- categorical: 'Department'
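A quick cardinality check backs this split up (a sketch):
# Number of unique values per column; low counts flag discrete/categorical variables
print(df0.nunique().sort_values())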
# Gather descriptive statistics about the data
df0.describe()
 | satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | left | promotion_last_5years
---|---|---|---|---|---|---|---|---
count | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 |
mean | 0.612834 | 0.716102 | 3.803054 | 201.050337 | 3.498233 | 0.144610 | 0.238083 | 0.021268 |
std | 0.248631 | 0.171169 | 1.232592 | 49.943099 | 1.460136 | 0.351719 | 0.425924 | 0.144281 |
min | 0.090000 | 0.360000 | 2.000000 | 96.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.440000 | 0.560000 | 3.000000 | 156.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 0.640000 | 0.720000 | 4.000000 | 200.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 0.820000 | 0.870000 | 5.000000 | 245.000000 | 4.000000 | 0.000000 | 0.000000 | 0.000000 |
max | 1.000000 | 1.000000 | 7.000000 | 310.000000 | 10.000000 | 1.000000 | 1.000000 | 1.000000 |
As a data cleaning step, rename the columns as needed. Standardize the column names so that they are all in snake_case, correct any column names that are misspelled, and make column names more concise as needed.
# Display all column names
df0.columns
Index(['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'left', 'promotion_last_5years', 'Department', 'salary'], dtype='object')
# Rename columns as needed
col_renames = {'number_project':'number_of_projects',
'average_montly_hours':'average_monthly_hours',
'time_spend_company':'years_at_company',
'Work_accident':'work_accident',
'Department':'department'}
df0.rename(columns=col_renames, inplace=True)
# Display all column names after the update
df0.columns
Index(['satisfaction_level', 'last_evaluation', 'number_of_projects', 'average_monthly_hours', 'years_at_company', 'work_accident', 'left', 'promotion_last_5years', 'department', 'salary'], dtype='object')
Check for any missing values in the data.
# Check for missing values
df0.isnull().sum()
satisfaction_level       0
last_evaluation          0
number_of_projects       0
average_monthly_hours    0
years_at_company         0
work_accident            0
left                     0
promotion_last_5years    0
department               0
salary                   0
dtype: int64
Check for any duplicate entries in the data.
# Check for duplicates
df0.duplicated().sum()
3008
# Inspect some rows containing duplicates as needed
df0[df0.duplicated()]
 | satisfaction_level | last_evaluation | number_of_projects | average_monthly_hours | years_at_company | work_accident | left | promotion_last_5years | department | salary
---|---|---|---|---|---|---|---|---|---|---
396 | 0.46 | 0.57 | 2 | 139 | 3 | 0 | 1 | 0 | sales | low |
866 | 0.41 | 0.46 | 2 | 128 | 3 | 0 | 1 | 0 | accounting | low |
1317 | 0.37 | 0.51 | 2 | 127 | 3 | 0 | 1 | 0 | sales | medium |
1368 | 0.41 | 0.52 | 2 | 132 | 3 | 0 | 1 | 0 | RandD | low |
1461 | 0.42 | 0.53 | 2 | 142 | 3 | 0 | 1 | 0 | sales | low |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
14994 | 0.40 | 0.57 | 2 | 151 | 3 | 0 | 1 | 0 | support | low |
14995 | 0.37 | 0.48 | 2 | 160 | 3 | 0 | 1 | 0 | support | low |
14996 | 0.37 | 0.53 | 2 | 143 | 3 | 0 | 1 | 0 | support | low |
14997 | 0.11 | 0.96 | 6 | 280 | 4 | 0 | 1 | 0 | support | low |
14998 | 0.37 | 0.52 | 2 | 158 | 3 | 0 | 1 | 0 | support | low |
3008 rows × 10 columns
# Drop duplicates and save resulting dataframe in a new variable as needed
# (.copy() gives an independent frame, avoiding SettingWithCopyWarning later)
df1 = df0.drop_duplicates().copy()
# Display first few rows of new dataframe as needed
df1.sample(5)
 | satisfaction_level | last_evaluation | number_of_projects | average_monthly_hours | years_at_company | work_accident | left | promotion_last_5years | department | salary
---|---|---|---|---|---|---|---|---|---|---
11867 | 0.56 | 0.59 | 3 | 223 | 3 | 0 | 0 | 0 | support | medium |
4789 | 0.97 | 0.91 | 4 | 257 | 3 | 1 | 0 | 0 | technical | low |
9548 | 0.61 | 0.37 | 4 | 165 | 6 | 0 | 0 | 0 | marketing | medium |
1006 | 0.40 | 0.47 | 2 | 151 | 3 | 0 | 1 | 0 | sales | medium |
1396 | 0.20 | 0.70 | 6 | 281 | 5 | 0 | 1 | 0 | sales | medium |
# Check balance of dataset
print(df1['left'].value_counts(normalize=True))
0    0.833959
1    0.166041
Name: left, dtype: float64
Fairly imbalanced (84/16 retained/left), so we will need to account for that. It may not be too much of an issue for XGBoost, but some upsampling or class weighting could be used, as sketched below.
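A minimal upsampling sketch using sklearn's resample (in practice this should be applied to the training split only, to avoid leaking duplicated rows into the test set):
from sklearn.utils import resample
# Upsample the minority class (leavers) to match the majority class size
majority = df1[df1['left'] == 0]
minority = df1[df1['left'] == 1]
minority_up = resample(minority, replace=True, n_samples=len(majority), random_state=0)
df_balanced = pd.concat([majority, minority_up])
print(df_balanced['left'].value_counts(normalize=True))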
df1.columns
Index(['satisfaction_level', 'last_evaluation', 'number_of_projects', 'average_monthly_hours', 'years_at_company', 'work_accident', 'left', 'promotion_last_5years', 'department', 'salary'], dtype='object')
# Histograms of the continuous columns, split by the target
cont_cols = ['satisfaction_level', 'last_evaluation', 'average_monthly_hours']
target_var = 'left'
data = df1
for col in cont_cols:
    fig, ax = plt.subplots(1, 1, figsize=(4,3))
    bins = np.linspace(np.nanmin(data[col]), np.nanmax(data[col]), 50)
    ax.hist(data[data[target_var]==1][col], label=target_var, bins=bins, alpha=0.5, edgecolor='grey')
    ax.hist(data[data[target_var]==0][col], label='not '+target_var, bins=bins, alpha=0.5, edgecolor='grey')
    ax.legend()
    ax.set_xlabel(col)
# The same histograms, split by salary level (density-normalized)
cont_cols = ['satisfaction_level', 'last_evaluation', 'average_monthly_hours']
check_var = 'salary'
data = df1
for col in cont_cols:
    fig, ax = plt.subplots(1, 1, figsize=(4,3))
    bins = np.linspace(np.nanmin(data[col]), np.nanmax(data[col]), 50)
    for label in data[check_var].unique():
        ax.hist(data[data[check_var]==label][col], label=label, bins=bins, alpha=0.5, edgecolor='grey', density=True)
    ax.legend()
    ax.set_xlabel(col)
Unsurprisingly, low satisfaction indicates the employee has left. Among leavers there are two clear groups in terms of average monthly hours and evaluation scores, and people working ~300 hours or more per month are very likely to leave. Satisfaction could also be split into extremely unhappy, moderately satisfied, and happy groups.
Looking at salary, most of these distributions are pretty similar. For satisfaction level, there is a higher proportion of unsatisfied employees among low/medium earners than among high earners.
# Bar charts of stay/leave counts for each discrete or categorical column,
# annotated with the within-group percentage
disc_cols = ['number_of_projects', 'years_at_company', 'work_accident',
             'promotion_last_5years', 'department', 'salary']
target_var = 'left'
annotate = True
data = df1
for col in disc_cols:
    fig, ax = plt.subplots(1, 1, figsize=(8,6))
    gb = data.groupby([col, target_var])[target_var]
    print(data.groupby([col])[target_var].value_counts(normalize=True), '\n')
    gb.count().unstack().plot(kind='barh', legend=True, color=['b', 'r'], ax=ax)
    for c, container in enumerate(ax.containers):
        for i, p in enumerate(container):
            # Total count for this category across both target values
            group_total = ax.containers[0][i].get_width() + ax.containers[1][i].get_width()
            width = p.get_width()
            x, y = p.get_xy()
            if annotate:
                ax.annotate('{:.1f}%'.format(100*width/group_total),
                            (x + width + 5, i + (c-1)*0.25 + 0.05), ha='left')
number_of_projects  left
2                   1       0.541719
                    0       0.458281
3                   0       0.989205
                    1       0.010795
4                   0       0.935685
                    1       0.064315
5                   0       0.846395
                    1       0.153605
6                   0       0.550847
                    1       0.449153
7                   1       1.000000
Name: left, dtype: float64

years_at_company  left
2                 0       0.989347
                  1       0.010653
3                 0       0.831599
                  1       0.168401
4                 0       0.753117
                  1       0.246883
5                 0       0.546139
                  1       0.453861
6                 0       0.798893
                  1       0.201107
7                 0       1.000000
8                 0       1.000000
10                0       1.000000
Name: left, dtype: float64

work_accident  left
0              0       0.814022
               1       0.185978
1              0       0.943243
               1       0.056757
Name: left, dtype: float64

promotion_last_5years  left
0                      0       0.831778
                       1       0.168222
1                      0       0.960591
                       1       0.039409
Name: left, dtype: float64

department   left
IT           0       0.838115
             1       0.161885
RandD        0       0.877522
             1       0.122478
accounting   0       0.824477
             1       0.175523
hr           0       0.811980
             1       0.188020
management   0       0.880734
             1       0.119266
marketing    0       0.833581
             1       0.166419
product_mng  0       0.839650
             1       0.160350
sales        0       0.830195
             1       0.169805
support      0       0.828666
             1       0.171334
technical    0       0.826203
             1       0.173797
Name: left, dtype: float64

salary  left
high    0       0.951515
        1       0.048485
low     0       0.795470
        1       0.204530
medium  0       0.853830
        1       0.146170
Name: left, dtype: float64
Some insights:
- 'number_of_projects': 3 projects seems to be the sweet spot. With too few, employees are likely to leave (it may be that they are weak performers and that is why they only have 2 projects; this could be checked). Beyond 3–4 projects, employees are increasingly likely to leave.
- 'years_at_company': year ~5 seems to be the critical point at which an employee decides whether to stay or leave. Employees who make it past that point are very likely to stay. Departures begin to deviate from typical at ~4 years.
- 'work_accident': possibly there is a correlation between tenure and work accidents, which could explain why employees who have experienced a work accident stay at a higher rate (a quick check is sketched after this list).
- 'promotion_last_5years': a strong indicator of staying, though the sample is very small; those promoted in the last five years are unlikely to leave.
- 'department': mostly independent of leaving, with slightly higher retention for management.
- 'salary': directly correlates with the fraction that stay.
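The tenure/work-accident idea is easy to check; a quick sketch:
# Accident rate by tenure; a rising rate with tenure would support the idea above
print(pd.crosstab(df1['years_at_company'], df1['work_accident'], normalize='index'))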
corr = df1.corr()
trimask = np.triu(np.ones_like(corr, dtype=bool))
# Create a heatmap to visualize how correlated variables are
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap="crest", mask=trimask, vmax=1)
plt.title("Heatmap of the dataset")
plt.show()
corr1 = df1[df1['left']==1].drop(columns=['left']).corr()
corr2 = df1[df1['left']==0].drop(columns=['left']).corr()
trimask = np.triu(np.ones_like(corr1, dtype=bool))
# Create a heatmap to visualize how correlated variables are
fig, ax = plt.subplots(1,2, figsize=(16,10))
sns.heatmap(corr1, annot=True, cmap="crest", mask=trimask, vmax=1, ax=ax[0], cbar=False)
ax[0].set_title('Left')
sns.heatmap(corr2, annot=True, cmap="crest", mask=trimask, vmax=1, ax=ax[1], cbar=False)
ax[1].set_yticklabels([])
ax[1].set_title('Retained')
plt.tight_layout()
Here in the correlations we can see that for the employees who left, there are fairly strong correlations between the number of projects, average monthly hours, and last evaluation.
Some ANOVA/ANCOVA could be interesting. We clearly have some strong predictor features (e.g. number of projects, years at company), and surely these have some interplay. Other relations to explore in more depth could be satisfaction vs. evaluation, number of projects, evaluation by year, etc.
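As a sketch of that ANOVA idea (scipy is an extra import, not used elsewhere in this notebook):
from scipy import stats
# One-way ANOVA: does mean satisfaction differ across project-count groups?
groups = [g['satisfaction_level'].values for _, g in df1.groupby('number_of_projects')]
f_stat, p_val = stats.f_oneway(*groups)
print("F = {:.2f}, p = {:.3g}".format(f_stat, p_val))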
df_aff = df1[df1[target_var]==1]
df_neg = df1[df1[target_var]==0]
xvar = 'satisfaction_level'
yvar = 'last_evaluation'
plt.figure(figsize=(4,3), dpi=150)
plt.scatter(df_neg[xvar], df_neg[yvar], alpha=0.5, s=0.5, label='0')
plt.scatter(df_aff[xvar], df_aff[yvar], alpha=0.5, s=0.5, label='1')
ax = plt.gca()
sns.kdeplot(data=df_aff, x=xvar, y=yvar, fill=False, ax=ax, color='orangered', levels=[0.4, 0.65])
plt.xlabel(xvar)
plt.ylabel(yvar)
Text(0, 0.5, 'last_evaluation')
plt.figure(figsize=(4,3), dpi=150)
sns.kdeplot(data=df1, x=xvar, y=yvar, hue='left', fill=False)
<AxesSubplot:xlabel='satisfaction_level', ylabel='last_evaluation'>
With these plots we see some very distinct patterns for employees who decide to leave. This is almost certainly an effect of the fact that we are using artificial data, but it demonstrates that we should be able to construct a pretty effective classifier.
Also, the fact that these groups essentially cover the full range of the evaluation and satisfaction space could explain why a simple correlation test does not yield high correlation values.
There are three distinct hotspots for employees leaving (rough masks for these groups are sketched below):
- Low satisfaction, high evaluation (i.e. performance): unhappy employees, presumably good enough to find work elsewhere.
- High satisfaction, high evaluation: these employees may be leaving because they are being poached by competitors, since they are otherwise satisfied with their jobs.
- Below-average satisfaction, below-average evaluation: slightly underperforming employees who are slightly unhappy. It makes sense that they may seek greener pastures.
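As a sketch, these groups can be captured with boolean masks; the cut points below are eyeballed from the plots rather than fitted, so treat the counts as indicative only.
leavers = df1[df1['left'] == 1]
# Rough, eyeballed cut points for the three leaver clusters
unhappy_high_eval = (leavers['satisfaction_level'] < 0.2) & (leavers['last_evaluation'] > 0.75)
happy_high_eval = (leavers['satisfaction_level'] > 0.7) & (leavers['last_evaluation'] > 0.75)
mid_mid = leavers['satisfaction_level'].between(0.3, 0.5) & leavers['last_evaluation'].between(0.4, 0.6)
print(unhappy_high_eval.sum(), happy_high_eval.sum(), mid_mid.sum())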
This task is a binary classification problem. We have a mix of continuous numerical, categorical, and discrete values. XGBoost is well suited to this task, and implementation should be straightforward. XGBoost can also natively handle an imbalanced dataset (e.g. via its scale_pos_weight parameter).
For the metric of interest, we would like to catch any employees who may be likely to leave the company. False positives are not terribly problematic here, so recall will be a good measure. However, having a fairly imbalanced dataset is a consideration that might lead us toward an F1 score instead.
Let's go ahead and treat F1 as our primary metric.
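For reference, F1 is the harmonic mean of precision and recall, so it penalizes a classifier that buys recall with a flood of false positives; a toy calculation:
# Toy example: 90 true positives, 30 false positives, 10 false negatives
tp, fp, fn = 90, 30, 10
precision = tp / (tp + fp)  # 0.75
recall = tp / (tp + fn)     # 0.90
f1 = 2 * precision * recall / (precision + recall)
print("precision={:.2f}, recall={:.2f}, F1={:.2f}".format(precision, recall, f1))  # F1 ~ 0.82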
Technically we could encode both department and salary. Department seemed not very related to leaving, so for now let's just drop it and only worry about salary.
le = LabelEncoder()
le.fit(df1['salary'])
df1['salary_enc'] = le.transform(df1['salary'])
df1[['salary','salary_enc']]
 | salary | salary_enc
---|---|---
0 | low | 1 |
1 | medium | 2 |
2 | medium | 2 |
3 | low | 1 |
4 | low | 1 |
... | ... | ... |
11995 | high | 0 |
11996 | high | 0 |
11997 | high | 0 |
11998 | high | 0 |
11999 | low | 1 |
11991 rows × 2 columns
# This is an ordinal category, and I would prefer it to be in order of low->high
# Easy enough to do manually for only three categories; for a tree model only the
# ordering matters, not the exact values
salary_dict = {'high': 3, 'medium': 2, 'low': 0}
df1['salary_enc'] = df1['salary'].replace(salary_dict)
df1[['salary','salary_enc']]
 | salary | salary_enc
---|---|---
0 | low | 0 |
1 | medium | 2 |
2 | medium | 2 |
3 | low | 0 |
4 | low | 0 |
... | ... | ... |
11995 | high | 3 |
11996 | high | 3 |
11997 | high | 3 |
11998 | high | 3 |
11999 | low | 0 |
11991 rows × 2 columns
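As an aside, an equivalent approach (a sketch, not what was run above) is pandas' ordered categorical dtype, which makes the low < medium < high ordering explicit rather than relying on a hand-written dict. It yields codes 0/1/2 instead of 0/2/3, which is immaterial for tree models since only the ordering matters.
# Sketch: ordered categorical codes (low=0, medium=1, high=2)
salary_cat = pd.Categorical(df1['salary'], categories=['low', 'medium', 'high'], ordered=True)
print(salary_cat.codes[:5])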
df1.columns
Index(['satisfaction_level', 'last_evaluation', 'number_of_projects', 'average_monthly_hours', 'years_at_company', 'work_accident', 'left', 'promotion_last_5years', 'department', 'salary', 'salary_enc'], dtype='object')
# These columns will be dropped from the training data
drop_cols = [ 'left', 'department','salary']
fit_df = df1
X, y = fit_df.drop(columns=drop_cols), fit_df[target_var]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
evalset = [(X_train, y_train), (X_test,y_test)]
# Remove any previously trained model before refitting
try:
    del model
except NameError:
    print("model does not exist, will instantiate")
# Here we will define our first baseline version of the model, using mainly default
# parameters, but increase n_estimators to get more learning-curve coverage.
# After a first run with 200 estimators, ~20 looks best.
n_estimators = 20
seed = 0
model = XGBClassifier(n_estimators=n_estimators, seed=seed, eval_metric='logloss')
model.fit(X_train, y_train, eval_set=evalset,verbose=False)
# make predictions for test data
y_pred = model.predict(X_test)
# evaluate predictions
recall= recall_score(y_test, y_pred)
print("Train-Test-Split recall: %.2f%%" % (recall * 100.0))
# evaluate predictions
f1= f1_score(y_test, y_pred)
print("Train-Test-Split f1: %.2f%%" % (f1 * 100.0))
Train-Test-Split recall: 91.04%
Train-Test-Split f1: 94.01%
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
results = cross_val_score(model, X_train, y_train, cv=kfold)
print("K-fold Accuracy: {:.2f}% ({:.2f}%)".format(results.mean()*100, results.std()*100))
K-fold Accuracy: 98.32% (0.33%)
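Note that cross_val_score defaults to accuracy for classifiers. Since F1 is our primary metric, we can also score the folds on it directly; a quick sketch:
# Cross-validated F1 rather than the default accuracy
results_f1 = cross_val_score(model, X_train, y_train, cv=kfold, scoring='f1')
print("K-fold F1: {:.2f}% ({:.2f}%)".format(results_f1.mean()*100, results_f1.std()*100))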
# Here we can check learning curves for our model using logloss
results_pre_opt = model.evals_result()
plt.figure(figsize=(3.5, 3), dpi=150)
# plot learning curves
plt.plot(results_pre_opt['validation_0']['logloss'], label='Train')
plt.plot(results_pre_opt['validation_1']['logloss'], label='Test')
plt.ylabel('logloss')
plt.legend()
<matplotlib.legend.Legend at 0x1516411dceb0>
The learning curves look pretty decent with ~20 estimators.
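Rather than eyeballing the curves, early stopping could pick the estimator count automatically; a sketch (in xgboost >= 1.6 early_stopping_rounds can be passed to the constructor; older versions take it as a fit() argument):
# Sketch: stop once the test logloss fails to improve for 10 rounds
model_es = XGBClassifier(n_estimators=200, seed=seed, eval_metric='logloss', early_stopping_rounds=10)
model_es.fit(X_train, y_train, eval_set=evalset, verbose=False)
print(model_es.best_iteration)  # iteration with the best validation logloss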
plt.figure(figsize=(4,3), dpi=150)
# Compute values for confusion matrix
log_cm = confusion_matrix(y_test, y_pred)
# Create display of confusion matrix
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm, display_labels=None)
log_disp.plot(cmap='Blues', ax=plt.gca())
# Display plot
plt.show()
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2995
           1       0.97      0.91      0.94       603

    accuracy                           0.98      3598
   macro avg       0.98      0.95      0.96      3598
weighted avg       0.98      0.98      0.98      3598
Here we see that our model is performing very well. Recall is 91%, and our F1 score is 94%. Below we can look at the feature importances (both F score and gain), which show that the hours worked, number of projects, satisfaction level, and evaluations are all strong predictors of whether or not an employee leaves. This is consistent with our initial EDA.
fig, ax = plt.subplots(1,1,figsize=(8,10))
# plot feature importance
plot_importance(model, max_num_features=10,ax=ax, ylabel=None)
ax.tick_params(axis='both', which='major', labelsize=13)
ax.tick_params(axis='both', which='minor', labelsize=13)
plt.tight_layout()
#plt.savefig('feat_import_F.png', facecolor='white', transparent=False)
# Get default feature_importance values (i.e. gain) and create a sorted dictionary
feature_names = X_train.columns.tolist()
feat_imp_dict = dict(zip(feature_names, model.feature_importances_))
feat_imp_dict = {k: v for k, v in sorted(feat_imp_dict.items(), key=lambda item: item[1], reverse=True)}
# Plot the feature importances
fig, ax = plt.subplots(1,1,figsize=(8,10))
# Limit to the top 10
limit = 10
ax.grid(zorder=0)
ax.barh(list(feat_imp_dict.keys())[:limit], list(feat_imp_dict.values())[:limit], height=0.3)
ax.invert_yaxis()
ax.set_xlabel('gain', fontsize=15)
#ax.set_ylabel('Features')
ax.tick_params(axis='both', which='major', labelsize=13)
ax.tick_params(axis='both', which='minor', labelsize=13)
plt.title('Feature Importance')
plt.tight_layout()
#plt.savefig('feat_import_gain.png', facecolor='white', transparent=False)
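As a cross-check on the built-in importances (which can be biased toward high-cardinality features), sklearn's permutation importance is worth a look; a sketch scored on our F1 metric:
from sklearn.inspection import permutation_importance
# Mean drop in test-set F1 when each feature is shuffled
perm = permutation_importance(model, X_test, y_test, scoring='f1', n_repeats=10, random_state=0)
for name, imp in sorted(zip(feature_names, perm.importances_mean), key=lambda t: -t[1]):
    print("{}: {:.3f}".format(name, imp))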
Although our model works very well, we can try some parameter tuning to see if we can squeeze out a bit more performance. We will use the Hyperopt package for this task.
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval
hyp_params = {
'max_depth': hp.choice('max_depth', np.arange(5, 15, dtype=int)),
'learning_rate': hp.loguniform('learning_rate', -5, -2),
'subsample': hp.uniform('subsample', 0.25, 1),
#'gamma': hp.uniform ('gamma', 0,2),
'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
'scale_pos_weight' : hp.uniform('scale_pos_weight', 1,2),
}
n_estimators
20
# An objective function is needed for the hyperopt tuning; it returns the
# negative F1 score (our primary metric) as the loss to minimize
def objective(hyp_params):
    clf = XGBClassifier(n_estimators=n_estimators, seed=seed, eval_metric='logloss', **hyp_params)
    evaluation = [(X_train, y_train), (X_test, y_test)]
    clf.fit(X_train, y_train, eval_set=evaluation, verbose=False)
    pred = clf.predict(X_test)
    f1_clf = f1_score(y_test, pred)
    print("SCORE: {:.3f}".format(f1_clf))
    return {'loss': -f1_clf, 'status': STATUS_OK}
trials = Trials()
best_params = fmin(objective, hyp_params, algo=tpe.suggest, max_evals=100, trials=trials)
print("Best set of hyperparameters: ", best_params)
# Note: fmin returns the *index* for hp.choice parameters (here max_depth),
# so map the result back to actual parameter values before refitting
best_params = space_eval(hyp_params, best_params)
SCORE: 0.919 SCORE: 0.924 SCORE: 0.937 SCORE: 0.935 SCORE: 0.928 SCORE: 0.932 SCORE: 0.936 SCORE: 0.921 SCORE: 0.935 SCORE: 0.925
SCORE: 0.931 SCORE: 0.931 SCORE: 0.939 SCORE: 0.926 SCORE: 0.927 SCORE: 0.933 SCORE: 0.927 SCORE: 0.925 SCORE: 0.929 SCORE: 0.920
SCORE: 0.939 SCORE: 0.940 SCORE: 0.943 SCORE: 0.943 SCORE: 0.949 SCORE: 0.947 SCORE: 0.943 SCORE: 0.933 SCORE: 0.948 SCORE: 0.942
SCORE: 0.943 SCORE: 0.948 SCORE: 0.937 SCORE: 0.940 SCORE: 0.941 SCORE: 0.932 SCORE: 0.941 SCORE: 0.934 SCORE: 0.942 SCORE: 0.936
SCORE: 0.932 SCORE: 0.940 SCORE: 0.929 SCORE: 0.936 SCORE: 0.931 SCORE: 0.932 SCORE: 0.944 SCORE: 0.943 SCORE: 0.937 SCORE: 0.940
SCORE: 0.933 SCORE: 0.932 SCORE: 0.936 SCORE: 0.922 SCORE: 0.944 SCORE: 0.941 SCORE: 0.927 SCORE: 0.931 SCORE: 0.932 SCORE: 0.940
SCORE: 0.940 SCORE: 0.949 SCORE: 0.937 SCORE: 0.936 SCORE: 0.943 SCORE: 0.947 SCORE: 0.944 SCORE: 0.948 SCORE: 0.941 SCORE: 0.943
SCORE: 0.948 SCORE: 0.944 SCORE: 0.941 SCORE: 0.940 SCORE: 0.938 SCORE: 0.947 SCORE: 0.941 SCORE: 0.940 SCORE: 0.945 SCORE: 0.932
SCORE: 0.939 SCORE: 0.938 SCORE: 0.941 SCORE: 0.941 SCORE: 0.939 SCORE: 0.943 SCORE: 0.924 SCORE: 0.940 SCORE: 0.936 SCORE: 0.946
SCORE: 0.940 SCORE: 0.937 SCORE: 0.934 SCORE: 0.938 SCORE: 0.939 SCORE: 0.943 SCORE: 0.935 SCORE: 0.938 SCORE: 0.945 SCORE: 0.946
100%|██████████| 100/100 [00:22<00:00, 4.53trial/s, best loss: -0.9485420240137222]
Best set of hyperparameters:  {'colsample_bytree': 0.6547164033139872, 'learning_rate': 0.11320931065700798, 'max_depth': 6, 'min_child_weight': 0.0, 'scale_pos_weight': 1.8823124437047563, 'subsample': 0.9342032119027921}
# model_HT is the hyperparameter-tuned model
model_HT = XGBClassifier(n_estimators=n_estimators, seed=seed, eval_metric='logloss', **best_params)
eval_set = [( X_train, y_train), ( X_test, y_test)]
model_HT.fit(X_train, y_train, eval_set=eval_set, verbose=False)
# make predictions for test data
y_pred = model_HT.predict(X_test)
# evaluate predictions
f1_HT = f1_score(y_test, y_pred)
print("Train-test-split F1: {:.2f}%".format(f1_HT * 100.0))
Train-test-split F1: 94.61%
plt.figure(figsize=(4,3), dpi=150)
# Compute values for confusion matrix
log_cm = confusion_matrix(y_test, y_pred)
# Create display of confusion matrix
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm, display_labels=None)
log_disp.plot(cmap='Blues', ax=plt.gca())
# Display plot
plt.show()
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2995
           1       0.98      0.92      0.95       603

    accuracy                           0.98      3598
   macro avg       0.98      0.96      0.97      3598
weighted avg       0.98      0.98      0.98      3598
Hyperparameter tuning seems to provide slightly better results (I would probably consider this negligible, but it is an improvement). We now have an F1 score of ~95%, and thus we will adopt the tuned parameters for our base model.
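Finally, the tuned model can be persisted for downstream use; a minimal sketch using XGBoost's native serialization (the filename is illustrative):
# Save and reload the tuned model
model_HT.save_model('salifort_attrition_xgb.json')
model_reloaded = XGBClassifier()
model_reloaded.load_model('salifort_attrition_xgb.json')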