Predicting Loan Default: Bagging Models
Here, the goal is to build a model that predicts whether a loan applicant will default on the loan.
In [1]:
import time
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# warnings.filterwarnings('ignore')
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
In [2]:
url = "credit.csv"
creditData = pd.read_csv(url)
creditData.head(10)
Out [2]:
In [3]:
rowCount, colCount = creditData.shape
print(f'{rowCount} rows')
print(f'{colCount} columns')
In [4]:
creditData.describe()
Out [4]:
In [5]:
creditData.info()
In [6]:
for feature in creditData.columns:
    if creditData[feature].dtype == 'object':
        creditData[feature] = pd.Categorical(creditData[feature])
creditData.head(10)
Out [6]:
In [7]:
print(creditData.checking_balance.value_counts())
print(creditData.credit_history.value_counts())
print(creditData.purpose.value_counts())
print(creditData.savings_balance.value_counts())
print(creditData.employment_duration.value_counts())
print(creditData.other_credit.value_counts())
print(creditData.housing.value_counts())
print(creditData.job.value_counts())
print(creditData.phone.value_counts())
Data Cleanup: Categorical Strings to Integers
Make the data easier to process by manually converting values to integers.
In [8]:
replaceStruct = {
    "checking_balance": {
        "< 0 DM": 1,
        "1 - 200 DM": 2,
        "> 200 DM": 3,
        "unknown": -1
    },
    "credit_history": {
        "critical": 1,
        "poor": 2,
        "good": 3,
        "very good": 4,
        "perfect": 5
    },
    "savings_balance": {
        "< 100 DM": 1,
        "100 - 500 DM": 2,
        "500 - 1000 DM": 3,
        "> 1000 DM": 4,
        "unknown": -1
    },
    "employment_duration": {
        "unemployed": 1,
        "< 1 year": 2,
        "1 - 4 years": 3,
        "4 - 7 years": 4,
        "> 7 years": 5
    },
    "phone": {"no": 1, "yes": 2},
    # "job": {"unemployed": 1, "unskilled": 2, "skilled": 3, "management": 4},
    "default": {"no": 0, "yes": 1}
}
creditData=creditData.replace(replaceStruct)
In [9]:
oneHotCols=["purpose","housing","other_credit","job"]
creditData=pd.get_dummies(creditData, columns=oneHotCols)
In [10]:
creditData.head(10)
Out [10]:
In [11]:
creditData.info()
In [12]:
creditData['default'].value_counts()
Out [12]:
This data has an uneven distribution of the target class: roughly 700 non-defaulters to 300 defaulters, not a 50/50 split or anywhere close. In cases like this, stratified sampling can be used to ensure that the relative class frequencies (here 700/300) are approximately preserved in the training and testing datasets.
In [13]:
X = creditData.drop("default" , axis=1)
y = creditData.pop("default")
In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1,stratify=y)
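To confirm that the stratified split preserved the roughly 70/30 class balance, the class proportions of the full target and both partitions can be compared (a quick check using the variables above):
# With stratify=y, all three should show approximately 0.70 / 0.30.
print(y.value_counts(normalize=True))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))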
In [15]:
## Function to create confusion matrix
def make_confusion_matrix(model, y_actual, labels=[1, 0]):
    '''
    model : classifier to predict values of X
    y_actual : ground truth
    '''
    y_predict = model.predict(X_test)
    cm = metrics.confusion_matrix(y_actual, y_predict, labels=[0, 1])
    df_cm = pd.DataFrame(cm, index=[i for i in ["Actual - No", "Actual - Yes"]],
                         columns=[i for i in ['Predicted - No', 'Predicted - Yes']])
    group_counts = ["{0:0.0f}".format(value) for value in
                    cm.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in
                         cm.flatten()/np.sum(cm)]
    labels = [f"{v1}\n{v2}" for v1, v2 in
              zip(group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)
    plt.figure(figsize=(10, 7))
    sns.heatmap(df_cm, annot=labels, fmt='')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [16]:
def get_metrics_score(model, flag=True):
    '''
    model : classifier to predict values of X
    flag : if True (default), print the train and test metrics
    '''
    # defining an empty list to store train and test results
    score_list = []
    # Predict
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    # Accuracy
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    # Recall
    train_recall = metrics.recall_score(y_train, pred_train)
    test_recall = metrics.recall_score(y_test, pred_test)
    # Precision
    train_precision = metrics.precision_score(y_train, pred_train)
    test_precision = metrics.precision_score(y_test, pred_test)
    # Update the score_list
    score_list.extend((train_acc, test_acc, train_recall, test_recall, train_precision, test_precision))
    # The print statements below are only displayed when flag is True (the default).
    if flag:
        print('Training Data:')
        print("Accuracy: ", train_acc)
        print("Recall: ", train_recall)
        print("Precision: ", train_precision)
        print('')
        print('Testing Data:')
        print("Accuracy: ", test_acc)
        print("Recall: ", test_recall)
        print("Precision: ", test_precision)
    return score_list  # returning the list with train and test scores
Building Models
Here, a Bagging Classifier and a Random Forest Classifier will be built, first with default parameters and then with hyperparameter tuning. Model metrics will be calculated: Accuracy, Precision, and Recall. Recall is the specific metric of interest here, as it gives the ratio of true positives to actual positives. High recall indicates few false negatives, i.e. a low chance of predicting a defaulter as a non-defaulter.
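As a quick refresher on how these metrics come out of a confusion matrix, here is a small sketch with made-up counts (illustrative only, not taken from any of the models below):
# Hypothetical confusion-matrix counts for illustration only:
# tp = defaulters correctly flagged, fn = defaulters missed, fp = non-defaulters flagged.
tp, fn, fp = 60, 30, 20
recall = tp / (tp + fn)       # share of actual defaulters that the model catches
precision = tp / (tp + fp)    # share of flagged loans that actually default
print(f'recall: {recall:.2f}, precision: {precision:.2f}')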
In [17]:
bagging_estimator=BaggingClassifier(random_state=1)
bagging_estimator.fit(X_train,y_train)
Out [17]:
In [18]:
#Using above defined function to get accuracy, recall and precision on train and test set
bagging_estimator_score=get_metrics_score(bagging_estimator)
In [19]:
make_confusion_matrix(bagging_estimator,y_test)
In [20]:
rf_estimator=RandomForestClassifier(random_state=1)
rf_estimator.fit(X_train,y_train)
Out [20]:
In [21]:
rf_estimator_score=get_metrics_score(rf_estimator)
In [22]:
make_confusion_matrix(rf_estimator,y_test)
Default-Parameter Summary
Both models overfit to the training data. The bagging classifier has a better recall score.
Some of the important hyperparameters available for the bagging classifier are listed below (a configuration sketch follows the list):
- base_estimator: The base estimator to fit on random subsets of the dataset. If None (default), the base estimator is a decision tree.
- n_estimators: The number of base estimators in the ensemble, default = 10.
- max_features: The number (or fraction) of features to draw from X to train each base estimator, default = 1.0 (all features).
- bootstrap: Whether samples are drawn with replacement when building each base estimator, default = True. If False, sampling without replacement is performed.
- bootstrap_features: Whether features are drawn with replacement, default = False.
- max_samples: The number (or fraction) of samples to draw from X to train each base estimator, default = 1.0 (all N observations in the train data).
- oob_score: Whether to use out-of-bag samples to estimate the generalization accuracy, default = False.
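For illustration, several of these hyperparameters can be set explicitly when constructing the classifier. The values below are arbitrary examples (not the tuned settings found later), and the example_bagger name is only for this sketch:
# Illustrative configuration only; the values are arbitrary, not tuned.
example_bagger = BaggingClassifier(
    n_estimators=50,           # number of base estimators in the ensemble
    max_samples=0.8,           # fraction of rows drawn for each base estimator
    max_features=0.8,          # fraction of columns drawn for each base estimator
    bootstrap=True,            # draw rows with replacement
    bootstrap_features=False,  # draw columns without replacement
    oob_score=True,            # estimate generalization accuracy from out-of-bag rows
    random_state=1,
)
example_bagger.fit(X_train, y_train)
print(f'out-of-bag score: {example_bagger.oob_score_}')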
Bagging Classifier: Grid of Hyperparameters
In [23]:
bagging_estimator_tuned = BaggingClassifier(random_state=1)
# Grid of parameters to choose from
parameters = {
    'max_samples': [0.7, 0.8, 0.9, 1.0],   # floats are fractions of the training rows
    'max_features': [0.7, 0.8, 0.9, 1.0],  # floats are fractions of the columns
    'n_estimators': [10, 20, 30, 40, 50],
}
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
In [24]:
# Run the grid search
grid_obj = GridSearchCV(bagging_estimator_tuned, parameters, scoring=acc_scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
In [25]:
# Set the clf to the best combination of parameters
bagging_estimator_tuned = grid_obj.best_estimator_
In [26]:
# Fit the best algorithm to the data.
bagging_estimator_tuned.fit(X_train, y_train)
Out [26]:
In [27]:
#Using above defined function to get accuracy, recall and precision on train and test set
bagging_estimator_tuned_score=get_metrics_score(bagging_estimator_tuned)
In [28]:
make_confusion_matrix(bagging_estimator_tuned,y_test)
Summary
- Training accuracy and recall have increased slightly; test recall, though, has decreased.
- The model is overfitting to the training data.
- The model is better at identifying non-defaulters than defaulters (see the confusion matrix above and the per-class report sketched below).
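One way to see that per-class behaviour directly is sklearn's classification report, which breaks precision and recall out by class (a quick sketch using the tuned estimator above):
# class 0 = non-defaulter, class 1 = defaulter
print(metrics.classification_report(y_test, bagging_estimator_tuned.predict(X_test)))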
Bagging Classifier with Logistic Regression as the Base Estimator
By default, the base estimator is a decision tree. Here, a logistic regression is used instead.
In [29]:
bagging_lr=BaggingClassifier(base_estimator=LogisticRegression(solver='liblinear',random_state=1,max_iter=1000),random_state=1)
bagging_lr.fit(X_train,y_train)
Out [29]:
In [30]:
bagging_lr_score=get_metrics_score(bagging_lr)
In [31]:
make_confusion_matrix(bagging_lr,y_test)
Summary
- This option does NOT overfit to the training data.
Random Forest Classifier: Grid of Hyperparameters
In [32]:
# build a classifier instance
rf_estimator_tuned = RandomForestClassifier(random_state=1)
In [33]:
# Grid of parameters to choose from
parameters = {
    "n_estimators": [150, 200, 250],
    "min_samples_leaf": np.arange(5, 10),
    "max_features": np.arange(0.2, 0.7, 0.1),
    "max_samples": np.arange(0.3, 0.7, 0.1),
}
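Each added hyperparameter multiplies the number of candidate models, so it can be worth checking the size of the grid before launching the search (a sketch using ParameterGrid on the dict above):
from sklearn.model_selection import ParameterGrid
# Every combination is refit once per cross-validation fold (cv=5 below).
n_combos = len(ParameterGrid(parameters))
print(f'{n_combos} parameter combinations, {n_combos * 5} fits with cv=5')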
In [34]:
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
In [35]:
# Run the grid search
grid_obj = GridSearchCV(rf_estimator_tuned, parameters, scoring=acc_scorer,cv=5, n_jobs=4)
In [36]:
# fit training data to the gridSearchCV
grid_obj = grid_obj.fit(X_train, y_train)
In [37]:
#
# get the best-estimated grid from the combinations of parameters
#
rf_estimator_tuned = grid_obj.best_estimator_
#
# fit the training data to the best-estimator
#
rf_estimator_tuned.fit(X_train, y_train)
Out [37]:
In [38]:
#Using above defined function to get accuracy, recall and precision on train and test set
rf_estimator_tuned_score=get_metrics_score(rf_estimator_tuned)
In [39]:
make_confusion_matrix(rf_estimator_tuned,y_test)
Summary
- The best estimator here performs better than the default random forest.
- This model still overfits the training data, but not as much as the tuned bagging classifier.
- The test recall is still very low: the model is not that good at identifying defaulters.
Random Forest Classifier: with Classification Weights
The class_weight hyperparameter specifies the weights associated with each class. Here, the class weights will mirror the percentages of each class in the data: 70% for non-defaulters (class 0) and 30% for defaulters (class 1).
In [40]:
# Choose the type of classifier.
rf_estimator_weighted = RandomForestClassifier(random_state=1)
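The weights used in the grid below mirror the observed class shares in the training data; those shares can be derived directly from y_train (a quick sketch; the weight_map name is illustrative only):
# Observed class shares; this reproduces (approximately) the {0: 0.7, 1: 0.3} used below.
class_shares = y_train.value_counts(normalize=True)
weight_map = {int(cls): round(share, 2) for cls, share in class_shares.items()}
print(weight_map)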
In [51]:
# Grid of parameters to choose from
parameters = {
    "class_weight": [{0: 0.7, 1: 0.3}],
    "n_estimators": [100, 150, 200, 250],
    "min_samples_leaf": np.arange(5, 10),
    "max_features": np.arange(0.2, 0.7, 0.1),
    "max_samples": np.arange(0.3, 0.7, 0.1),
}
In [52]:
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
In [53]:
# Construct the grid search (construction only is timed here; the fit is timed in the next cell)
start_time = time.perf_counter()
grid_obj = GridSearchCV(rf_estimator_weighted, parameters, scoring=acc_scorer,cv=5, n_jobs=4)
end_time = time.perf_counter()
execution_time = end_time - start_time
print(f'execution_time: {execution_time}')
In [54]:
start_time = time.perf_counter()
grid_obj = grid_obj.fit(X_train, y_train)
end_time = time.perf_counter()
execution_time = end_time - start_time
print(f'execution_time: {execution_time}')
In [45]:
# Set the clf to the best combination of parameters
rf_estimator_weighted = grid_obj.best_estimator_
# Fit the best algorithm to the data.
rf_estimator_weighted.fit(X_train, y_train)
Out [45]:
In [46]:
#Using above defined function to get accuracy, recall and precision on train and test set
rf_estimator_weighted_score=get_metrics_score(rf_estimator_weighted)
In [47]:
make_confusion_matrix(rf_estimator_weighted,y_test)
Summary
This is the best model so far:
- Accuracy has decreased a bit
- Overfitting has decreased
- The train and test recall scores are both higher than in the previous models
Reviewing Feature Importance
Even though the models are complex, the best estimator found by GridSearchCV (here the fitted RandomForestClassifier) exposes a feature_importances_ attribute, which shows how much "importance" the model assigns to each feature in the dataset.
In [63]:
importances = rf_estimator_weighted.feature_importances_
indices = np.argsort(importances)
feature_names = list(X.columns)
print(f'importances: {importances}')
print(f'indices: {indices}')
print(f'feature_names: {feature_names}')
In [64]:
plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
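For a tabular view of the same ranking, the importances can be wrapped in a pandas Series (a sketch reusing the variables above; importance_table is just an illustrative name):
# Label the importances with the feature names and sort, largest first.
importance_table = pd.Series(importances, index=feature_names).sort_values(ascending=False)
print(importance_table.head(10))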
In [65]:
allmodels = [bagging_estimator,bagging_estimator_tuned,bagging_lr,rf_estimator,rf_estimator_tuned,
rf_estimator_weighted]
# defining empty lists to add train and test results
acc_train = []
acc_test = []
recall_train = []
recall_test = []
precision_train = []
precision_test = []
# looping through all the models to get the accuracy, recall and precision scores
for model in allmodels:
    j = get_metrics_score(model, False)
    acc_train.append(np.round(j[0], 2))
    acc_test.append(np.round(j[1], 2))
    recall_train.append(np.round(j[2], 2))
    recall_test.append(np.round(j[3], 2))
    precision_train.append(np.round(j[4], 2))
    precision_test.append(np.round(j[5], 2))
In [68]:
comparison_frame = pd.DataFrame({
'Model':['Bagging (default)',
'Bagging Tuned',
'Bagging + base_estimator=LR',
'Random Forest (default)',
'Random Forest Tuned',
'Random Forest + class_weights'
],
'Train_Accuracy': acc_train,
'Test_Accuracy': acc_test,
'Train_Recall':recall_train,
'Test_Recall':recall_test,
'Train_Precision':precision_train,
'Test_Precision':precision_test
})
comparison_frame
Out [68]:
Model-Building Take-Aways
- Hyperparameter tuning seems complex: testing a hyperparameter's impact on a model's performance requires building a model for each hyperparameter variation.
- Untested parameter values and/or combinations leave unknown model results out of consideration.
Page Tags:
python
data-science
jupyter
learning
numpy