# import packages
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, auc, average_precision_score
# Load the pre-split LendingClub data sets.
# Target encoding in 'loan_status': 1 = good loan, 0 = default.
train = pd.read_excel('lendingclub_traindata.xlsx')
# NOTE(review): the validation frame is used below but was never loaded.
# The file name is assumed to follow the train/test naming -- confirm.
validation = pd.read_excel('lendingclub_valdata.xlsx')
test = pd.read_excel('lendingclub_testdata.xlsx')

# Give all three frames the same column names.
cols = ['home_ownership', 'income', 'dti', 'fico', 'loan_status']
train.columns = validation.columns = test.columns = cols

# Quick sanity check on the validation data.
print(validation.head())
The data has already been split into a training set, a validation set, and a test set. There are 7000 instances in the training set, 3000 instances in the validation set, and 2290 instances in the test set. The four features are labeled home ownership, income, dti, and fico.
# Remove the target column to create feature-only data sets.
# `columns=` is used because pandas 2.0 no longer accepts `axis` positionally.
X_train = train.drop(columns='loan_status')
X_val = validation.drop(columns='loan_status')
X_test = test.drop(columns='loan_status')

# Scale data using the mean and standard deviation of the training set.
# This is not necessary for the simple logistic regression we will do here
# but should be done if L1 or L2 regularization is carried out.
# The training statistics are captured first so that all three sets are
# transformed with exactly the same parameters.
train_mean = X_train.mean()
train_std = X_train.std()
X_train = (X_train - train_mean) / train_std
X_val = (X_val - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

# Store the target column as y-variables.
y_train = train['loan_status']
y_val = validation['loan_status']
y_test = test['loan_status']

# Print the shape of every feature/target pair.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

# Class balance of the training set, as percentages.
freq = y_train.value_counts()
print(freq / freq.sum() * 100)

# Create an instance of LogisticRegression named lgstc_reg (no regularization).
# NOTE(review): scikit-learn 1.2 deprecated the string "none" in favour of
# penalty=None; switch the spelling if the installed version rejects it.
lgstc_reg = LogisticRegression(penalty="none", solver="newton-cg")

# Fit the logistic regression to the training set.
lgstc_reg.fit(X_train, y_train)

# Intercept (bias) followed by one coefficient per feature column.
print(lgstc_reg.intercept_, lgstc_reg.coef_)
When fitted to the scaled data, the model has a bias (intercept) of 1.416 and coefficients of 0.145, 0.034, -0.324 and 0.363 for home ownership, income, dti and fico, respectively. We now test the model on the validation set.
# y_train_pred, y_val_pred, and y_test_pred are the predicted probabilities for
# the training set, validation set and test set using the fitted logistic
# regression model. Column 0 is P(class 0) and column 1 is P(class 1).
y_train_pred = lgstc_reg.predict_proba(X_train)
y_val_pred = lgstc_reg.predict_proba(X_val)
y_test_pred = lgstc_reg.predict_proba(X_test)

# Log-likelihood of every observation: the log of the probability the model
# assigned to the class that was actually observed.
mle_vector_train = np.log(np.where(y_train == 1, y_train_pred[:, 1], y_train_pred[:, 0]))
mle_vector_val = np.log(np.where(y_val == 1, y_val_pred[:, 1], y_val_pred[:, 0]))
mle_vector_test = np.log(np.where(y_test == 1, y_test_pred[:, 1], y_test_pred[:, 0]))

# Cost function = negative average log-likelihood (lower is better).
cost_function_training = -np.sum(mle_vector_train) / len(y_train)
cost_function_val = -np.sum(mle_vector_val) / len(y_val)
cost_function_test = -np.sum(mle_vector_test) / len(y_test)

print('cost function training set =', cost_function_training)
print('cost function validation set =', cost_function_val)
print('cost function test set =', cost_function_test)
An analyst must decide on a criterion for predicting whether a loan will be good or will default. This involves specifying a threshold. By default this threshold is set to 0.5, i.e., loans are separated into good and bad categories according to whether the probability of no default is greater or less than 0.5. However, this does not work well for an imbalanced data set such as this one: it would predict that all loans are good! We will look at the results for a few other thresholds.
# Probability cut-offs above which a loan is predicted to be good (class 1).
THRESHOLD = [.75, .80, .85]

# The predicted probability of class 1 on the test set does not depend on the
# threshold, so it is computed once outside the loop.
test_prob_good = lgstc_reg.predict_proba(X_test)[:, 1]

# One dict of metrics per threshold; turned into the results frame afterwards.
metric_rows = []

# Iterate over the thresholds.
for i in THRESHOLD:
    # If the probability for a test instance exceeds the threshold, predict 1.
    preds = np.where(test_prob_good > i, 1, 0)

    # Confusion matrix as a percentage of all test instances.
    # With labels=[1, 0], rows are the actual class and columns the predicted
    # class, both ordered (1, 0).
    cm = (confusion_matrix(y_test, preds, labels=[1, 0], sample_weight=None) / len(y_test)) * 100
    print('Confusion matrix for threshold =', i)
    print(cm)
    print(' ')

    TP = cm[0][0]  # True Positives:  actual 1, predicted 1
    FN = cm[0][1]  # False Negatives: actual 1, predicted 0
    FP = cm[1][0]  # False Positives: actual 0, predicted 1
    TN = cm[1][1]  # True Negatives:  actual 0, predicted 0

    metric_rows.append({
        "THRESHOLD": i,
        "accuracy": accuracy_score(y_test, preds),
        "true pos rate": recall_score(y_test, preds),
        "true neg rate": TN / (FP + TN),
        "false pos rate": FP / (FP + TN),
        "precision": precision_score(y_test, preds),
        "f-score": f1_score(y_test, preds),
    })

# Dataframe storing the results, one row per threshold.
results = pd.DataFrame(
    metric_rows,
    columns=["THRESHOLD", "accuracy", "true pos rate", "true neg rate",
             "false pos rate", "precision", "f-score"],
)

print('ALL METRICS')
print(results.T)
This table shows that there is a trade-off between the true positive rate and the false positive rate.
We can improve the percentage of good loans we identify only by increasing the percentage of bad loans that are misclassified.
The receiver operating characteristic (ROC) curve captures this trade-off by considering different thresholds.
# Calculate the receiver operating characteristic (ROC) curve and the AUC measure.

# Model scores: predicted probability of class 1 for every test instance.
lr_prob = lgstc_reg.predict_proba(X_test)
lr_prob = lr_prob[:, 1]

# Baseline scores: a constant score for every instance, i.e. no ability to rank.
ns_prob = [0 for _ in range(len(y_test))]

ns_auc = roc_auc_score(y_test, ns_prob)
lr_auc = roc_auc_score(y_test, lr_prob)
print("AUC random predictions =", ns_auc)
print("AUC predictions from logistic regression model =", lr_auc)

# False/true positive rates over all thresholds (the thresholds are not needed).
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_prob)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_prob)

plt.plot(ns_fpr, ns_tpr, linestyle='--', label='Random Predction')
plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')