import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz, export_text
from IPython.display import Image
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.metrics import roc_curve, auc, average_precision_score
# Load the three LendingClub splits; loan_status target is 1 = good, 0 = default.
train = pd.read_excel('lendingclub_traindata.xlsx')
validation = pd.read_excel('lendingclub_valdata.xlsx')
test = pd.read_excel('lendingclub_testdata.xlsx')

# Preview the first rows of each split, with a divider line between them.
for _idx, _split in enumerate((train, validation, test)):
    if _idx:
        print("----------------------")
    print(_split.head())
# Split each DataFrame into a feature matrix (X) and the 'loan_status' target (y).
X_train = train.drop(columns='loan_status')
y_train = train['loan_status']

X_val = validation.drop(columns='loan_status')
y_val = validation['loan_status']

X_test = test.drop(columns='loan_status')
y_test = test['loan_status']

# Sanity check: each X/y pair should report matching row counts.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)
X_train.columns  # bare expression: displays the feature names only in a notebook cell
# Fit a shallow entropy-criterion decision tree; the depth and sample limits
# constrain tree growth, and random_state fixes tie-breaking for reproducibility.
# (.fit returns the estimator itself, so chaining is equivalent to reassigning.)
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    min_samples_split=1000,
    min_samples_leaf=200,
    random_state=0,
).fit(X_train, y_train)

# Draw the fitted tree; proportion=True annotates nodes with class fractions.
fig, ax = plt.subplots(figsize=(40, 30))
plot_tree(clf, filled=True, feature_names=X_train.columns, proportion=True)
plt.show()
# Predicted class probabilities for each split; with the 1 = good / 0 = default
# encoding, column 0 is P(default) and column 1 is P(good).
y_train_pred = clf.predict_proba(X_train)
y_val_pred = clf.predict_proba(X_val)
y_test_pred = clf.predict_proba(X_test)
print(y_train_pred)
# Per-observation log-likelihood: log of the probability the model assigned
# to the observation's true class.
# NOTE(review): a pure tree leaf assigns probability exactly 0 to the other
# class, which would make np.log produce -inf here — consider clipping the
# probabilities if that warning ever appears.
mle_vector_train = np.log(np.where(y_train == 1, y_train_pred[:, 1], y_train_pred[:, 0]))
mle_vector_val = np.log(np.where(y_val == 1, y_val_pred[:, 1], y_val_pred[:, 0]))
mle_vector_test = np.log(np.where(y_test == 1, y_test_pred[:, 1], y_test_pred[:, 0]))
# Average negative log-likelihood (cross-entropy cost) per split.
cost_function_training = np.negative(np.sum(mle_vector_train) / len(y_train))
cost_function_val = np.negative(np.sum(mle_vector_val) / len(y_val))
cost_function_test = np.negative(np.sum(mle_vector_test) / len(y_test))
# (removed a second print(y_train_pred) that duplicated the output above)
print('cost function training set =', cost_function_training)
print('cost function validation set =', cost_function_val)
print('cost function test set =', cost_function_test)
# Evaluate test-set classification metrics at several probability cutoffs.
THRESHOLD = [.75, .80, .85]
results = pd.DataFrame(columns=["THRESHOLD", "accuracy", "true pos rate", "true neg rate", "false pos rate", "precision", "f-score"])  # df to store results
results['THRESHOLD'] = THRESHOLD  # threshold column
n_test = len(y_test)
Q = clf.predict_proba(X_test)[:, 1]  # P(good loan) for each test observation
for j, threshold in enumerate(THRESHOLD):  # one results row per threshold
    # classify as good (1) whenever P(good) exceeds the cutoff
    preds = np.where(Q > threshold, 1, 0)
    # confusion matrix as a percentage of the test set; labels=[1, 0] puts the
    # positive class (good loan) in the FIRST row/column, so the layout is:
    #   cm[0][0] = TP   cm[0][1] = FN
    #   cm[1][0] = FP   cm[1][1] = TN
    # (original comments mislabeled FN/FP/TN; the formulas below were correct)
    cm = (confusion_matrix(y_test, preds, labels=[1, 0], sample_weight=None) / n_test) * 100
    print('Confusion matrix for threshold =', threshold)
    print(cm)
    print(' ')
    FP = cm[1][0]  # False Positives (%)
    TN = cm[1][1]  # True Negatives (%)
    results.iloc[j, 1] = accuracy_score(y_test, preds)
    results.iloc[j, 2] = recall_score(y_test, preds)
    results.iloc[j, 3] = TN / (FP + TN)  # True negative rate (specificity)
    results.iloc[j, 4] = FP / (FP + TN)  # False positive rate
    results.iloc[j, 5] = precision_score(y_test, preds)
    results.iloc[j, 6] = f1_score(y_test, preds)
print('ALL METRICS')
print(results.T.to_string(header=False))
# ROC curve for the tree's test-set scores, with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_test, Q)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))  # format the plot size
lw = 1.5
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    marker='.',
    lw=lw,
    label='Decision Tree (AUC = %0.4f)' % roc_auc,
)
plt.plot(
    [0, 1],
    [0, 1],
    color='navy',
    lw=lw,
    linestyle='--',
    label='Random Prediction (AUC = 0.5)',
)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()