# This notebook carries out calculations for the example Table 5.1 of Chapter 5 when the adjusted credit score 
# for the second loan is 140 instead of 30 and the income for the eighth loan is 60 rather than 95.
# Early printings of the book do not use this example. As indicated in the Errata they mistakenly calculate
# the misclassified loans with reference to the outer edges of the path rather than with reference to the middle of the path.

from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

# Example data set: one entry per loan (ten loans in total). Income and Credit
# are the two features; Loan is the binary label attached to each loan and is
# used below to colour the points.
Income = pd.DataFrame([30, 55, 63, 35, 28, 140, 100, 60, 64, 63])
Credit = pd.DataFrame([40, 140, 30, 80, 100, 30, 30, 90, 120, 150])
Loan = pd.DataFrame([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

# Scatter plot of the raw data, one point per loan, coloured by its label.
plt.figure(figsize=(10, 8))
plt.scatter(Income, Credit, c=Loan)
axes = plt.gca()
axes.set(
    xlabel='Income',
    ylabel='Adjusted Credit Score',
    xlim=(0, 160),
    ylim=(0, None),  # pin the lower bound at 0; the upper bound stays automatic
)
plt.show()

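Note that the objective function minimized by SVC is $C\sum_{i=1}^n z_i+\frac{1}{2}\sum_{j=1}^m w_j^2$, where $n$ is the number of observations and $m$ is the number of features. If our objective function is $C\sum_{i=1}^n z_i+\sum_{j=1}^m w_j^2$, then we need to pass $C/2$ as the `C` argument of SVC: halving our objective does not change its minimizer and puts it in SVC's form.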

# Assemble the two-column feature matrix [Income, Credit] (one row per loan)
# and flatten the label column into the 1-D vector that fit() is given.
X = np.column_stack((Income.values, Credit.values))
y = Loan.values.ravel()

# Linear-kernel SVC. C is given on SVC's own scale; per the note on the two
# objective functions, 0.005 is the value to use for C = 0.01 in the text's
# objective.
clf = SVC(kernel='linear', C=0.005, tol=1e-5)
clf.fit(X, y)

SVC(C=0.005, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=1e-05, verbose=False)

# Read the fitted line off the classifier in the form w1*x1 + w2*x2 = b.
# The intercept is negated because it is moved to the right-hand side.
w, b = clf.coef_[0], -clf.intercept_[0]
print(w, b, sep='\n')

[0.05405405 0.02162153]
5.0540482894429495

The outer lines are $w_1x_1+w_2x_2 = b_u$ and $w_1x_1+w_2x_2=b_d$. The middle line is $w_1x_1+w_2x_2=b$, where $b_u=b+1$ and $b_d=b-1$. The width of the path is $\frac{2}{\sqrt{w_1^2+w_2^2}}$.

# Draw the fitted pathway over the data: the middle line w1*x1 + w2*x2 = b
# (solid) and the two outer lines at b + 1 and b - 1 (dashed). Each line is
# solved for x2 (credit score) as a function of x1 (income).
x1 = np.linspace(0, 160, num=100)
w1, w2 = w
bu, bd = b + 1, b - 1
y1 = (bu - w1 * x1) / w2  # outer line at b + 1
y2 = (bd - w1 * x1) / w2  # outer line at b - 1
y0 = (b - w1 * x1) / w2   # middle of the pathway

plt.figure(figsize=(10, 8))
plt.scatter(Income, Credit, c=Loan)
for edge in (y1, y2):
    plt.plot(x1, edge, '--', color='skyblue')
plt.plot(x1, y0, '-')
axes = plt.gca()
axes.set(
    xlabel='Income',
    ylabel='Adjusted Credit Score',
    xlim=(0, 160),
    ylim=(0, None),  # pin the lower bound at 0; the upper bound stays automatic
)
plt.show()

Fit the model for different values of $C$ and report the percentage of loans misclassified as well as the width of the pathway.

# Sweep the penalty C (on the scale of the text's objective, hence C / 2 is
# what SVC receives) and report, for each value, the fitted line, the share of
# training loans on the wrong side of the middle line, and the pathway width
# 2 / sqrt(w1^2 + w2^2).
for C in (0.01, 0.002, 0.001, 0.0005, 0.0003, 0.0002, 0.0001):
    clf = SVC(kernel='linear', C=C / 2, tol=1e-6).fit(X, y)
    w = clf.coef_[0]
    b = -clf.intercept_[0]
    S = clf.score(X, y)  # training accuracy as a fraction
    P = 2 / np.sqrt(w[0] ** 2 + w[1] ** 2)
    misclassified_pct = 100 * (1 - S)
    report = (C, w[0], w[1], b, misclassified_pct, P)
    print("C = %6.4f, w1 = %6.4f, w2 = %6.4f, b = %5.2f, Loan Misclassified = %3.0f%%, Width = %5.1f" % report)

C = 0.0100, w1 = 0.0541, w2 = 0.0216, b =  5.05, Loan Misclassified =  10%, Width =  34.4
C = 0.0020, w1 = 0.0409, w2 = 0.0132, b =  3.48, Loan Misclassified =  10%, Width =  46.6
C = 0.0010, w1 = 0.0397, w2 = 0.0122, b =  3.33, Loan Misclassified =  10%, Width =  48.2
C = 0.0005, w1 = 0.0265, w2 = 0.0100, b =  2.46, Loan Misclassified =  10%, Width =  70.6
C = 0.0003, w1 = 0.0187, w2 = 0.0057, b =  1.79, Loan Misclassified =  20%, Width = 102.2
C = 0.0002, w1 = 0.0185, w2 = 0.0033, b =  1.69, Loan Misclassified =  30%, Width = 106.6
C = 0.0001, w1 = 0.0108, w2 = 0.0015, b =  0.97, Loan Misclassified =  30%, Width = 183.4

This shows that for $C=0.0001$, 30% of the loans are not correctly classified.

# Refit with C = 0.00005 on SVC's scale, i.e. half of the smallest value in
# the sweep (C = 0.0001), and print the resulting line w1*x1 + w2*x2 = b.
clf = SVC(kernel='linear', C=0.00005, tol=1e-5).fit(X, y)
w, b = clf.coef_[0], -clf.intercept_[0]
print(w, b, sep='\n')

[0.0108 0.0015]
0.9705000000000001

# Draw the pathway for the small-C fit: the middle line w1*x1 + w2*x2 = b
# (solid) and the two outer lines at b + 1 and b - 1 (dashed), each solved for
# x2 (credit score) as a function of x1 (income). The x-range is widened to
# [-20, 180] for this plot.
x1 = np.linspace(-20, 180, 100)
w1 = w[0]
w2 = w[1]
bu = b + 1
bd = b - 1
y1 = (bu - w1 * x1) / w2
y2 = (bd - w1 * x1) / w2
# The middle line is stored in y0, not y: y holds the training labels passed
# to clf.fit / clf.score, and overwriting it would break any later refit or
# re-run of an earlier cell.
y0 = (b - w1 * x1) / w2
plt.figure(figsize=(10, 8))
plt.scatter(Income, Credit, c=Loan)
plt.plot(x1, y1, '--', color='skyblue')
plt.plot(x1, y2, '--', color='skyblue')
plt.plot(x1, y0, '-')
plt.xlabel('Income')
plt.ylabel('Adjusted Credit Score')
axes = plt.gca()
axes.set_xlim([-20, 180])
axes.set_ylim([0, None])  # pin the lower bound at 0; the upper bound stays automatic
plt.show()