In [41]:
# This notebook carries out calculations for the example Table 5.1 of Chapter 5 when the adjusted credit score 
# for the second loan is 140 instead of 30 and the income for the eighth loan is 60 rather than 95.
# Early printings of the book do not use this example. As indicated in the Errata they mistakenly calculate
# the misclassified loans with reference to the outer edges of the path rather than with reference to the middle of the path.
In [42]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
In [43]:
# Ten-loan sample for the modified Table 5.1 example: the second loan's adjusted
# credit score is 140 and the eighth loan's income is 60.
# Each variable is kept as its own single-column DataFrame.
_income_values = [30, 55, 63, 35, 28, 140, 100, 60, 64, 63]
_credit_values = [40, 140, 30, 80, 100, 30, 30, 90, 120, 150]
_loan_flags = [0] * 5 + [1] * 5  # first five rows are class 0, last five class 1

Income, Credit, Loan = map(pd.DataFrame, (_income_values, _credit_values, _loan_flags))
In [44]:
# Scatter of the raw sample: income on the x-axis, adjusted credit score on the
# y-axis, points coloured by the loan label.
fig, axes = plt.subplots(figsize=(10, 8))
axes.scatter(Income, Credit, c=Loan)
axes.set_xlabel('Income')
axes.set_ylabel('Adjusted Credit Score')
axes.set_xlim(0, 160)
axes.set_ylim(bottom=0)  # upper limit is left to autoscaling
plt.show()

Note that the objective function of SVC is $C\sum_{j=1}^n z_j+\frac{1}{2}\sum_{j=1}^nw_j^2$. If our objective function is $C\sum_{j=1}^n z_j+\sum_{j=1}^nw_j^2$, then we need to set $C=C/2$ in SVC, i.e. pass half of our $C$ to SVC.

In [45]:
# Feature matrix with one row per loan and columns (Income, Credit);
# the labels are flattened to a 1-D vector.
features = pd.concat([Income, Credit], axis=1)
X = np.asarray(features)
y = np.asarray(Loan).ravel()

# SVC minimises C*sum(z) + 0.5*sum(w^2). For the objective C*sum(z) + sum(w^2)
# the value handed to SVC is half of C, here 0.01 / 2 = 0.005.
clf = SVC(kernel='linear', C=0.01 / 2, tol=1e-5)
clf.fit(X, y)
Out[45]:
SVC(C=0.005, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=1e-05, verbose=False)
In [46]:
# The middle line is written as w1*x1 + w2*x2 = b, so b is the negated intercept
# of the fitted model.
w, b = clf.coef_[0], -clf.intercept_[0]
print(w, b, sep="\n")
[0.05405405 0.02162153]
5.0540482894429495

The outer lines are $w_1x_1+w_2x_2 = b_u$ and $w_1x_1+w_2x_2=b_d$. The middle line is $w_1x_1+w_2x_2=b$, where $b_u=b+1$ and $b_d=b-1$. The width of the path is $\frac{2}{\sqrt{w_1^2+w_2^2}}$.

In [47]:
# Draw the sample together with the middle line w1*x1 + w2*x2 = b (solid) and
# the two outer lines at b + 1 and b - 1 (dashed).
w1, w2 = w
bu, bd = b + 1, b - 1
x1 = np.linspace(0, 160, 100)


def credit_on_line(level):
    """Return x2 on the line w1*x1 + w2*x2 = level for every x1 in the grid."""
    return (level - w1 * x1) / w2


fig, axes = plt.subplots(figsize=(10, 8))
axes.scatter(Income, Credit, c=Loan)
# Outer lines first, middle line last, so the middle line takes the first default colour.
axes.plot(x1, credit_on_line(bu), '--', color='skyblue')
axes.plot(x1, credit_on_line(bd), '--', color='skyblue')
axes.plot(x1, credit_on_line(b), '-')
axes.set_xlabel('Income')
axes.set_ylabel('Adjusted Credit Score')
axes.set_xlim(0, 160)
axes.set_ylim(bottom=0)
plt.show()

We now fit the model for different values of $C$ and find the percentage of loans misclassified as well as the width of the pathway.

In [48]:
# Sweep over C values stated for the objective C*sum(z) + sum(w^2); SVC receives C/2.
# For each fit, report the weights, b, the share of training loans misclassified
# and the pathway width 2 / sqrt(w1^2 + w2^2).
report = "C = %6.4f, w1 = %6.4f, w2 = %6.4f, b = %5.2f, Loan Misclassified = %3.0f%%, Width = %5.1f"
for C in (0.01, 0.002, 0.001, 0.0005, 0.0003, 0.0002, 0.0001):
    clf = SVC(kernel='linear', C=C / 2, tol=1e-6).fit(X, y)
    w = clf.coef_[0]
    b = -clf.intercept_[0]
    misclassified_pct = 100 * (1 - clf.score(X, y))
    width = 2 / np.sqrt(w[0] ** 2 + w[1] ** 2)
    print(report % (C, w[0], w[1], b, misclassified_pct, width))
C = 0.0100, w1 = 0.0541, w2 = 0.0216, b =  5.05, Loan Misclassified =  10%, Width =  34.4
C = 0.0020, w1 = 0.0409, w2 = 0.0132, b =  3.48, Loan Misclassified =  10%, Width =  46.6
C = 0.0010, w1 = 0.0397, w2 = 0.0122, b =  3.33, Loan Misclassified =  10%, Width =  48.2
C = 0.0005, w1 = 0.0265, w2 = 0.0100, b =  2.46, Loan Misclassified =  10%, Width =  70.6
C = 0.0003, w1 = 0.0187, w2 = 0.0057, b =  1.79, Loan Misclassified =  20%, Width = 102.2
C = 0.0002, w1 = 0.0185, w2 = 0.0033, b =  1.69, Loan Misclassified =  30%, Width = 106.6
C = 0.0001, w1 = 0.0108, w2 = 0.0015, b =  0.97, Loan Misclassified =  30%, Width = 183.4

This shows that for $C=0.0001$, 30% of the loans are not correctly classified

In [49]:
# Refit with SVC's C = 0.0001 / 2 = 0.00005, i.e. C = 0.0001 for the
# C*sum(z) + sum(w^2) objective, then print the weights and b.
clf = SVC(kernel='linear', C=0.0001 / 2, tol=1e-5).fit(X, y)
w, b = clf.coef_[0], -clf.intercept_[0]
print(w, b, sep="\n")
[0.0108 0.0015]
0.9705000000000001
In [50]:
# Plot the sample with the decision lines of the most recently fitted model (w, b):
# the middle line w1*x1 + w2*x2 = b (solid) and the outer lines at b + 1 and b - 1 (dashed).
x1 = np.linspace(-20,180,100)
w1 = w[0]
w2 = w[1]
bu = b+1
bd = b-1
# Solve w1*x1 + w2*x2 = level for x2 at each level.
y1 = (bu-w1*x1)/w2
y2 = (bd-w1*x1)/w2
# Stored as y0, not y: y holds the class labels passed to clf.fit(X, y), and
# overwriting it would make the fitting cells fail when re-run after this cell.
y0 = (b-w1*x1)/w2
plt.figure(figsize=(10,8))
plt.scatter(Income, Credit, c=Loan)
plt.plot(x1,y1,'--',color='skyblue')
plt.plot(x1,y2,'--',color='skyblue')
plt.plot(x1,y0,'-')
plt.xlabel('Income')
plt.ylabel('Adjusted Credit Score')
axes = plt.gca()
axes.set_xlim([-20,180])
axes.set_ylim([0,None])
plt.show()
In [ ]:
 
In [ ]: