In [193]:
# Importing necessary packages
import pandas as pd # python's data handling package
import numpy as np # python's scientific computing package
import matplotlib.pyplot as plt # python's plotting package
from sklearn.metrics import mean_squared_error as mse
Loading data.
In [194]:
# Both features and target have already been scaled: mean = 0; SD = 1
data = pd.read_csv('Houseprice_data_scaled.csv') 
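As a quick sanity check (a sketch, not part of the original notebook), the scaling can be verified directly:

# Sanity check: every column should have mean close to 0 and SD close to 1
print(data.shape)
print(data.mean().abs().max())
print((data.std() - 1).abs().max())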
In [195]:
# The first 1800 data items are the training set; the next 600 are the validation set; the final 508 are the test set
train = data.iloc[:1800] 
val = data.iloc[1800:2400]
test = data.iloc[2400:2908]
In [196]:
# Creating the "X" and "y" variables. We drop sale price from "X"
X_train, X_val, X_test = train.drop('Sale Price', axis=1), val.drop('Sale Price', axis=1), test.drop('Sale Price', axis=1)
y_train, y_val, y_test = train[['Sale Price']], val[['Sale Price']], test[['Sale Price']]
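A brief check (again only a sketch) that the split and the feature/target separation look right:

# Confirm split sizes and that the target is absent from the features
print(X_train.shape, X_val.shape, X_test.shape)
assert 'Sale Price' not in X_train.columns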
Linear Regression
In [197]:
# Importing models
from sklearn.linear_model import LinearRegression
In [198]:
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_train)
print(mse(y_train, pred))   # training-set MSE
pred = lr.predict(X_val)
print(mse(y_val, pred))     # validation-set MSE
0.1140152643124634
0.11702499460121657
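Because the target is standardized, these MSEs are in units of squared standard deviations of sale price. A sketch converting them to RMSEs for easier interpretation:

# RMSE, measured in standard deviations of sale price
rmse_train = np.sqrt(mse(y_train, lr.predict(X_train)))
rmse_val = np.sqrt(mse(y_val, lr.predict(X_val)))
print(rmse_train, rmse_val)   # roughly 0.34 SD each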
In [199]:
# Create a DataFrame pairing each feature with its coefficient
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train.columns),
        list(lr.intercept_) + list(lr.coef_[0])
    ]
).transpose().set_index(0)
coeffs
Out[199]:
feature          coefficient
intercept -3.06336e-11
LotArea 0.0789996
OverallQual 0.214395
OverallCond 0.0964787
YearBuilt 0.160799
YearRemodAdd 0.0253524
BsmtFinSF1 0.0914664
BsmtUnfSF -0.0330798
TotalBsmtSF 0.138199
1stFlrSF 0.152786
2ndFlrSF 0.132765
GrLivArea 0.161303
FullBath -0.0208076
HalfBath 0.0171941
BedroomAbvGr -0.0835202
TotRmsAbvGrd 0.0832203
Fireplaces 0.0282578
GarageCars 0.0379971
GarageArea 0.0518093
WoodDeckSF 0.0208337
OpenPorchSF 0.0340982
EnclosedPorch 0.00682223
Blmngtn -0.0184305
Blueste -0.0129214
BrDale -0.0246262
BrkSide 0.0207618
ClearCr -0.00737828
CollgCr -0.00675362
Crawfor 0.0363235
Edwards -0.000690065
Gilbert -0.00834022
IDOTRR -0.00153683
MeadowV -0.016418
Mitchel -0.0284821
Names -0.0385057
NoRidge 0.0515626
NPkVill -0.0219519
NriddgHt 0.12399
NWAmes -0.0517591
OLDTown -0.026499
SWISU -0.00414298
Sawyer -0.0181341
SawyerW -0.0282754
Somerst 0.0275063
StoneBr 0.0630586
Timber -0.00276173
Veenker 0.00240311
Bsmt Qual 0.0113115
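Because all features share a common scale, coefficient magnitudes are directly comparable. A sketch ranking the strongest features (column 1 of the coeffs DataFrame holds the coefficients):

# Rank features by absolute coefficient size
ranked = coeffs.drop('intercept')[1].astype(float).abs().sort_values(ascending=False)
print(ranked.head(10))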
Ridge Regression
In [200]:
# Importing Ridge
from sklearn.linear_model import Ridge
In [201]:
# The alpha used by sklearn's Ridge should be the lambda in Hull's book times the number of training observations (1800)
alphas = [0.01*1800, 0.02*1800, 0.03*1800, 0.04*1800, 0.05*1800, 0.075*1800, 0.1*1800, 0.2*1800, 0.6*1800, 1.0*1800]
mses = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    pred = ridge.predict(X_val)
    mses.append(mse(y_val, pred))
    print(mse(y_val, pred))
0.11703284346091346
0.11710797319752994
0.11723952924901127
0.11741457158889525
0.11762384068711469
0.11825709631198024
0.11900057469147929
0.12254649996292954
0.13950882506243753
0.15786905738744897
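A sketch mapping each alpha back to Hull's lambda and flagging the validation minimum (here the smallest alpha tried):

best = int(np.argmin(mses))
print('best alpha =', alphas[best], '-> Hull lambda =', alphas[best] / 1800)
print('validation MSE =', mses[best])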
In [202]:
plt.plot(alphas, mses)
Out[202]:
[<matplotlib.lines.Line2D at 0x160ebc0a4c8>]
Lasso
In [203]:
# Import Lasso
from sklearn.linear_model import Lasso
In [204]:
# Here we produce results for alpha=0.05, which corresponds to lambda=0.1 in Hull's book
lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)
Out[204]:
Lasso(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
In [205]:
# DataFrame pairing each feature with its lasso coefficient
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train.columns),
        list(lasso.intercept_) + list(lasso.coef_)
    ]
).transpose().set_index(0)
coeffs
Out[205]:
feature          coefficient
intercept -1.25303e-11
LotArea 0.0443042
OverallQual 0.298079
OverallCond 0
YearBuilt 0.0520907
YearRemodAdd 0.0644712
BsmtFinSF1 0.115875
BsmtUnfSF -0
TotalBsmtSF 0.10312
1stFlrSF 0.0322946
2ndFlrSF 0
GrLivArea 0.297065
FullBath 0
HalfBath 0
BedroomAbvGr -0
TotRmsAbvGrd 0
Fireplaces 0.0204043
GarageCars 0.027512
GarageArea 0.0664096
WoodDeckSF 0.00102883
OpenPorchSF 0.00215018
EnclosedPorch -0
Blmngtn -0
Blueste -0
BrDale -0
BrkSide 0
ClearCr 0
CollgCr -0
Crawfor 0
Edwards -0
Gilbert 0
IDOTRR -0
MeadowV -0
Mitchel -0
Names -0
NoRidge 0.013209
NPkVill -0
NriddgHt 0.0842993
NWAmes -0
OLDTown -0
SWISU -0
Sawyer -0
SawyerW -0
Somerst 0
StoneBr 0.0168153
Timber 0
Veenker 0
Bsmt Qual 0.0202754
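Unlike ridge, lasso drives many coefficients exactly to zero. A sketch counting the surviving features:

# Count nonzero lasso coefficients (the -0 entries count as zero)
coef = pd.Series(lasso.coef_, index=X_train.columns)
print((coef != 0).sum(), 'of', len(coef), 'coefficients are nonzero')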
Lasso with different levels of alpha and the resulting MSEs
In [206]:
# We now consider different lambda values. The alphas are half the lambdas
alphas = [0.01/2, 0.02/2, 0.03/2, 0.04/2, 0.05/2, 0.075/2, 0.1/2]
mses = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    pred = lasso.predict(X_val)
    mses.append(mse(y_val, pred))
    print(mse(y_val, pred))
0.11654751909608793
0.11682687945311092
0.11803348353132027
0.12012836764958999
0.12301536903084047
0.13178576395045638
0.14017194584483775
In [207]:
plt.plot(alphas, mses)
Out[207]:
[<matplotlib.lines.Line2D at 0x160eceba988>]
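The plot calls above leave the axes unlabeled; a purely cosmetic sketch of the same lasso plot:

plt.plot(alphas, mses, marker='o')
plt.xlabel('alpha (sklearn) = lambda / 2')
plt.ylabel('validation MSE')
plt.show()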
In [208]:
# Calculate the test-set MSE when Hull's lambda = 0.04 (sklearn alpha = lambda/2)
alpha = 0.04/2
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train)
pred = lasso.predict(X_test)
print(mse(y_test, pred))
0.12541988653891775
In [209]:
# Calculate the test-set MSE when Hull's lambda = 0.1
alpha = 0.1/2
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train)
pred = lasso.predict(X_test)
print(mse(y_test, pred))
0.14720538902033128
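A common follow-up, shown here only as a sketch (Hull's text does not do this): refit at the chosen lambda on the combined training and validation sets before reporting the test MSE.

# Assumption: reuse the first 2400 rows (train + validation) for the final fit
trainval = data.iloc[:2400]
X_tv = trainval.drop('Sale Price', axis=1)
y_tv = trainval[['Sale Price']]
lasso = Lasso(alpha=0.04/2)   # Hull's lambda = 0.04
lasso.fit(X_tv, y_tv)
print(mse(y_test, lasso.predict(X_test)))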