In [17]:
# Import the necessary packages
import pandas as pd  # Python's data-handling package
import numpy as np  # Python's scientific-computing package
import matplotlib.pyplot as plt  # Python's plotting package
from sklearn.metrics import mean_squared_error as mse
Loading data.
In [18]:
# Both features and target have already been scaled: mean = 0; SD = 1
data = pd.read_csv('Houseprice_data_scaled.csv') 
In [19]:
# The first 1800 rows form the training set; the next 600 form the validation set
train = data.iloc[:1800] 
val = data.iloc[1800:2400]
In [20]:
# Create the "X" and "y" variables; "Sale Price" is the target, so we drop it from "X"
X_train, X_val = train.drop('Sale Price', axis=1), val.drop('Sale Price', axis=1)
y_train, y_val = train[['Sale Price']], val[['Sale Price']] 
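Since everything downstream assumes the data were pre-scaled, a quick sanity check is worthwhile (a minimal sketch reusing the data object above): each column should have mean ≈ 0 and SD ≈ 1.
In [ ]:
# Sanity check: every column should be standardized (mean ~0, SD ~1)
print(data.mean().abs().max())      # largest absolute column mean; should be near 0
print(data.std(ddof=0).round(2).unique())  # column SDs; should all be near 1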
Linear Regression
In [21]:
# Import the linear regression model
from sklearn.linear_model import LinearRegression
In [22]:
lr = LinearRegression()
lr.fit(X_train, y_train)
Out[22]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [23]:
# Create a DataFrame pairing each feature (plus the intercept) with its fitted coefficient
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train.columns),
        list(lr.intercept_) + list(lr.coef_[0])
    ]
).transpose().set_index(0)
coeffs
Out[23]:
Feature        Coefficient
intercept -0.0135714
LotArea 0.0675036
OverallQual 0.214606
OverallCond 0.0954381
YearBuilt 0.16072
YearRemodAdd 0.0253491
BsmtFinSF1 0.0901037
BsmtUnfSF -0.0324273
TotalBsmtSF 0.137416
1stFlrSF 0.151475
2ndFlrSF 0.130551
GrLivArea 0.15802
FullBath -0.0207461
HalfBath 0.016897
BedroomAbvGr -0.083759
TotRmsAbvGrd 0.0804692
Fireplaces 0.0279582
GarageCars 0.0384211
GarageArea 0.0522098
WoodDeckSF 0.0208242
OpenPorchSF 0.0346549
EnclosedPorch 0.00702863
Blmngtn -0.0174231
Blueste -0.0158862
BrDale -0.0234791
BrkSide 0.0203627
ClearCr -0.00676113
CollgCr -0.00687137
Crawfor 0.0383282
Edwards -0.000757581
Gilbert -0.00804272
IDOTRR -0.00182079
MeadowV -0.0169572
Mitchel -0.0299928
Names -0.036814
NoRidge 0.0485983
NPkVill -0.019885
NriddgHt 0.119826
NWAmes -0.0484639
OLDTown -0.0271789
SWISU -0.00391513
Sawyer -0.0185853
SawyerW -0.0277074
Somerst 0.0262406
StoneBr 0.0649378
Timber -0.00287523
Veenker 0.00234418
Bsmt Qual 0.0116892
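As a baseline for the regularized models below, it helps to record the validation-set MSE of the unregularized fit (a minimal sketch reusing lr, X_val, y_val, and mse from above).
In [ ]:
# Baseline: validation-set MSE for plain linear regression
pred_lr = lr.predict(X_val)
print(mse(y_val, pred_lr))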
Ridge Regression
In [24]:
# Importing Ridge
from sklearn.linear_model import Ridge
In [25]:
# scikit-learn's Ridge alpha corresponds to lambda in Hull's book times the number of training observations (1800)
alphas = [lam * 1800 for lam in [0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.2, 0.4]]
mses = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    pred = ridge.predict(X_val)
    mses.append(mse(y_val, pred))
    print(mse(y_val, pred))
0.11430912602818623
0.11438831890699627
0.11452201329565978
0.11469744977061372
0.1149055753248758
0.11553053137615597
0.11625996465875431
0.11972299505022363
0.12773729057198405
In [26]:
plt.plot(alphas, mses)
Out[26]:
[Plot: validation-set MSE versus alpha for Ridge]
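To read the best setting off the curve programmatically rather than by eye, take the argmin over the recorded MSEs (a sketch assuming the alphas and mses lists from the loop above).
In [ ]:
# Alpha with the lowest validation-set MSE
best = int(np.argmin(mses))
print(alphas[best], mses[best])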
Lasso
In [27]:
# Import Lasso
from sklearn.linear_model import Lasso
In [28]:
# Results for alpha=0.05, which corresponds to lambda=0.1 in Hull's book
lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)
Out[28]:
Lasso(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
In [29]:
# DataFrame pairing each feature (plus the intercept) with its Lasso coefficient
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train.columns),
        list(lasso.intercept_) + list(lasso.coef_)
    ]
).transpose().set_index(0)
coeffs
Out[29]:
Feature        Coefficient
intercept -0.0129255
LotArea 0.0442383
OverallQual 0.300162
OverallCond 0
YearBuilt 0.0542006
YearRemodAdd 0.0647997
BsmtFinSF1 0.114129
BsmtUnfSF -0
TotalBsmtSF 0.105414
1stFlrSF 0.0283594
2ndFlrSF 0
GrLivArea 0.29196
FullBath 0
HalfBath 0
BedroomAbvGr -0
TotRmsAbvGrd 0
Fireplaces 0.01876
GarageCars 0.0262154
GarageArea 0.0656378
WoodDeckSF 0
OpenPorchSF 0.000220707
EnclosedPorch -0
Blmngtn -0
Blueste -0
BrDale -0
BrkSide 0
ClearCr 0
CollgCr -0
Crawfor 0
Edwards -0
Gilbert 0
IDOTRR -0
MeadowV -0
Mitchel -0
Names -0
NoRidge 0.0143427
NPkVill -0
NriddgHt 0.0835168
NWAmes -0
OLDTown -0
SWISU -0
Sawyer -0
SawyerW -0
Somerst 0
StoneBr 0.0150803
Timber 0
Veenker 0
Bsmt Qual 0.0156601
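The zero entries show Lasso's built-in feature selection at work. A quick count of surviving features (a minimal sketch reusing the fitted lasso object above):
In [ ]:
# Number of features with non-zero Lasso coefficients
nonzero = int(np.sum(lasso.coef_ != 0))
print(f'{nonzero} of {X_train.shape[1]} features retained')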
Lasso with different alpha values and the resulting MSEs
In [30]:
# We now consider a range of lambda values; scikit-learn's Lasso alpha is half of Hull's lambda
alphas = [lam / 2 for lam in [0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1]]
mses = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    pred = lasso.predict(X_val)
    mses.append(mse(y_val, pred))
    print(mse(y_val, pred))
0.11386863624693959
0.11411666655734329
0.11526461827765332
0.11732406234518182
0.12020611819731855
0.12921610751419008
0.13745407671806387
In [31]:
plt.plot(alphas, mses)
Out[31]:
[Plot: validation-set MSE versus alpha for Lasso]
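An alternative to the fixed train/validation split is cross-validation. Below is a minimal LassoCV sketch; note it selects alpha by 5-fold cross-validation on the training set, so the chosen alpha need not match the curve above.
In [ ]:
from sklearn.linear_model import LassoCV

# 5-fold cross-validation over the same alpha grid
lasso_cv = LassoCV(alphas=alphas, cv=5)
lasso_cv.fit(X_train, y_train.values.ravel())  # LassoCV expects a 1-D target
print(lasso_cv.alpha_)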