In [1]:
# Importing necessary packages
import pandas as pd # Python's data handling package
import numpy as np # Python's scientific computing package
import matplotlib.pyplot as plt # Python's plotting package
from sklearn.metrics import mean_squared_error as mse
Loading data.
In [2]:
# Both features and target have already been scaled: mean = 0; SD = 1
data = pd.read_csv('Houseprice_data_scaled.csv') 
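A quick shape check (not part of the original notebook) confirms the file has enough rows for the split used next:

# Expect at least 2400 rows: 1800 for training plus 600 for validation
print(data.shape)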
In [3]:
# The first 1800 rows are the training set; the next 600 are the validation set
train = data.iloc[:1800] 
val = data.iloc[1800:2400]
In [4]:
# Creating the "X" and "y" variables; the target, Sale Price, is dropped from "X"
X_train, X_val = train.drop('Sale Price', axis=1), val.drop('Sale Price', axis=1)
y_train, y_val = train[['Sale Price']], val[['Sale Price']] 
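As a sanity check (not in the original notebook): since the file was scaled before splitting, the training features should have means near 0 and standard deviations near 1.

# Column means should be near 0 and standard deviations near 1
print(X_train.mean().abs().max())
print(X_train.std().max())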
Linear Regression
In [5]:
# Importing models
from sklearn.linear_model import LinearRegression
In [6]:
lr = LinearRegression()
lr.fit(X_train, y_train)
Out[6]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [7]:
# Create a DataFrame pairing each feature with its fitted coefficient
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train.columns),
        list(lr.intercept_) + list(lr.coef_[0])
    ]
).transpose().set_index(0)
coeffs
Out[7]:
feature       coefficient
intercept -0.0136147
LotArea 0.0674861
OverallQual 0.214642
OverallCond 0.0954426
YearBuilt 0.16075
YearRemodAdd 0.0252969
BsmtFinSF1 0.0899544
BsmtUnfSF -0.0325298
TotalBsmtSF 0.13748
1stFlrSF 0.151673
2ndFlrSF 0.130767
GrLivArea 0.157736
FullBath -0.0207088
HalfBath 0.016989
BedroomAbvGr -0.083825
TotRmsAbvGrd 0.0804489
Fireplaces 0.0280662
GarageCars 0.0383846
GarageArea 0.0523007
WoodDeckSF 0.0208165
OpenPorchSF 0.0345543
EnclosedPorch 0.00709801
Blmngtn -3.55804e+10
Blueste -2.13297e+10
BrDale -3.68164e+10
BrkSide -6.85937e+10
ClearCr -4.44784e+10
CollgCr -1.05215e+11
Crawfor -6.70316e+10
Edwards -9.02604e+10
Gilbert -8.42928e+10
IDOTRR -6.30997e+10
MeadowV -4.0837e+10
Mitchel -7.07134e+10
Names -1.30932e+11
NoRidge -5.54553e+10
NPkVill -3.22755e+10
NriddgHt -8.42928e+10
NWAmes -7.55717e+10
OLDTown -1.00071e+11
SWISU -4.64237e+10
Sawyer -8.08431e+10
SawyerW -7.39005e+10
Somerst -8.8254e+10
StoneBr -4.78273e+10
Timber -5.66181e+10
Veenker -3.29639e+10
Bsmt Qual 0.0116689
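The implausibly large negative coefficients on the neighborhood dummy variables are a symptom of near-perfect multicollinearity: the dummies sum to (almost) a constant column, which is collinear with the intercept, so ordinary least squares cannot pin down their individual values. This is exactly the instability that the regularized models below damp down. For comparison with them, a short sketch (not in the original notebook) computes this model's validation MSE:

# Validation MSE of the unregularized regression, as a baseline for Ridge and Lasso
pred_lr = lr.predict(X_val)
print(mse(y_val, pred_lr))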
Ridge Regression
In [8]:
# Importing Ridge
from sklearn.linear_model import Ridge
In [9]:
# sklearn's Ridge alpha is the lambda in Hull's book times the number of observations (1800)
lambdas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.2, 0.4]
alphas = [1800 * lam for lam in lambdas]
mses = []
for alpha in alphas:
    # Fit Ridge at each alpha and record the validation-set MSE
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    pred = ridge.predict(X_val)
    mses.append(mse(y_val, pred))
    print(mses[-1])
0.11430912602818623
0.11438831890699627
0.11452201329565981
0.11469744977061369
0.11490557532487583
0.11553053137615601
0.1162599646587543
0.11972299505022367
0.1277372905719841
In [10]:
# Plot validation MSE against alpha
plt.plot(alphas, mses)
Out[10]:
[Plot of validation MSE against alpha for Ridge regression]
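From the printed values, the validation MSE increases monotonically with alpha over this grid, so the smallest alpha (0.01*1800 = 18) performs best here. A short sketch (not in the original notebook) reads the best grid point off programmatically:

# Grid point with the smallest validation MSE
best = int(np.argmin(mses))
print(alphas[best], mses[best])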
Lasso
In [11]:
# Import Lasso
from sklearn.linear_model import Lasso
In [12]:
# Here we produce results for alpha=0.05, which corresponds to lambda=0.1 in Hull's book
lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)
Out[12]:
Lasso(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
In [13]:
# DataFrame pairing each feature with its fitted Lasso coefficient
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train.columns),
        list(lasso.intercept_) + list(lasso.coef_)
    ]
).transpose().set_index(0)
coeffs
Out[13]:
feature       coefficient
intercept -0.0129255
LotArea 0.0442383
OverallQual 0.300162
OverallCond 0
YearBuilt 0.0542006
YearRemodAdd 0.0647997
BsmtFinSF1 0.114129
BsmtUnfSF -0
TotalBsmtSF 0.105414
1stFlrSF 0.0283594
2ndFlrSF 0
GrLivArea 0.29196
FullBath 0
HalfBath 0
BedroomAbvGr -0
TotRmsAbvGrd 0
Fireplaces 0.01876
GarageCars 0.0262154
GarageArea 0.0656378
WoodDeckSF 0
OpenPorchSF 0.000220707
EnclosedPorch -0
Blmngtn -0
Blueste -0
BrDale -0
BrkSide 0
ClearCr 0
CollgCr -0
Crawfor 0
Edwards -0
Gilbert 0
IDOTRR -0
MeadowV -0
Mitchel -0
Names -0
NoRidge 0.0143427
NPkVill -0
NriddgHt 0.0835168
NWAmes -0
OLDTown -0
SWISU -0
Sawyer -0
SawyerW -0
Somerst 0
StoneBr 0.0150803
Timber 0
Veenker 0
Bsmt Qual 0.0156601
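Unlike Ridge, Lasso drives many coefficients exactly to zero, as the table shows. A quick sketch (not in the original notebook) counts how many features survive at alpha=0.05:

# Count the features whose Lasso coefficient is non-zero
n_kept = int(np.sum(lasso.coef_ != 0))
print(n_kept, 'of', X_train.shape[1], 'features retained')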
Lasso with different levels of alpha and the resulting MSEs
In [14]:
# We now consider different lambda values; sklearn's Lasso alpha is half the corresponding lambda
lambdas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1]
alphas = [lam / 2 for lam in lambdas]
mses = []
for alpha in alphas:
    # Fit Lasso at each alpha and record the validation-set MSE
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    pred = lasso.predict(X_val)
    mses.append(mse(y_val, pred))
    print(mses[-1])
0.11386863624693959
0.11411666655734329
0.11526461827765332
0.11732406234518182
0.12020611819731855
0.12921610751419008
0.13745407671806387
In [15]:
# Plot validation MSE against alpha
plt.plot(alphas, mses)
Out[15]:
[Plot of validation MSE against alpha for Lasso]
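Rather than scanning a hand-picked grid against a single validation set, scikit-learn's LassoCV can choose alpha by k-fold cross-validation. A minimal sketch follows (not part of the original notebook, and its chosen alpha need not match the grid search above):

from sklearn.linear_model import LassoCV

# 5-fold cross-validation over an automatically generated grid of alphas
lasso_cv = LassoCV(cv=5)
lasso_cv.fit(X_train, y_train.values.ravel())  # ravel() flattens the single-column target
print(lasso_cv.alpha_)  # alpha selected by cross-validation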