In [17]:
# Import the necessary packages
import pandas as pd  # Python's data-handling package
import numpy as np  # Python's scientific-computing package
import matplotlib.pyplot as plt  # Python's plotting package
from sklearn.metrics import mean_squared_error as mse
Loading data.
In [18]:
# Both features and target have already been scaled: mean = 0; SD = 1
data = pd.read_csv('Houseprice_data_scaled.csv') 
In [19]:
# The first 1800 rows form the training set; the next 600 form the validation set
train = data.iloc[:1800] 
val = data.iloc[1800:2400]
In [20]:
# Create the "X" and "y" variables; "Sale Price" is the target, so we drop it from "X"
X_train, X_val = train.drop('Sale Price', axis=1), val.drop('Sale Price', axis=1)
y_train, y_val = train[['Sale Price']], val[['Sale Price']] 
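Since everything downstream assumes the data were pre-scaled, a quick sanity check is worthwhile (a minimal sketch reusing the data object above): each column should have mean ≈ 0 and SD ≈ 1.
In [ ]:
# Sanity check: every column should be standardized (mean ~0, SD ~1)
print(data.mean().abs().max())      # largest absolute column mean; should be near 0
print(data.std(ddof=0).round(2).unique())  # column SDs; should all be near 1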
Linear Regression
In [21]:
# Import the linear regression model
from sklearn.linear_model import LinearRegression
In [22]:
lr = LinearRegression()
lr.fit(X_train, y_train)
Out[22]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [23]:
# Create a DataFrame pairing each feature (plus the intercept) with its fitted coefficient
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train.columns),
        list(lr.intercept_) + list(lr.coef_[0])
    ]
).transpose().set_index(0)
coeffs
Out[23]:
Feature        Coefficient
intercept -0.0135714
LotArea 0.0675036
OverallQual 0.214606
OverallCond 0.0954381
YearBuilt 0.16072
YearRemodAdd 0.0253491
BsmtFinSF1 0.0901037
BsmtUnfSF -0.0324273
TotalBsmtSF 0.137416
1stFlrSF 0.151475
2ndFlrSF 0.130551
GrLivArea 0.15802
FullBath -0.0207461
HalfBath 0.016897
BedroomAbvGr -0.083759
TotRmsAbvGrd 0.0804692
Fireplaces 0.0279582
GarageCars 0.0384211
GarageArea 0.0522098
WoodDeckSF 0.0208242
OpenPorchSF 0.0346549
EnclosedPorch 0.00702863
Blmngtn -0.0174231
Blueste -0.0158862
BrDale -0.0234791
BrkSide 0.0203627
ClearCr -0.00676113
CollgCr -0.00687137
Crawfor 0.0383282
Edwards -0.000757581
Gilbert -0.00804272
IDOTRR -0.00182079
MeadowV -0.0169572
Mitchel -0.0299928
Names -0.036814
NoRidge 0.0485983
NPkVill -0.019885
NriddgHt 0.119826
NWAmes -0.0484639
OLDTown -0.0271789
SWISU -0.00391513
Sawyer -0.0185853
SawyerW -0.0277074
Somerst 0.0262406
StoneBr 0.0649378
Timber -0.00287523
Veenker 0.00234418
Bsmt Qual 0.0116892
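As a baseline for the regularized models below, it helps to record the validation-set MSE of the unregularized fit (a minimal sketch reusing lr, X_val, y_val, and mse from above).
In [ ]:
# Baseline: validation-set MSE for plain linear regression
pred_lr = lr.predict(X_val)
print(mse(y_val, pred_lr))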
Ridge Regression
In [24]:
# Importing Ridge
from sklearn.linear_model import Ridge
In [25]:
# scikit-learn's Ridge alpha corresponds to lambda in Hull's book times the number of training observations (1800)
alphas = [lam * 1800 for lam in [0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.2, 0.4]]
mses = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    pred = ridge.predict(X_val)
    mses.append(mse(y_val, pred))
    print(mse(y_val, pred))
0.11430912602818623
0.11438831890699627
0.11452201329565978
0.11469744977061372
0.1149055753248758
0.11553053137615597
0.11625996465875431
0.11972299505022363
0.12773729057198405
In [26]:
plt.plot(alphas, mses)
Out[26]:
[Plot: validation-set MSE versus alpha for Ridge]
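To read the best setting off the curve programmatically rather than by eye, take the argmin over the recorded MSEs (a sketch assuming the alphas and mses lists from the loop above).
In [ ]:
# Alpha with the lowest validation-set MSE
best = int(np.argmin(mses))
print(alphas[best], mses[best])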
Lasso
In [27]:
# Import Lasso
from sklearn.linear_model import Lasso
In [28]:
# Results for alpha=0.05, which corresponds to lambda=0.1 in Hull's book
lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)
Out[28]:
Lasso(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
In [29]:
# DataFrame pairing each feature (plus the intercept) with its Lasso coefficient
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train.columns),
        list(lasso.intercept_) + list(lasso.coef_)
    ]
).transpose().set_index(0)
coeffs
Out[29]:
Feature        Coefficient
intercept -0.0129255
LotArea 0.0442383
OverallQual 0.300162
OverallCond 0
YearBuilt 0.0542006
YearRemodAdd 0.0647997
BsmtFinSF1 0.114129
BsmtUnfSF -0
TotalBsmtSF 0.105414
1stFlrSF 0.0283594
2ndFlrSF 0
GrLivArea 0.29196
FullBath 0
HalfBath 0
BedroomAbvGr -0
TotRmsAbvGrd 0
Fireplaces 0.01876
GarageCars 0.0262154
GarageArea 0.0656378
WoodDeckSF 0
OpenPorchSF 0.000220707
EnclosedPorch -0
Blmngtn -0
Blueste -0
BrDale -0
BrkSide 0
ClearCr 0
CollgCr -0
Crawfor 0
Edwards -0
Gilbert 0
IDOTRR -0
MeadowV -0
Mitchel -0
Names -0
NoRidge 0.0143427
NPkVill -0
NriddgHt 0.0835168
NWAmes -0
OLDTown -0
SWISU -0
Sawyer -0
SawyerW -0
Somerst 0
StoneBr 0.0150803
Timber 0
Veenker 0
Bsmt Qual 0.0156601
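The zero entries show Lasso's built-in feature selection at work. A quick count of surviving features (a minimal sketch reusing the fitted lasso object above):
In [ ]:
# Number of features with non-zero Lasso coefficients
nonzero = int(np.sum(lasso.coef_ != 0))
print(f'{nonzero} of {X_train.shape[1]} features retained')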
Lasso with different alpha values and the resulting MSEs
In [30]:
# We now consider a range of lambda values; scikit-learn's Lasso alpha is half of Hull's lambda
alphas = [lam / 2 for lam in [0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1]]
mses = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    pred = lasso.predict(X_val)
    mses.append(mse(y_val, pred))
    print(mse(y_val, pred))
0.11386863624693959
0.11411666655734329
0.11526461827765332
0.11732406234518182
0.12020611819731855
0.12921610751419008
0.13745407671806387
In [31]:
plt.plot(alphas, mses)
Out[31]:
[Plot: validation-set MSE versus alpha for Lasso]
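An alternative to the fixed train/validation split is cross-validation. Below is a minimal LassoCV sketch; note it selects alpha by 5-fold cross-validation on the training set, so the chosen alpha need not match the curve above.
In [ ]:
from sklearn.linear_model import LassoCV

# 5-fold cross-validation over the same alpha grid
lasso_cv = LassoCV(alphas=alphas, cv=5)
lasso_cv.fit(X_train, y_train.values.ravel())  # LassoCV expects a 1-D target
print(lasso_cv.alpha_)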