In [1]:
# Importing necessary packages
import pandas as pd # Python's data handling package
import numpy as np # Python's scientific computing package
import matplotlib.pyplot as plt # Python's plotting package
from sklearn.metrics import mean_squared_error as mse
Loading data.
In [2]:
# Both features and target have already been scaled: mean = 0; SD = 1
data = pd.read_csv('Houseprice_data_scaled.csv') 
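A quick shape check (not part of the original notebook) confirms the file has enough rows for the split used next:

# Expect at least 2400 rows: 1800 for training plus 600 for validation
print(data.shape)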
In [3]:
# The first 1800 rows are the training set; the next 600 are the validation set
train = data.iloc[:1800] 
val = data.iloc[1800:2400]
In [4]:
# Creating the "X" and "y" variables; the target, Sale Price, is dropped from "X"
X_train, X_val = train.drop('Sale Price', axis=1), val.drop('Sale Price', axis=1)
y_train, y_val = train[['Sale Price']], val[['Sale Price']] 
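As a sanity check (not in the original notebook): since the file was scaled before splitting, the training features should have means near 0 and standard deviations near 1.

# Column means should be near 0 and standard deviations near 1
print(X_train.mean().abs().max())
print(X_train.std().max())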
Linear Regression
In [5]:
# Importing models
from sklearn.linear_model import LinearRegression
In [6]:
lr = LinearRegression()
lr.fit(X_train, y_train)
Out[6]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [7]:
# Create a DataFrame pairing each feature with its fitted coefficient
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train.columns),
        list(lr.intercept_) + list(lr.coef_[0])
    ]
).transpose().set_index(0)
coeffs
Out[7]:
feature       coefficient
intercept -0.0136147
LotArea 0.0674861
OverallQual 0.214642
OverallCond 0.0954426
YearBuilt 0.16075
YearRemodAdd 0.0252969
BsmtFinSF1 0.0899544
BsmtUnfSF -0.0325298
TotalBsmtSF 0.13748
1stFlrSF 0.151673
2ndFlrSF 0.130767
GrLivArea 0.157736
FullBath -0.0207088
HalfBath 0.016989
BedroomAbvGr -0.083825
TotRmsAbvGrd 0.0804489
Fireplaces 0.0280662
GarageCars 0.0383846
GarageArea 0.0523007
WoodDeckSF 0.0208165
OpenPorchSF 0.0345543
EnclosedPorch 0.00709801
Blmngtn -3.55804e+10
Blueste -2.13297e+10
BrDale -3.68164e+10
BrkSide -6.85937e+10
ClearCr -4.44784e+10
CollgCr -1.05215e+11
Crawfor -6.70316e+10
Edwards -9.02604e+10
Gilbert -8.42928e+10
IDOTRR -6.30997e+10
MeadowV -4.0837e+10
Mitchel -7.07134e+10
Names -1.30932e+11
NoRidge -5.54553e+10
NPkVill -3.22755e+10
NriddgHt -8.42928e+10
NWAmes -7.55717e+10
OLDTown -1.00071e+11
SWISU -4.64237e+10
Sawyer -8.08431e+10
SawyerW -7.39005e+10
Somerst -8.8254e+10
StoneBr -4.78273e+10
Timber -5.66181e+10
Veenker -3.29639e+10
Bsmt Qual 0.0116689
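The implausibly large negative coefficients on the neighborhood dummy variables are a symptom of near-perfect multicollinearity: the dummies sum to (almost) a constant column, which is collinear with the intercept, so ordinary least squares cannot pin down their individual values. This is exactly the instability that the regularized models below damp down. For comparison with them, a short sketch (not in the original notebook) computes this model's validation MSE:

# Validation MSE of the unregularized regression, as a baseline for Ridge and Lasso
pred_lr = lr.predict(X_val)
print(mse(y_val, pred_lr))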
Ridge Regression
In [8]:
# Importing Ridge
from sklearn.linear_model import Ridge
In [9]:
# sklearn's Ridge alpha is the lambda in Hull's book times the number of observations (1800)
lambdas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.2, 0.4]
alphas = [1800 * lam for lam in lambdas]
mses = []
for alpha in alphas:
    # Fit Ridge at each alpha and record the validation-set MSE
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    pred = ridge.predict(X_val)
    mses.append(mse(y_val, pred))
    print(mses[-1])
0.11430912602818623
0.11438831890699627
0.11452201329565981
0.11469744977061369
0.11490557532487583
0.11553053137615601
0.1162599646587543
0.11972299505022367
0.1277372905719841
In [10]:
# Plot validation MSE against alpha
plt.plot(alphas, mses)
Out[10]:
[Plot of validation MSE against alpha for Ridge regression]
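From the printed values, the validation MSE increases monotonically with alpha over this grid, so the smallest alpha (0.01*1800 = 18) performs best here. A short sketch (not in the original notebook) reads the best grid point off programmatically:

# Grid point with the smallest validation MSE
best = int(np.argmin(mses))
print(alphas[best], mses[best])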
Lasso
In [11]:
# Import Lasso
from sklearn.linear_model import Lasso
In [12]:
# Here we produce results for alpha=0.05, which corresponds to lambda=0.1 in Hull's book
lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)
Out[12]:
Lasso(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
In [13]:
# DataFrame pairing each feature with its fitted Lasso coefficient
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train.columns),
        list(lasso.intercept_) + list(lasso.coef_)
    ]
).transpose().set_index(0)
coeffs
Out[13]:
feature       coefficient
intercept -0.0129255
LotArea 0.0442383
OverallQual 0.300162
OverallCond 0
YearBuilt 0.0542006
YearRemodAdd 0.0647997
BsmtFinSF1 0.114129
BsmtUnfSF -0
TotalBsmtSF 0.105414
1stFlrSF 0.0283594
2ndFlrSF 0
GrLivArea 0.29196
FullBath 0
HalfBath 0
BedroomAbvGr -0
TotRmsAbvGrd 0
Fireplaces 0.01876
GarageCars 0.0262154
GarageArea 0.0656378
WoodDeckSF 0
OpenPorchSF 0.000220707
EnclosedPorch -0
Blmngtn -0
Blueste -0
BrDale -0
BrkSide 0
ClearCr 0
CollgCr -0
Crawfor 0
Edwards -0
Gilbert 0
IDOTRR -0
MeadowV -0
Mitchel -0
Names -0
NoRidge 0.0143427
NPkVill -0
NriddgHt 0.0835168
NWAmes -0
OLDTown -0
SWISU -0
Sawyer -0
SawyerW -0
Somerst 0
StoneBr 0.0150803
Timber 0
Veenker 0
Bsmt Qual 0.0156601
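Unlike Ridge, Lasso drives many coefficients exactly to zero, as the table shows. A quick sketch (not in the original notebook) counts how many features survive at alpha=0.05:

# Count the features whose Lasso coefficient is non-zero
n_kept = int(np.sum(lasso.coef_ != 0))
print(n_kept, 'of', X_train.shape[1], 'features retained')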
Lasso with different levels of alpha and the resulting MSEs
In [14]:
# We now consider different lambda values; sklearn's Lasso alpha is half the corresponding lambda
lambdas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1]
alphas = [lam / 2 for lam in lambdas]
mses = []
for alpha in alphas:
    # Fit Lasso at each alpha and record the validation-set MSE
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    pred = lasso.predict(X_val)
    mses.append(mse(y_val, pred))
    print(mses[-1])
0.11386863624693959
0.11411666655734329
0.11526461827765332
0.11732406234518182
0.12020611819731855
0.12921610751419008
0.13745407671806387
In [15]:
# Plot validation MSE against alpha
plt.plot(alphas, mses)
Out[15]:
[Plot of validation MSE against alpha for Lasso]
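Rather than scanning a hand-picked grid against a single validation set, scikit-learn's LassoCV can choose alpha by k-fold cross-validation. A minimal sketch follows (not part of the original notebook, and its chosen alpha need not match the grid search above):

from sklearn.linear_model import LassoCV

# 5-fold cross-validation over an automatically generated grid of alphas
lasso_cv = LassoCV(cv=5)
lasso_cv.fit(X_train, y_train.values.ravel())  # ravel() flattens the single-column target
print(lasso_cv.alpha_)  # alpha selected by cross-validation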