# Importing necessary packages
import pandas as pd # python's data handling package
import numpy as np # python's scientific computing package
import matplotlib.pyplot as plt # python's plotting package
from sklearn.metrics import mean_squared_error as mse
# Both features and target have already been scaled: mean = 0; SD = 1
data = pd.read_csv('Houseprice_data_scaled.csv')

# First 1800 rows are the training set, the next 600 the validation set,
# and the final 508 the test set (2908 rows in total).
TRAIN_END = 1800
VAL_END = TRAIN_END + 600
TEST_END = VAL_END + 508

train = data.iloc[:TRAIN_END]
val = data.iloc[TRAIN_END:VAL_END]
test = data.iloc[VAL_END:TEST_END]

# Creating the "X" (features) and "y" (target) variables.
# 'Sale Price' is the target, so it is dropped from "X".
X_train, X_val, X_test = (df.drop('Sale Price', axis=1) for df in (train, val, test))
y_train, y_val, y_test = (df[['Sale Price']] for df in (train, val, test))
# Importing models
from sklearn.linear_model import LinearRegression

# Fit a plain (unregularized) linear regression on the training data and
# report its mean squared error on the training and validation sets.
lr = LinearRegression()
lr.fit(X_train, y_train)
for features, target in ((X_train, y_train), (X_val, y_val)):
    pred = lr.predict(features)
    print(mse(target, pred))
# Table pairing each feature name (plus the intercept) with the
# coefficient the fitted linear regression assigned to it.
names = ['intercept'] + list(X_train.columns)
values = list(lr.intercept_) + list(lr.coef_[0])
coeffs = pd.DataFrame([names, values]).T.set_index(0)
coeffs
# Importing Ridge
from sklearn.linear_model import Ridge

# The alpha used by Python's Ridge should be the lambda in Hull's book
# times the number of observations in the training set.
lambdas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.2, 0.6, 1.0]
alphas = [lam * len(X_train) for lam in lambdas]  # len(X_train) == 1800 here

# Fit one ridge model per alpha and record its validation-set MSE.
mses = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    pred = ridge.predict(X_val)
    error = mse(y_val, pred)  # compute once; was computed twice per iteration
    mses.append(error)
    print(error)
plt.plot(alphas, mses)
# Import Lasso
from sklearn.linear_model import Lasso

# Results for alpha=0.05, which corresponds to lambda=0.1 in Hull's book
lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)

# Table pairing each feature name (plus the intercept) with its
# lasso coefficient; zeroed entries were dropped by the L1 penalty.
names = ['intercept'] + list(X_train.columns)
values = list(lasso.intercept_) + list(lasso.coef_)
coeffs = pd.DataFrame([names, values]).T.set_index(0)
coeffs
# We now consider different lambda values. Python's Lasso alpha is
# Hull's lambda divided by 2.
lambdas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1]
alphas = [lam / 2 for lam in lambdas]

# Fit one lasso model per alpha and record its validation-set MSE.
mses = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    pred = lasso.predict(X_val)
    error = mse(y_val, pred)  # compute once; was computed twice per iteration
    mses.append(error)
    print(error)
plt.plot(alphas, mses)
# Calculate the test-set MSE for Hull's lambda = 0.04 and lambda = 0.1.
# Python's Lasso alpha is Hull's lambda divided by 2. The two evaluations
# were copy-pasted in the original; a loop prints them in the same order.
for hull_lambda in (0.04, 0.1):
    lasso = Lasso(alpha=hull_lambda / 2)
    lasso.fit(X_train, y_train)
    pred = lasso.predict(X_test)
    print(mse(y_test, pred))