from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=f)  # f = fraction held out for testing, e.g. 0.2
import sklearn.model_selection as skms
x_train, x_test, y_train, y_test = skms.train_test_split(x, y, test_size=f)  # use the skms alias defined above
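A minimal runnable sketch of a concrete split (made-up data; random_state fixes the shuffle so the split is reproducible):
import numpy as np
from sklearn.model_selection import train_test_split
x = np.arange(20).reshape(10, 2)  # 10 samples, 2 features
y = np.arange(10)
# hold out 20% of the rows; random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
print(x_train.shape, x_test.shape)  # (8, 2) (2, 2)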
OLS (ordinary least squares)
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(x, y)
reg.predict(x_dash)  # make predictions for new inputs x_dash
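As a quick sanity check, a sketch with made-up data; the fitted slope and intercept are exposed as coef_ and intercept_:
import numpy as np
from sklearn import linear_model
x = np.array([[1.0], [2.0], [3.0], [4.0]])
y = np.array([3.1, 4.9, 7.2, 8.8])  # roughly y = 2x + 1
reg = linear_model.LinearRegression()
reg.fit(x, y)
print(reg.coef_, reg.intercept_)       # slope near 2, intercept near 1
print(reg.predict(np.array([[5.0]])))  # prediction for a new point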
Ridge Regression
Use when predictors are highly correlated: an L2 penalty is imposed to shrink the coefficients.
alpha controls the amount of shrinkage (larger alpha means stronger shrinkage)
from sklearn import linear_model
reg = linear_model.Ridge(alpha=n)  # n = penalty strength, e.g. 0.5
reg.fit(x, y)
reg.predict(x_dash)
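In practice alpha is usually tuned rather than hand-picked. One option (a sketch, assuming x and y are already defined) is RidgeCV, which selects alpha from a candidate grid by cross-validation:
from sklearn import linear_model
reg = linear_model.RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0])
reg.fit(x, y)
print(reg.alpha_)  # the alpha chosen by cross-validation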
LASSO Regression (Least Absolute Shrinkage and Selection Operator)
An L1 penalty that can shrink some coefficients exactly to zero, so it also performs feature selection.
from sklearn import linear_model
reg = linear_model.Lasso(alpha=n)  # n = penalty strength, e.g. 0.1
reg.fit(x, y)
reg.predict(x_dash)
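A small sketch (made-up data) of the sparsity effect: with a large enough alpha, Lasso drives the weak coefficients exactly to zero:
import numpy as np
from sklearn import linear_model
rng = np.random.RandomState(0)
x = rng.randn(50, 5)
y = 3 * x[:, 0] + 0.01 * x[:, 1] + 0.1 * rng.randn(50)  # only feature 0 really matters
reg = linear_model.Lasso(alpha=0.5)
reg.fit(x, y)
print(reg.coef_)  # the weak features come out as exact zeros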
Regression metrics (measuring the performance of the model)
MAE--mean absolute error (average magnitude of the errors, ignoring their sign)
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true, y_pred)
RMSE--root mean squared error (square root of the MSE; penalizes large errors more heavily than MAE)
import numpy as np
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_true,y_pred))
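Newer scikit-learn can return the root directly, so the manual sqrt is optional (the version cutoffs below are assumptions, check your installed release):
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(y_true, y_pred, squared=False)  # roughly sklearn 0.22-1.5
from sklearn.metrics import root_mean_squared_error
rmse = root_mean_squared_error(y_true, y_pred)  # sklearn >= 1.4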
R2 score--coefficient of determination: the share of variance in y_true explained by the predictions (1.0 is a perfect fit)
from sklearn.metrics import r2_score
r2_score(y_true, y_pred)
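A worked toy example (made-up numbers) tying the three metrics together:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_true = [3.0, 2.0, 4.0]
y_pred = [2.5, 2.0, 5.0]
print(mean_absolute_error(y_true, y_pred))          # 0.5
print(np.sqrt(mean_squared_error(y_true, y_pred)))  # ~0.6455
print(r2_score(y_true, y_pred))                     # 0.375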
Exercise
1 - Import the "boston" dataset from the sklearn package. Fit a linear regression by OLS, Ridge, and Lasso, and compute the 10-fold cross-validation RMSE for each method.
The advantage of k-fold CV is that all observations are used for both training and validation, and each observation is used for validation exactly once. Hint: use the KFold class from sklearn.model_selection.
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.datasets import load_boston  # note: load_boston was removed in scikit-learn >= 1.2
import numpy as np
boston = load_boston()
x = boston.data
y = boston.target
#========================= OLS =============================
print('-----------------------------------------------')
linreg = LinearRegression()
linreg.fit(x, y)
p = linreg.predict(x)  # in-sample predictions; the CV below gives an honest error estimate
# Compute RMSE using 10-fold cross-validation
kf = KFold(n_splits=10)
xval_err = 0
for train, test in kf.split(x):
    linreg.fit(x[train], y[train])
    p = linreg.predict(x[test])
    e = p - y[test]
    xval_err += np.dot(e, e)  # accumulate squared errors across folds
rmse_10cv = np.sqrt(xval_err / len(x))
print('OLS RMSE on 10-fold CV: %.4f' % rmse_10cv)
#======================= Ridge =============================
print('-----------------------------------------------')
ridge = Ridge(fit_intercept=True, alpha=0.5)
ridge.fit(x, y)
p = ridge.predict(x)
# Compute RMSE using 10-fold Cross-validation
kf = KFold(n_splits=10)
xval_err = 0
for train, test in kf.split(x):
    ridge.fit(x[train], y[train])
    p = ridge.predict(x[test])
    e = p - y[test]
    xval_err += np.dot(e, e)
rmse_10cv = np.sqrt(xval_err / len(x))
print('Ridge RMSE on 10-fold CV: %.4f' % rmse_10cv)
#======================== Lasso ============================
print('-----------------------------------------------')
lasso = Lasso(fit_intercept=True, alpha=0.1)
lasso.fit(x, y)
p = lasso.predict(x)
# Compute RMSE using 10-fold cross-validation
kf = KFold(n_splits=10)
xval_err = 0
for train, test in kf.split(x):
    lasso.fit(x[train], y[train])
    p = lasso.predict(x[test])  # predict with the lasso model
    e = p - y[test]
    xval_err += np.dot(e, e)
rmse_10cv = np.sqrt(xval_err / len(x))
print('Lasso RMSE on 10-fold CV: %.4f' % rmse_10cv)
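For reference, a more compact version of the same comparison using cross_val_score (a sketch; it averages per-fold MSE rather than pooling squared errors, so the numbers can differ slightly from the loops above):
from sklearn.model_selection import cross_val_score
for name, model in [('OLS', linreg), ('Ridge', ridge), ('Lasso', lasso)]:
    mse = -cross_val_score(model, x, y, cv=10, scoring='neg_mean_squared_error')
    print('%s RMSE on 10-fold CV: %.4f' % (name, np.sqrt(mse.mean())))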