Lab 5.3: Cross-Validation and the Bootstrap#
5.3.1 The Validation Set Approach#
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 12)
pd.set_option('display.float_format', '{:20,.2f}'.format) # get rid of scientific notation
# load data
auto = pd.read_csv('../datasets/Auto.csv', na_values=['?'])
auto.dropna(axis=0, inplace=True)
auto.cylinders = auto.cylinders.astype('category')
auto.name = auto.name.astype('category')
auto.reset_index(drop=True, inplace=True)  # drop=True avoids adding the old index as a column
auto['horsepower_2'] = np.power(auto.horsepower, 2)
auto['horsepower_3'] = np.power(auto.horsepower, 3)
auto['horsepower_4'] = np.power(auto.horsepower, 4)
auto['horsepower_5'] = np.power(auto.horsepower, 5)
# Alternative: build the same power columns with sklearn's PolynomialFeatures
from sklearn.preprocessing import PolynomialFeatures
pol = PolynomialFeatures(degree=5, interaction_only=False, include_bias=False)
polf = pol.fit_transform(auto.loc[:, 'horsepower'].values.reshape(-1, 1))
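As a quick check (a sketch using the objects just defined), the sklearn expansion should reproduce the manually built power columns:

# Sanity check: polf should match the horsepower_k columns created above
manual = auto[['horsepower', 'horsepower_2', 'horsepower_3',
               'horsepower_4', 'horsepower_5']].values
print(np.allclose(polf, manual))  # expected: True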
from sklearn.model_selection import train_test_split
X, y = auto.loc[:, ['horsepower', 'horsepower_2', 'horsepower_3']], auto.mpg
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# OLS models with intercept, for polynomial degrees 1 through 3
lm1 = LinearRegression(fit_intercept=True)
lm2 = LinearRegression(fit_intercept=True)
lm3 = LinearRegression(fit_intercept=True)
lm1_fit = lm1.fit(X_train.loc[:, 'horsepower'].values.reshape(-1, 1), y_train)
lm2_fit = lm2.fit(X_train.loc[:, ['horsepower', 'horsepower_2']], y_train)
lm3_fit = lm3.fit(X_train.loc[:, ['horsepower', 'horsepower_2', 'horsepower_3']], y_train)
lm1_predict = lm1_fit.predict(X_test.loc[:, 'horsepower'].values.reshape(-1, 1))
lm2_predict = lm2_fit.predict(X_test.loc[:, ['horsepower', 'horsepower_2']])
lm3_predict = lm3_fit.predict(X_test.loc[:, ['horsepower', 'horsepower_2', 'horsepower_3']])
print('lm1 MSE:', mean_squared_error(y_test, lm1_predict))
print('lm2 MSE:', mean_squared_error(y_test, lm2_predict))
print('lm3 MSE:', mean_squared_error(y_test, lm3_predict))
lm1 MSE: 25.5738781896844
lm2 MSE: 22.218020050032855
lm3 MSE: 22.667675435534864
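The validation-set estimate depends on which observations land in each half of the split. Re-running the quadratic fit under a few arbitrary seeds (hypothetical values, chosen only for illustration) shows this variability:

# Sketch: validation MSE of the quadratic fit under different random splits,
# illustrating the variability of the validation set approach
for seed in [1, 7, 123]:  # arbitrary seeds
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.5, random_state=seed)
    fit = LinearRegression().fit(X_tr[['horsepower', 'horsepower_2']], y_tr)
    mse = mean_squared_error(y_te, fit.predict(X_te[['horsepower', 'horsepower_2']]))
    print('seed', seed, 'quadratic MSE:', mse)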
5.3.2 Leave-One-Out Cross-Validation#
from sklearn.model_selection import LeaveOneOut
X, y = auto.loc[:, ['horsepower', 'horsepower_2', 'horsepower_3', 'horsepower_4', 'horsepower_5']], auto.mpg
loocv = LeaveOneOut()
loocv.get_n_splits(X)
392
loocv_mse = []
lm = LinearRegression(fit_intercept=True)
for train_index, test_index in loocv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    lm1_fit = lm.fit(X_train.loc[:, 'horsepower'].values.reshape(-1, 1), y_train)
    lm1_predict = lm1_fit.predict(X_test.loc[:, 'horsepower'].values.reshape(-1, 1))
    loocv_mse.append(mean_squared_error(y_test, lm1_predict))
np.array(loocv_mse).mean()
24.231513517929226
# using sklearn's cross_val_score
from sklearn.model_selection import cross_val_score
lm = LinearRegression(fit_intercept=True)
cval = cross_val_score(lm,
                       auto.loc[:, 'horsepower'].values.reshape(-1, 1),
                       auto.mpg,
                       cv=len(auto),  # k = n k-fold -> LOOCV
                       n_jobs=-1,
                       scoring='neg_mean_squared_error')
-cval.mean()
24.231513517929226
# Loop over polynomial regressions of degree 1 through 5 with LOOCV
loocv_poly = {}
for i in range(1, 6):
    loocv_mse = []
    for train_index, test_index in loocv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # first i power columns of horsepower; iloc[:, 0:i] is already 2-D,
        # so no reshape is needed even when i == 1
        X_TRAIN, X_TEST = X_train.iloc[:, 0:i], X_test.iloc[:, 0:i]
        lm_fit = LinearRegression(fit_intercept=True).fit(X_TRAIN, y_train)
        loocv_mse.append(mean_squared_error(y_test, lm_fit.predict(X_TEST)))
    loocv_poly['lm' + str(i) + '_MSE'] = np.array(loocv_mse).mean()
loocv_poly
{'lm1_MSE': 24.231513517929226,
'lm2_MSE': 19.24821312448967,
'lm3_MSE': 19.33498406402931,
'lm4_MSE': 19.424430310232463,
'lm5_MSE': 19.033212486158245}
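The explicit loop above can also be written more compactly by chaining PolynomialFeatures and LinearRegression in a Pipeline and handing it to cross_val_score. A sketch, which should reproduce the loocv_poly values since OLS and LOOCV are deterministic:

# Sketch: same LOOCV polynomial comparison via a Pipeline, avoiding the
# manually built horsepower_k columns
from sklearn.pipeline import make_pipeline

hp = auto[['horsepower']].values
for d in range(1, 6):
    pipe = make_pipeline(PolynomialFeatures(degree=d, include_bias=False),
                         LinearRegression())
    scores = cross_val_score(pipe, hp, auto.mpg, cv=len(auto),
                             scoring='neg_mean_squared_error', n_jobs=-1)
    print('degree', d, 'LOOCV MSE:', -scores.mean())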
5.3.3 k-Fold Cross-Validation#
from sklearn.model_selection import KFold
X, y = auto.loc[:, ['horsepower', 'horsepower_2', 'horsepower_3', 'horsepower_4', 'horsepower_5']], auto.mpg
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Loop over polynomial regressions of degree 1 through 5 with 10-fold CV
kf_poly = {}
for i in range(1, 6):
    kf_mse = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_TRAIN, X_TEST = X_train.iloc[:, 0:i], X_test.iloc[:, 0:i]
        lm_fit = LinearRegression(fit_intercept=True).fit(X_TRAIN, y_train)
        kf_mse.append(mean_squared_error(y_test, lm_fit.predict(X_TEST)))
    kf_poly['lm' + str(i) + '_MSE'] = np.array(kf_mse).mean()
kf_poly
{'lm1_MSE': 24.199808197692477,
'lm2_MSE': 19.228636614268016,
'lm3_MSE': 19.26626534663179,
'lm4_MSE': 19.351092272836148,
'lm5_MSE': 19.023233396518712}
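For a single degree, the same KFold object can be passed straight to cross_val_score, which reproduces one entry of the loop. A sketch for the quadratic fit, which should match kf_poly['lm2_MSE'] since the folds are fixed by random_state=42:

# Sketch: 10-fold CV for the quadratic fit via cross_val_score
scores = cross_val_score(LinearRegression(),
                         X[['horsepower', 'horsepower_2']], y,
                         cv=kf, scoring='neg_mean_squared_error')
print(-scores.mean())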
5.3.4 The Bootstrap#
portfolio = pd.read_csv('../datasets/Portfolio.csv', index_col=0)
def alpha_fn(data, start_index, end_index):
    X = data['X'][start_index:end_index]
    Y = data['Y'][start_index:end_index]
    # alpha = (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))
    return ((np.var(Y) - np.cov(X, Y)[0][1]) /
            (np.var(X) + np.var(Y) - 2 * np.cov(X, Y)[0][1]))
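alpha_fn implements the variance-minimizing allocation between two assets X and Y from Section 5.2 of ISLR:

$$\alpha = \frac{\hat\sigma_Y^2 - \hat\sigma_{XY}}{\hat\sigma_X^2 + \hat\sigma_Y^2 - 2\,\hat\sigma_{XY}}$$

(Note that np.var defaults to ddof=0 while np.cov defaults to ddof=1; at n = 100 this mismatch barely moves the estimate.)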
alpha_fn(portfolio, 0, 100)
0.5766511516104116
from sklearn.utils import resample
# draw one bootstrap sample (unseeded, so the draw varies between runs)
portfolio_bs = resample(portfolio, replace=True, n_samples=100)
alpha_fn(portfolio_bs, 0, 100)
0.47059687165130554
# sklearn no longer provides a Bootstrap class, so draw resamples in a loop
bs_alpha = []
for i in range(0, 1000):
    bs_alpha.append(
        alpha_fn(resample(portfolio, replace=True, n_samples=100), 0, 100)
    )
bs_alpha = np.array(bs_alpha)
print('Bootstrapped alpha:', bs_alpha.mean())
print('SE:', bs_alpha.std())
Bootstrapped alpha: 0.5809082415607475
SE: 0.09294250053349355
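The same draws also yield a percentile confidence interval for alpha; a minimal sketch:

# Sketch: 95% percentile interval from the bootstrap draws above
print(np.percentile(bs_alpha, [2.5, 97.5]))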
def boot_fn(data, start_index, end_index):
    m = LinearRegression(fit_intercept=True).fit(
        data['horsepower'][start_index:end_index].values.reshape(-1, 1),
        data['mpg'][start_index:end_index]
    )
    return m.intercept_, m.coef_
boot_fn(auto, 0, 392)
(39.93586102117047, array([-0.15784473]))
boot_fn(resample(auto, replace=True, n_samples=392), 0, 392)
(38.78832788237536, array([-0.14804025]))
bs_boot = {'t1': [], 't2': []}
for i in range(0, 1000):
    # one resample per iteration, so intercept and slope come from the same draw
    bs_sample = resample(auto, replace=True, n_samples=392)
    intercept, coef = boot_fn(bs_sample, 0, 392)
    bs_boot['t1'].append(intercept)
    bs_boot['t2'].append(coef[0])
t1_es = np.array(bs_boot['t1']).mean()
t1_se = np.array(bs_boot['t1']).std()
t2_es = np.array(bs_boot['t2']).mean()
t2_se = np.array(bs_boot['t2']).std()
print('t1 bs estimate & se:', t1_es, t1_se)
print('t2 bs estimate & se:', t2_es, t2_se)
t1 bs estimate & se: 39.98982301627503 0.8399461113100991
t2 bs estimate & se: -0.15844201311684047 0.007444334463848632
import statsmodels.formula.api as sm
ols = sm.ols('mpg ~ horsepower', data=auto).fit()
ols.summary().tables[1]
|            | coef    | std err | t       | P>\|t\| | [0.025 | 0.975] |
|------------|---------|---------|---------|---------|--------|--------|
| Intercept  | 39.9359 | 0.717   | 55.660  | 0.000   | 38.525 | 41.347 |
| horsepower | -0.1578 | 0.006   | -24.489 | 0.000   | -0.171 | -0.145 |
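The bootstrap SEs (about 0.84 and 0.0074) are larger than the formula-based SEs in the table above (0.717 and 0.006). The formulas assume the linear model is correct and rely on an estimate of the noise variance; the bootstrap makes neither assumption, so ISLR argues it is likely the more accurate of the two here. For a direct comparison:

# Side-by-side comparison of bootstrap and formula-based standard errors
print('bootstrap SEs:', t1_se, t2_se)
print('formula SEs:  ', ols.bse.values)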
def boot_fn2(data, start_index, end_index):
    m = LinearRegression(fit_intercept=True).fit(
        data[['horsepower', 'horsepower_2']][start_index:end_index],
        data['mpg'][start_index:end_index]
    )
    return m.intercept_, m.coef_
bs_boot2 = {'t1': [], 't2': [], 't3': []}
for i in range(0, 1000):
    # again, record all three estimates from the same resample
    bs_sample = resample(auto, replace=True, n_samples=392)
    intercept, coefs = boot_fn2(bs_sample, 0, 392)
    bs_boot2['t1'].append(intercept)
    bs_boot2['t2'].append(coefs[0])
    bs_boot2['t3'].append(coefs[1])
t1_es = np.array(bs_boot2['t1']).mean()
t1_se = np.array(bs_boot2['t1']).std()
t2_es = np.array(bs_boot2['t2']).mean()
t2_se = np.array(bs_boot2['t2']).std()
t3_es = np.array(bs_boot2['t3']).mean()
t3_se = np.array(bs_boot2['t3']).std()
print('t1 bs estimate & se:', t1_es, t1_se)
print('t2 bs estimate & se:', t2_es, t2_se)
print('t3 bs estimate & se:', t3_es, t3_se)
t1 bs estimate & se: 56.8883390759425 2.1408687186427504
t2 bs estimate & se: -0.4676938954016412 0.03291181969884338
t3 bs estimate & se: 0.0012281216675776307 0.00011961428506476293
ols2 = sm.ols('mpg ~ horsepower + horsepower_2', data=auto).fit()
ols2.summary().tables[1]
|              | coef    | std err | t       | P>\|t\| | [0.025 | 0.975] |
|--------------|---------|---------|---------|---------|--------|--------|
| Intercept    | 56.9001 | 1.800   | 31.604  | 0.000   | 53.360 | 60.440 |
| horsepower   | -0.4662 | 0.031   | -14.978 | 0.000   | -0.527 | -0.405 |
| horsepower_2 | 0.0012  | 0.000   | 10.080  | 0.000   | 0.001  | 0.001  |
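For the quadratic model the two sets of standard errors agree much more closely, since the quadratic fit captures the curvature in the mpg-horsepower relationship:

# For the quadratic fit, bootstrap and formula-based SEs nearly coincide
print('bootstrap SEs:', t1_se, t2_se, t3_se)
print('formula SEs:  ', ols2.bse.values)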