# Lab 6.5.3: PCR and PLS Regression
Adapted from http://www.science.smith.edu/~jcrouser/SDS293/labs/
# imports and setup
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Display options: cap how much of a frame is printed and show floats in fixed-point.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 12)
pd.set_option('display.float_format', '{:20,.5f}'.format) # get rid of scientific notation
# Matplotlib 3.6 deprecated the bundled 'seaborn' style name in favour of
# 'seaborn-v0_8'; use the new name when it is available and fall back to the
# old one only on installations that do not ship it.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn') # pretty matplotlib plots
/tmp/ipykernel_3660/659918127.py:14: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.
plt.style.use('seaborn') # pretty matplotlib plots
# Read the Hitters data (first CSV column becomes the index) and drop rows with missing values.
hitters = pd.read_csv('../datasets/Hitters.csv', index_col=0).dropna()
hitters.index.name = 'Player'

# push salary at the end: keep the first 18 columns in place and swap the last two
column_order = list(range(18)) + [19, 18]
hitters = hitters.iloc[:, column_order]

# Overwrite each categorical column with the second indicator column from get_dummies.
hitters = hitters.assign(**{
    name: pd.get_dummies(hitters[name]).iloc[:, 1]
    for name in ('League', 'Division', 'NewLeague')
})

# Predictors are the first 19 columns; the response is the column at position 19.
X = hitters.iloc[:, :19]
y = hitters.iloc[:, 19]
## Principal Components Regression
from sklearn.cross_decomposition import PLSRegression, PLSSVD
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, scale
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[3], line 2
1 from sklearn.preprocessing import scale
----> 2 from sklearn.cross_validation import KFold, cross_val_score
3 from sklearn.model_selection import train_test_split
4 from sklearn.decomposition import PCA
ModuleNotFoundError: No module named 'sklearn.cross_validation'
# Standardise the predictors with scale(), fit a PCA with default settings on
# them, and keep the transformed data in X_pca.
pca = PCA()
X_pca = pca.fit_transform(scale(X))
# Last expression of the cell: show the transformed data as a DataFrame.
pd.DataFrame(X_pca)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[4], line 1
----> 1 pca = PCA()
2 X_pca = pca.fit_transform(scale(X))
3 pd.DataFrame(X_pca)
NameError: name 'PCA' is not defined
# 10-fold CV, with shuffle
n = len(X_pca)
# sklearn.model_selection.KFold takes the number of splits; the sample count
# is inferred from the data when the splitter is used.
k10 = KFold(n_splits=10, shuffle=True, random_state=42)

lin_reg = LinearRegression()
rmse = []

# RMSE with only the intercept (the single feature is a constant column)
score = -1 * cross_val_score(lin_reg, np.ones((n, 1)), y, cv=k10, scoring='neg_mean_squared_error').mean()
rmse.append(np.sqrt(score))

# RMSE using the first i principal components, for every available i
for i in np.arange(1, X_pca.shape[1] + 1):
    score = -1 * cross_val_score(lin_reg, X_pca[:, :i], y, cv=k10, scoring='neg_mean_squared_error').mean()
    rmse.append(np.sqrt(score))

# Position in rmse == number of components used (position 0 is the intercept-only fit).
min_rmse = np.array(rmse).argmin()

plt.plot(rmse, '-D')
plt.plot(rmse, markevery=[min_rmse], marker='X', lw=0, color='red')  # highlight the minimum
plt.xlabel('# of PC')
plt.ylabel('RMSE');
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[5], line 2
1 # 10-fold CV, with shuffle
----> 2 n = len(X_pca)
3 k10 = KFold(n, n_folds=10, shuffle=True, random_state=42)
5 lin_reg = LinearRegression()
NameError: name 'X_pca' is not defined
# Running total of the percentage of variance explained, component by component.
(pca.explained_variance_ratio_ * 100).cumsum()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[6], line 1
----> 1 np.cumsum(pca.explained_variance_ratio_*100)
NameError: name 'pca' is not defined
# validation set: hold out half of the observations for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)

# Standardise the training predictors and project them onto their principal components.
pca2 = PCA()
X_train_pca2 = pca2.fit_transform(scale(X_train))

# 10-fold CV, with shuffle
n = len(X_train_pca2)
# sklearn.model_selection.KFold takes the number of splits; the sample count
# is inferred from the data when the splitter is used.
k102 = KFold(n_splits=10, shuffle=True, random_state=1)

lin_reg2 = LinearRegression()
rmse2 = []

# RMSE with only the intercept (the single feature is a constant column)
score = -1 * cross_val_score(lin_reg2, np.ones((n, 1)), y_train, cv=k102, scoring='neg_mean_squared_error').mean()
rmse2.append(np.sqrt(score))

# RMSE using the first i principal components, for every available i
for i in np.arange(1, X_train_pca2.shape[1] + 1):
    score = -1 * cross_val_score(lin_reg2, X_train_pca2[:, :i], y_train, cv=k102, scoring='neg_mean_squared_error').mean()
    rmse2.append(np.sqrt(score))

# Position in rmse2 == number of components used (position 0 is the intercept-only fit).
min_rmse = np.array(rmse2).argmin()

plt.plot(rmse2, '-D')
plt.plot(rmse2, markevery=[min_rmse], marker='X', lw=0, color='red')  # highlight the minimum
plt.xlabel('# of PC')
plt.ylabel('RMSE')
plt.xticks([0, 5, 10, 15, 20]);
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[7], line 2
1 # validation set
----> 2 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)
4 pca2 = PCA()
5 X_train_pca2 = pca2.fit_transform(scale(X_train))
NameError: name 'train_test_split' is not defined
# Learn the standardisation and the principal directions from the training
# half only, then apply that same transformation to the test half. Refitting
# the PCA on the test data would produce components in a different basis from
# the one the regression coefficients were estimated in.
scaler3 = StandardScaler().fit(X_train)
X_train_pca3 = pca2.fit_transform(scaler3.transform(X_train))[:, :7]  # keep the first 7 components
X_test_pca3 = pca2.transform(scaler3.transform(X_test))[:, :7]

# Regress the response on the retained training components.
lin_reg3 = LinearRegression()
lin_reg3.fit(X_train_pca3, y_train)

# Test-set mean squared error (last expression, shown as the cell output).
pred_pca3 = lin_reg3.predict(X_test_pca3)
mean_squared_error(y_test, pred_pca3)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[8], line 1
----> 1 X_train_pca3 = pca2.fit_transform(scale(X_train))[:, :7]
2 X_test_pca3 = pca2.fit_transform(scale(X_test))[:, :7]
4 lin_reg3 = LinearRegression()
NameError: name 'pca2' is not defined
## Partial Least Squares
# 10-fold CV, with shuffle
n = len(X_train)
# sklearn.model_selection.KFold takes the number of splits; the sample count
# is inferred from the data when the splitter is used.
k10 = KFold(n_splits=10, shuffle=True, random_state=1)

# Candidate numbers of PLS components: 1 up to the number of predictors.
n_comps = np.arange(1, X_train.shape[1] + 1)

rmse = []
for i in n_comps:
    pls = PLSRegression(n_components=i)
    score = -1 * cross_val_score(pls, scale(X_train), y_train, cv=k10, scoring='neg_mean_squared_error').mean()
    rmse.append(np.sqrt(score))

# Position of the smallest RMSE; the matching component count is n_comps[min_rmse].
min_rmse = np.array(rmse).argmin()

# Plot against the actual component counts: rmse[0] belongs to the 1-component
# model, so plotting against the list index would shift the x-axis by one.
plt.plot(n_comps, rmse, '-D')
plt.plot(n_comps, rmse, markevery=[min_rmse], marker='X', lw=0, color='red')  # highlight the minimum
plt.xlabel('# of PC')
plt.ylabel('RMSE')
plt.xticks([0, 5, 10, 15, 20]);
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[9], line 2
1 # 10-fold CV, with shuffle
----> 2 n = len(X_train)
3 k10 = KFold(n, n_folds=10, shuffle=True, random_state=1)
5 rmse = []
NameError: name 'X_train' is not defined
# Standardise with statistics learned from the training half only, so the test
# half goes through exactly the same transformation the model was fitted on
# (scaling the test data by its own mean/std would put it on a different scale).
pls_scaler = StandardScaler().fit(X_train)

# Fit a 2-component PLS regression on the standardised training data.
pls = PLSRegression(n_components=2)
pls.fit(pls_scaler.transform(X_train), y_train)

# Test-set mean squared error (last expression, shown as the cell output).
mean_squared_error(y_test, pls.predict(pls_scaler.transform(X_test)))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[10], line 1
----> 1 pls = PLSRegression(n_components=2)
2 pls.fit(scale(X_train), y_train)
4 mean_squared_error(y_test, pls.predict(scale(X_test)))
NameError: name 'PLSRegression' is not defined