# Lab 6.5.3: PCR and PLS Regression

Adapted from http://www.science.smith.edu/~jcrouser/SDS293/labs/

# imports and setup
%matplotlib inline

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide pandas display settings.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 12)
pd.set_option('display.float_format', '{:20,.5f}'.format) # get rid of scientific notation

# Matplotlib 3.6 deprecated the bare 'seaborn' style name in favour of
# 'seaborn-v0_8'. Use whichever name this Matplotlib installation ships so the
# cell runs without a deprecation warning on new versions and still works on old ones.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style) # pretty matplotlib plots
/tmp/ipykernel_3660/659918127.py:14: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.
  plt.style.use('seaborn') # pretty matplotlib plots
# Read the Hitters data (first CSV column becomes the index) and discard
# every row that contains a missing value.
hitters = pd.read_csv('../datasets/Hitters.csv', index_col=0).dropna()
hitters.index.name = 'Player'

# Keep columns 0-17 in place and swap the last two (positions 18 and 19).
column_order = list(range(18)) + [19, 18]
hitters = hitters.iloc[:, column_order] # push salary at the end

# Replace each categorical column with the second column of its dummy encoding.
for categorical in ('League', 'Division', 'NewLeague'):
    hitters[categorical] = pd.get_dummies(hitters[categorical]).iloc[:, 1]

# The first 19 columns are the predictors; the final column is the response.
n_predictors = 19
X = hitters.iloc[:, :n_predictors]
y = hitters.iloc[:, n_predictors]

## Principal Components Regression

# `sklearn.cross_validation` was removed from scikit-learn; KFold and
# cross_val_score are provided by `sklearn.model_selection`.
from sklearn.cross_decomposition import PLSRegression, PLSSVD
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import scale
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[3], line 2
      1 from sklearn.preprocessing import scale 
----> 2 from sklearn.cross_validation import KFold, cross_val_score
      3 from sklearn.model_selection import train_test_split
      4 from sklearn.decomposition import PCA

ModuleNotFoundError: No module named 'sklearn.cross_validation'
# Standardize the predictors, then project them onto all principal components.
X_std = scale(X)
pca = PCA()
X_pca = pca.fit_transform(X_std)

# Show the component scores as a table.
pd.DataFrame(data=X_pca)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 pca = PCA()
      2 X_pca = pca.fit_transform(scale(X))
      3 pd.DataFrame(X_pca)

NameError: name 'PCA' is not defined
# 10-fold CV, with shuffle.
# KFold from sklearn.model_selection takes the number of splits; the sample
# count is inferred from the data passed to cross_val_score.
n = len(X_pca)
k10 = KFold(n_splits=10, shuffle=True, random_state=42)

lin_reg = LinearRegression()
rmse = []

# RMSE with only the intercept: the single constant column adds no information,
# so this is the baseline against which the component models are compared.
score = -1 * cross_val_score(lin_reg, np.ones((n,1)), y, cv=k10, scoring='neg_mean_squared_error').mean()
rmse.append(np.sqrt(score))

# RMSE using the first i principal components, for every available component count
# (derived from the data instead of a hard-coded 19).
for i in np.arange(1, X_pca.shape[1] + 1):
    score = -1 * cross_val_score(lin_reg, X_pca[:,:i], y, cv=k10, scoring='neg_mean_squared_error').mean()
    rmse.append(np.sqrt(score))

# rmse[0] is the intercept-only model, so a list index equals the number of
# components used and argmin directly gives the best component count.
min_rmse = np.array(rmse).argmin()

plt.plot(rmse, '-D')
plt.plot(rmse, markevery=[min_rmse], marker='X', lw=0, color='red')
plt.xlabel('# of PC')
plt.ylabel('RMSE');
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 2
      1 # 10-fold CV, with shuffle
----> 2 n = len(X_pca)
      3 k10 = KFold(n, n_folds=10, shuffle=True, random_state=42)
      5 lin_reg = LinearRegression()

NameError: name 'X_pca' is not defined
# Running total of the percentage of variance explained by successive components.
variance_pct = pca.explained_variance_ratio_ * 100
np.cumsum(variance_pct)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 np.cumsum(pca.explained_variance_ratio_*100)

NameError: name 'pca' is not defined
# validation set: hold out half of the observations for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)

# Fit the PCA on the standardized training predictors only.
pca2 = PCA()
X_train_pca2 = pca2.fit_transform(scale(X_train))

# 10-fold CV, with shuffle.
# KFold from sklearn.model_selection takes the number of splits; the sample
# count is inferred from the data passed to cross_val_score.
n = len(X_train_pca2)
k102 = KFold(n_splits=10, shuffle=True, random_state=1)

lin_reg2 = LinearRegression()
rmse2 = []

# RMSE with only the intercept (constant column as the sole predictor)
score = -1 * cross_val_score(lin_reg2, np.ones((n,1)), y_train, cv=k102, scoring='neg_mean_squared_error').mean()
rmse2.append(np.sqrt(score))

# RMSE using the first i principal components, for every available component count
# (derived from the data instead of a hard-coded 19).
for i in np.arange(1, X_train_pca2.shape[1] + 1):
    score = -1 * cross_val_score(lin_reg2, X_train_pca2[:,:i], y_train, cv=k102, scoring='neg_mean_squared_error').mean()
    rmse2.append(np.sqrt(score))

# rmse2[0] is the intercept-only model, so a list index equals the number of components.
min_rmse = np.array(rmse2).argmin()

plt.plot(rmse2, '-D')
plt.plot(rmse2, markevery=[min_rmse], marker='X', lw=0, color='red')
plt.xlabel('# of PC')
plt.ylabel('RMSE')
plt.xticks([0, 5, 10, 15, 20]);
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 2
      1 # validation set
----> 2 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)
      4 pca2 = PCA()
      5 X_train_pca2 = pca2.fit_transform(scale(X_train))

NameError: name 'train_test_split' is not defined
# Number of leading principal components used as regressors.
n_pcs = 7  # presumably picked from the training-set CV curve — confirm

# Fit the PCA on the training predictors, then project the test predictors onto
# those SAME components. Calling fit_transform on the test set would re-estimate
# the components from test data, so the test scores would no longer live in the
# basis the regression was trained on.
X_train_pca3 = pca2.fit_transform(scale(X_train))[:, :n_pcs]
X_test_pca3 = pca2.transform(scale(X_test))[:, :n_pcs]
# NOTE(review): scale(X_test) standardizes with the test set's own mean/std;
# kept to match the rest of this lab.

lin_reg3 = LinearRegression()
lin_reg3.fit(X_train_pca3, y_train)
pred_pca3 = lin_reg3.predict(X_test_pca3)

# Test-set mean squared error of the principal components regression.
mean_squared_error(y_test, pred_pca3)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 X_train_pca3 = pca2.fit_transform(scale(X_train))[:, :7]
      2 X_test_pca3 = pca2.fit_transform(scale(X_test))[:, :7]
      4 lin_reg3 = LinearRegression()

NameError: name 'pca2' is not defined

## Partial Least Squares

# 10-fold CV, with shuffle.
# KFold from sklearn.model_selection takes the number of splits; the sample
# count is inferred from the data passed to cross_val_score.
n = len(X_train)
k10 = KFold(n_splits=10, shuffle=True, random_state=1)

# Standardize once; the scaled training predictors are the same for every model.
X_train_scaled = scale(X_train)

# Candidate numbers of PLS components: 1 up to the number of predictors.
component_counts = list(range(1, X_train.shape[1] + 1))

rmse = []

for i in component_counts:
    pls = PLSRegression(n_components=i)
    score = -1 * cross_val_score(pls, X_train_scaled, y_train, cv=k10, scoring='neg_mean_squared_error').mean()
    rmse.append(np.sqrt(score))

# Position of the smallest RMSE within `rmse` (the component count is this index + 1).
min_rmse = np.array(rmse).argmin()

# Plot against the actual component counts: there is no intercept-only entry
# here, so plotting against the bare list index would shift the curve by one.
plt.plot(component_counts, rmse, '-D')
plt.plot(component_counts, rmse, markevery=[min_rmse], marker='X', lw=0, color='red')
plt.xlabel('# of PLS components')
plt.ylabel('RMSE')
plt.xticks([0, 5, 10, 15, 20]);
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 2
      1 # 10-fold CV, with shuffle
----> 2 n = len(X_train)
      3 k10 = KFold(n, n_folds=10, shuffle=True, random_state=1)
      5 rmse = []

NameError: name 'X_train' is not defined
# Fit a two-component PLS model on the standardized training predictors.
pls = PLSRegression(n_components=2).fit(scale(X_train), y_train)

# Predict on the standardized test predictors and report the test-set MSE.
pls_test_pred = pls.predict(scale(X_test))
mean_squared_error(y_test, pls_test_pred)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 pls = PLSRegression(n_components=2)
      2 pls.fit(scale(X_train), y_train)
      4 mean_squared_error(y_test, pls.predict(scale(X_test)))

NameError: name 'PLSRegression' is not defined