Lab 7.8: Non-linear Modeling#

# imports and setup
%matplotlib inline

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 12)
pd.set_option('display.max_columns', 12)
pd.set_option('display.float_format', '{:20,.5f}'.format) # get rid of scientific notation

plt.style.use('seaborn') # pretty matplotlib plots
/tmp/ipykernel_3689/1373071590.py:14: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.
  plt.style.use('seaborn') # pretty matplotlib plots

7.8.1 Polynomial Regression and Step Functions#

wage = pd.read_csv('../datasets/Wage.csv', index_col=0)
wage.iloc[:, 2:10] = wage.iloc[:, 2:10].apply(pd.Categorical)
from sklearn.preprocessing import PolynomialFeatures

pol = PolynomialFeatures(degree=5, interaction_only=False, include_bias=False)
polf = pol.fit_transform(wage.loc[:, 'age'].values.reshape(-1, 1))

wage['age_2'] = polf[:,1]
wage['age_3'] = polf[:,2]
wage['age_4'] = polf[:,3]
wage['age_5'] = polf[:,4]

wage['wage_250'] = (wage.wage > 250).map({True: 1, False: 0})

X = wage.loc[:, ['age', 'age_2', 'age_3', 'age_4']]
y = wage.wage
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression(fit_intercept=True)
lin_reg.fit(X, y)
lin_reg.intercept_, lin_reg.coef_
(-184.1541797734385,
 array([ 2.12455205e+01, -5.63859313e-01,  6.81068771e-03, -3.20383037e-05]))
# Easy to plot higher polynomial order regressions from seaborn

sns.regplot(x='age', y='wage', data=wage, order=4, 
            scatter_kws={'alpha': 0.2, 'color': 'gray', 'facecolor': None});
../../_images/493ecfdac4ec3a2b73dbc33bb84ab6992314d4f4129ba34e41c9e44898995f40.png
# ANOVA

import statsmodels.api as sm
from statsmodels.formula.api import ols

lin_reg_sm = ols(formula='wage ~ age + age_2 + age_3 + age_4 + age_5', data=wage).fit()
anova_sm = sm.stats.anova_lm(lin_reg_sm, typ=1) # Type 1 for compatibility with the book

anova_sm
df sum_sq mean_sq F PR(>F)
age 1.00000 199,869.66497 199,869.66497 125.44432 0.00000
age_2 1.00000 228,786.01013 228,786.01013 143.59311 0.00000
age_3 1.00000 15,755.69366 15,755.69366 9.88876 0.00168
age_4 1.00000 6,070.15212 6,070.15212 3.80981 0.05105
age_5 1.00000 1,282.56302 1,282.56302 0.80498 0.36968
Residual 2,994.00000 4,770,321.68581 1,593.29382 NaN NaN
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(fit_intercept=True, C=1e9)
log_reg.fit(X, wage.wage_250)

log_reg.intercept_, log_reg.coef_
(array([-5.64988424e-05]),
 array([[-1.12834483e-03, -1.60767974e-02,  5.10936280e-04,
         -4.28423096e-06]]))
# only way to get R compatible results from logistic regression? why?

from statsmodels.formula.api import glm

lr_sm = glm(formula='wage_250 ~ age + age_2 + age_3 + age_4', 
            data=wage,
            family=sm.families.Binomial(sm.families.links.logit)).fit()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[8], line 7
      1 # only way to get R compatible results from logistic regression? why?
      3 from statsmodels.formula.api import glm
      5 lr_sm = glm(formula='wage_250 ~ age + age_2 + age_3 + age_4', 
      6             data=wage,
----> 7             family=sm.families.Binomial(sm.families.links.logit)).fit()

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/statsmodels/genmod/families/family.py:932, in Binomial.__init__(self, link, check_link)
    929 self.n = 1
    930 # overwritten by initialize if needed but always used to initialize
    931 # variance since endog is assumed/forced to be (0,1)
--> 932 super(Binomial, self).__init__(
    933     link=link,
    934     variance=V.Binomial(n=self.n),
    935     check_link=check_link
    936     )

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/statsmodels/genmod/families/family.py:94, in Family.__init__(self, link, variance, check_link)
     89 if inspect.isclass(link):
     90     warnmssg = (
     91         "Calling Family(..) with a link class is not allowed. Use an "
     92         "instance of a link class instead."
     93     )
---> 94     raise TypeError(warnmssg)
     96 self.link = link
     97 self.variance = variance

TypeError: Calling Family(..) with a link class is not allowed. Use an instance of a link class instead.
age_grid = np.arange(wage.age.min(), wage.age.max()).reshape(-1,1)

X_test = pd.DataFrame(PolynomialFeatures(4, include_bias=True).fit_transform(age_grid),
                      columns=['Intercept', 'age', 'age_2', 'age_3', 'age_4'])

log_reg_pred = lr_sm.predict(X_test)

plt.scatter(wage.age, wage.wage_250/5, marker='|', color='gray', alpha=0.5)
plt.plot(age_grid, log_reg_pred);
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 6
      1 age_grid = np.arange(wage.age.min(), wage.age.max()).reshape(-1,1)
      3 X_test = pd.DataFrame(PolynomialFeatures(4, include_bias=True).fit_transform(age_grid),
      4                       columns=['Intercept', 'age', 'age_2', 'age_3', 'age_4'])
----> 6 log_reg_pred = lr_sm.predict(X_test)
      8 plt.scatter(wage.age, wage.wage_250/5, marker='|', color='gray', alpha=0.5)
      9 plt.plot(age_grid, log_reg_pred);

NameError: name 'lr_sm' is not defined
age_step, bins = pd.cut(wage.age, 4, retbins=True, right=True)
bins
array([17.938, 33.5  , 49.   , 64.5  , 80.   ])
wage_step = pd.concat([wage.age, age_step, wage.wage, wage.wage_250], 
                      keys=['age', 'age_step', 'wage', 'wage_250'], 
                      axis=1)


X = pd.get_dummies(wage_step.age_step)
X = X.drop(X.columns[0], axis=1)
y = wage_step.wage

lr_step = LinearRegression(fit_intercept=True)

lr_step.fit(X, y)
lr_step.intercept_, lr_step.coef_
(94.15839203351902, array([24.05349138, 23.664559  ,  7.6405917 ]))
age_grid_bins = np.digitize(age_grid.ravel(), bins)

X_test = pd.get_dummies(age_grid_bins).drop(1, axis=1)

lin_pred = lr_step.predict(X_test)

plt.scatter(wage_step.age, wage.wage, marker='o', color='gray', alpha=0.5, facecolor='None')
plt.plot(age_grid, lin_pred);
../../_images/3d90f28243babcc6dcdb0d49db0143ba684d5cfd51f71f504c3a74887635f001.png
from statsmodels.formula.api import GLM

lr_sm = GLM(wage_step.wage_250, sm.add_constant(X),
            family=sm.families.Binomial(sm.families.links.logit)).fit()

log_pred = lr_sm.predict(sm.add_constant(X_test))

plt.scatter(wage_step.age, wage_step.wage_250/5, marker='|', color='gray', alpha=0.5)
plt.plot(age_grid, log_pred);
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[13], line 1
----> 1 from statsmodels.formula.api import GLM
      3 lr_sm = GLM(wage_step.wage_250, sm.add_constant(X),
      4             family=sm.families.Binomial(sm.families.links.logit)).fit()
      6 log_pred = lr_sm.predict(sm.add_constant(X_test))

ImportError: cannot import name 'GLM' from 'statsmodels.formula.api' (/opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/statsmodels/formula/api.py)

7.8.2 Splines#

Cubic Spline with specified knots

from patsy import dmatrix

age_spline1 = dmatrix('bs(age, knots=(25,40,60), degree=3, include_intercept=False)',
                     data=wage,
                     return_type='dataframe')

age_grid_spline1 = dmatrix('bs(age_grid, knots=(25,40,60), degree=3, include_intercept=False)',
                           {'age_grid': age_grid},
                           return_type='dataframe')

spline_reg1 = LinearRegression(fit_intercept=False)
spline_reg1.fit(age_spline1, wage.wage)

spline_pred1 = spline_reg1.predict(age_grid_spline1)

plt.scatter(wage.age, wage.wage, marker='o', color='gray', alpha=0.5, facecolor='None')
plt.plot(age_grid, spline_pred1);
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[14], line 14
     11 spline_reg1 = LinearRegression(fit_intercept=False)
     12 spline_reg1.fit(age_spline1, wage.wage)
---> 14 spline_pred1 = spline_reg1.predict(age_grid_spline1)
     16 plt.scatter(wage.age, wage.wage, marker='o', color='gray', alpha=0.5, facecolor='None')
     17 plt.plot(age_grid, spline_pred1);

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/linear_model/_base.py:386, in LinearModel.predict(self, X)
    372 def predict(self, X):
    373     """
    374     Predict using the linear model.
    375 
   (...)
    384         Returns predicted values.
    385     """
--> 386     return self._decision_function(X)

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/linear_model/_base.py:369, in LinearModel._decision_function(self, X)
    366 def _decision_function(self, X):
    367     check_is_fitted(self)
--> 369     X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], reset=False)
    370     return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/base.py:580, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
    509 def _validate_data(
    510     self,
    511     X="no_validation",
   (...)
    516     **check_params,
    517 ):
    518     """Validate input data and set or check the `n_features_in_` attribute.
    519 
    520     Parameters
   (...)
    578         validated.
    579     """
--> 580     self._check_feature_names(X, reset=reset)
    582     if y is None and self._get_tags()["requires_y"]:
    583         raise ValueError(
    584             f"This {self.__class__.__name__} estimator "
    585             "requires y to be passed, but the target y is None."
    586         )

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/base.py:507, in BaseEstimator._check_feature_names(self, X, reset)
    502 if not missing_names and not unexpected_names:
    503     message += (
    504         "Feature names must be in the same order as they were in fit.\n"
    505     )
--> 507 raise ValueError(message)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- bs(age_grid, knots=(25, 40, 60), degree=3, include_intercept=False)[0]
- bs(age_grid, knots=(25, 40, 60), degree=3, include_intercept=False)[1]
- bs(age_grid, knots=(25, 40, 60), degree=3, include_intercept=False)[2]
- bs(age_grid, knots=(25, 40, 60), degree=3, include_intercept=False)[3]
- bs(age_grid, knots=(25, 40, 60), degree=3, include_intercept=False)[4]
- ...
Feature names seen at fit time, yet now missing:
- bs(age, knots=(25, 40, 60), degree=3, include_intercept=False)[0]
- bs(age, knots=(25, 40, 60), degree=3, include_intercept=False)[1]
- bs(age, knots=(25, 40, 60), degree=3, include_intercept=False)[2]
- bs(age, knots=(25, 40, 60), degree=3, include_intercept=False)[3]
- bs(age, knots=(25, 40, 60), degree=3, include_intercept=False)[4]
- ...

Cubic spline with specified degrees of freedom

age_spline2 = dmatrix('bs(age, df=6, include_intercept=False)',
                     data=wage,
                     return_type='dataframe')

age_grid_spline2 = dmatrix('bs(age_grid, df=6, include_intercept=False)',
                           {'age_grid': age_grid},
                           return_type='dataframe')

spline_reg2 = LinearRegression(fit_intercept=False)
spline_reg2.fit(age_spline2, wage.wage)

spline_pred2 = spline_reg2.predict(age_grid_spline2)

plt.scatter(wage.age, wage.wage, marker='o', color='gray', alpha=0.5, facecolor='None')
plt.plot(age_grid, spline_pred2);
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[15], line 12
      9 spline_reg2 = LinearRegression(fit_intercept=False)
     10 spline_reg2.fit(age_spline2, wage.wage)
---> 12 spline_pred2 = spline_reg2.predict(age_grid_spline2)
     14 plt.scatter(wage.age, wage.wage, marker='o', color='gray', alpha=0.5, facecolor='None')
     15 plt.plot(age_grid, spline_pred2);

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/linear_model/_base.py:386, in LinearModel.predict(self, X)
    372 def predict(self, X):
    373     """
    374     Predict using the linear model.
    375 
   (...)
    384         Returns predicted values.
    385     """
--> 386     return self._decision_function(X)

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/linear_model/_base.py:369, in LinearModel._decision_function(self, X)
    366 def _decision_function(self, X):
    367     check_is_fitted(self)
--> 369     X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], reset=False)
    370     return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/base.py:580, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
    509 def _validate_data(
    510     self,
    511     X="no_validation",
   (...)
    516     **check_params,
    517 ):
    518     """Validate input data and set or check the `n_features_in_` attribute.
    519 
    520     Parameters
   (...)
    578         validated.
    579     """
--> 580     self._check_feature_names(X, reset=reset)
    582     if y is None and self._get_tags()["requires_y"]:
    583         raise ValueError(
    584             f"This {self.__class__.__name__} estimator "
    585             "requires y to be passed, but the target y is None."
    586         )

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/base.py:507, in BaseEstimator._check_feature_names(self, X, reset)
    502 if not missing_names and not unexpected_names:
    503     message += (
    504         "Feature names must be in the same order as they were in fit.\n"
    505     )
--> 507 raise ValueError(message)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- bs(age_grid, df=6, include_intercept=False)[0]
- bs(age_grid, df=6, include_intercept=False)[1]
- bs(age_grid, df=6, include_intercept=False)[2]
- bs(age_grid, df=6, include_intercept=False)[3]
- bs(age_grid, df=6, include_intercept=False)[4]
- ...
Feature names seen at fit time, yet now missing:
- bs(age, df=6, include_intercept=False)[0]
- bs(age, df=6, include_intercept=False)[1]
- bs(age, df=6, include_intercept=False)[2]
- bs(age, df=6, include_intercept=False)[3]
- bs(age, df=6, include_intercept=False)[4]
- ...

Natural Spline

age_spline3 = dmatrix('cr(age, df=4)',
                     data=wage,
                     return_type='dataframe')

age_grid_spline3 = dmatrix('cr(age_grid, df=4)',
                           {'age_grid': age_grid},
                           return_type='dataframe')

spline_reg3 = LinearRegression(fit_intercept=False)
spline_reg3.fit(age_spline3, wage.wage)

spline_pred3 = spline_reg3.predict(age_grid_spline3)

plt.scatter(wage.age, wage.wage, marker='o', color='gray', alpha=0.5, facecolor='None')
plt.plot(age_grid, spline_pred3);
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[16], line 12
      9 spline_reg3 = LinearRegression(fit_intercept=False)
     10 spline_reg3.fit(age_spline3, wage.wage)
---> 12 spline_pred3 = spline_reg3.predict(age_grid_spline3)
     14 plt.scatter(wage.age, wage.wage, marker='o', color='gray', alpha=0.5, facecolor='None')
     15 plt.plot(age_grid, spline_pred3);

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/linear_model/_base.py:386, in LinearModel.predict(self, X)
    372 def predict(self, X):
    373     """
    374     Predict using the linear model.
    375 
   (...)
    384         Returns predicted values.
    385     """
--> 386     return self._decision_function(X)

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/linear_model/_base.py:369, in LinearModel._decision_function(self, X)
    366 def _decision_function(self, X):
    367     check_is_fitted(self)
--> 369     X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], reset=False)
    370     return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/base.py:580, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
    509 def _validate_data(
    510     self,
    511     X="no_validation",
   (...)
    516     **check_params,
    517 ):
    518     """Validate input data and set or check the `n_features_in_` attribute.
    519 
    520     Parameters
   (...)
    578         validated.
    579     """
--> 580     self._check_feature_names(X, reset=reset)
    582     if y is None and self._get_tags()["requires_y"]:
    583         raise ValueError(
    584             f"This {self.__class__.__name__} estimator "
    585             "requires y to be passed, but the target y is None."
    586         )

File /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/base.py:507, in BaseEstimator._check_feature_names(self, X, reset)
    502 if not missing_names and not unexpected_names:
    503     message += (
    504         "Feature names must be in the same order as they were in fit.\n"
    505     )
--> 507 raise ValueError(message)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- cr(age_grid, df=4)[0]
- cr(age_grid, df=4)[1]
- cr(age_grid, df=4)[2]
- cr(age_grid, df=4)[3]
Feature names seen at fit time, yet now missing:
- cr(age, df=4)[0]
- cr(age, df=4)[1]
- cr(age, df=4)[2]
- cr(age, df=4)[3]

Couldn’t find a way to implement smooth.spline() function in Python

7.8.3 GAMs#

year_spl = dmatrix('cr(year, df=4)',
                   data=wage,
                   return_type='dataframe').drop('Intercept', axis=1)

age_spl = dmatrix('cr(age, df=5)',
                  data=wage,
                  return_type='dataframe').drop('Intercept', axis=1)

edu_gam = pd.get_dummies(wage.education).iloc[:, 1:]


wage_gam = pd.concat([wage.wage, year_spl, age_spl, edu_gam], axis=1)

X = wage_gam.iloc[:, 1:14]
y = wage_gam.wage

gam = LinearRegression(fit_intercept=True)
gam.fit(X, y)

gam.intercept_, gam.coef_
(447295243734966.25,
 array([-2.89259755e+14, -2.89259755e+14, -2.89259755e+14, -2.89259755e+14,
        -1.58035489e+14, -1.58035489e+14, -1.58035489e+14, -1.58035489e+14,
        -1.58035489e+14,  1.08027344e+01,  2.31533203e+01,  3.78828125e+01,
         6.23183594e+01]))