Lab 3.6: Linear Regression#
3.6.2 Simple Linear Regression#
import warnings
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
warnings.filterwarnings("ignore")
%matplotlib inline
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:20,.2f}'.format) # get rid of scientific notation
plt.style.use('seaborn-v0_8') # pretty matplotlib plots (the bare 'seaborn' style was removed in matplotlib 3.8)
# load data
boston = pd.read_csv('../datasets/Boston.csv', index_col=0)
boston
| | crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0.01 | 18.00 | 2.31 | 0 | 0.54 | 6.58 | 65.20 | 4.09 | 1 | 296 | 15.30 | 396.90 | 4.98 | 24.00 |
2 | 0.03 | 0.00 | 7.07 | 0 | 0.47 | 6.42 | 78.90 | 4.97 | 2 | 242 | 17.80 | 396.90 | 9.14 | 21.60 |
3 | 0.03 | 0.00 | 7.07 | 0 | 0.47 | 7.18 | 61.10 | 4.97 | 2 | 242 | 17.80 | 392.83 | 4.03 | 34.70 |
4 | 0.03 | 0.00 | 2.18 | 0 | 0.46 | 7.00 | 45.80 | 6.06 | 3 | 222 | 18.70 | 394.63 | 2.94 | 33.40 |
5 | 0.07 | 0.00 | 2.18 | 0 | 0.46 | 7.15 | 54.20 | 6.06 | 3 | 222 | 18.70 | 396.90 | 5.33 | 36.20 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
502 | 0.06 | 0.00 | 11.93 | 0 | 0.57 | 6.59 | 69.10 | 2.48 | 1 | 273 | 21.00 | 391.99 | 9.67 | 22.40 |
503 | 0.05 | 0.00 | 11.93 | 0 | 0.57 | 6.12 | 76.70 | 2.29 | 1 | 273 | 21.00 | 396.90 | 9.08 | 20.60 |
504 | 0.06 | 0.00 | 11.93 | 0 | 0.57 | 6.98 | 91.00 | 2.17 | 1 | 273 | 21.00 | 396.90 | 5.64 | 23.90 |
505 | 0.11 | 0.00 | 11.93 | 0 | 0.57 | 6.79 | 89.30 | 2.39 | 1 | 273 | 21.00 | 393.45 | 6.48 | 22.00 |
506 | 0.05 | 0.00 | 11.93 | 0 | 0.57 | 6.03 | 80.80 | 2.50 | 1 | 273 | 21.00 | 396.90 | 7.88 | 11.90 |
506 rows × 14 columns
Using scikit-learn#
from sklearn import linear_model
# ols model with intercept
ols_sl = linear_model.LinearRegression(fit_intercept=True)
# fitted ols model (.values.reshape(-1, 1) turns the single predictor into the 2-D feature matrix scikit-learn expects)
x_train = boston['lstat'].values.reshape(-1, 1)
y_true = boston['medv']
ols_sl.fit(x_train, y_true)
# summary
ols_sl.intercept_, ols_sl.coef_
(34.5538408793831, array([-0.95004935]))
# metrics
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
y_pred = ols_sl.predict(boston['lstat'].values.reshape(-1, 1))
ols_sl_summary = {'R2': r2_score(y_true, y_pred),
'Ex. Var': explained_variance_score(y_true, y_pred),
'MSE': mean_squared_error(y_true, y_pred)}
for k, v in ols_sl_summary.items():
print(k, ':', v)
R2 : 0.5441462975864797
Ex. Var : 0.5441462975864798
MSE : 38.48296722989415
# out-of-sample predictions
ols_sl.predict(np.array([5, 10, 15]).reshape(-1, 1))
array([29.80359411, 25.05334734, 20.30310057])
Using statsmodels#
# using statsmodels
import statsmodels.api as sm
# predictor & dependent var
x_train = boston['lstat']
y_true = boston['medv']
# ols model with intercept added to predictor
ols_sm = sm.OLS(y_true, sm.add_constant(x_train))
# fitted model and summary
ols_sm_results = ols_sm.fit()
ols_sm_results.summary()
# robust SE (alternative, left commented out)
#ols_sm_robust = sm.RLM(y_true, sm.add_constant(x_train), M=sm.robust.norms.LeastSquares())
#ols_sm_robust.fit(cov='H2').summary()
Dep. Variable: | medv | R-squared: | 0.544 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.543 |
Method: | Least Squares | F-statistic: | 601.6 |
Date: | Sat, 11 Nov 2023 | Prob (F-statistic): | 5.08e-88 |
Time: | 11:47:24 | Log-Likelihood: | -1641.5 |
No. Observations: | 506 | AIC: | 3287. |
Df Residuals: | 504 | BIC: | 3295. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 34.5538 | 0.563 | 61.415 | 0.000 | 33.448 | 35.659 |
lstat | -0.9500 | 0.039 | -24.528 | 0.000 | -1.026 | -0.874 |
Omnibus: | 137.043 | Durbin-Watson: | 0.892 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 291.373 |
Skew: | 1.453 | Prob(JB): | 5.36e-64 |
Kurtosis: | 5.319 | Cond. No. | 29.7 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# out-of-sample predictions
ols_sm_results.predict(sm.add_constant([5, 10, 15]))
array([29.80359411, 25.05334734, 20.30310057])
# prediction intervals
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from collections import OrderedDict
y_pred = ols_sm_results.predict(sm.add_constant(x_train))
prstd, iv_l, iv_u = wls_prediction_std(ols_sm_results)
pred_dict = OrderedDict({'x_train': x_train,
'y_pred': y_pred,
'y_true': y_true,
'lwr': iv_l,
'upr': iv_u,
'pred_se': prstd})
pd.DataFrame(pred_dict)
| | x_train | y_pred | y_true | lwr | upr | pred_se |
---|---|---|---|---|---|---|
1 | 4.98 | 29.82 | 24.00 | 17.58 | 42.06 | 6.23 |
2 | 9.14 | 25.87 | 21.60 | 13.64 | 38.10 | 6.22 |
3 | 4.03 | 30.73 | 34.70 | 18.48 | 42.97 | 6.23 |
4 | 2.94 | 31.76 | 33.40 | 19.51 | 44.01 | 6.23 |
5 | 5.33 | 29.49 | 36.20 | 17.25 | 41.73 | 6.23 |
... | ... | ... | ... | ... | ... | ... |
502 | 9.67 | 25.37 | 22.40 | 13.14 | 37.59 | 6.22 |
503 | 9.08 | 25.93 | 20.60 | 13.70 | 38.15 | 6.22 |
504 | 5.64 | 29.20 | 23.90 | 16.96 | 41.43 | 6.23 |
505 | 6.48 | 28.40 | 22.00 | 16.16 | 40.63 | 6.23 |
506 | 7.88 | 27.07 | 11.90 | 14.84 | 39.30 | 6.22 |
506 rows × 6 columns
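Note that wls_prediction_std lives in the statsmodels sandbox; current statsmodels exposes the same mean and observation intervals through get_prediction. A minimal sketch reusing the fitted model above:
# same intervals via the stable API; obs_ci_lower/obs_ci_upper match lwr/upr above
pred = ols_sm_results.get_prediction(sm.add_constant(x_train))
pred.summary_frame(alpha=0.05)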
sns.regplot(x='lstat', y='medv', data=boston); # seaborn >= 0.12 requires keyword arguments for x and y
from statsmodels.graphics.regressionplots import plot_leverage_resid2
from statsmodels.stats.outliers_influence import OLSInfluence
from scipy.stats import zscore
ols_sm_resid = ols_sm_results.resid # residuals
ols_sm_resid_stud = ols_sm_resid / prstd # residuals scaled by prediction SE (for exact studentized residuals see OLSInfluence.resid_studentized_internal)
f, axes = plt.subplots(2, 2, sharex=False, sharey=False)
f.set_figheight(10)
f.set_figwidth(15)
sns.regplot(x='lstat', y='medv', data=boston, ax=axes[0, 0], scatter_kws={'alpha': 0.5}) # regression plot
sns.residplot(x=y_pred, y=boston['medv'], ax=axes[0, 1], scatter_kws={'alpha': 0.5}) # residual plot
#plot_leverage_resid2(ols_sm_results, ax=axes[1, 0], color='red') # leverage plot
# custom leverage plot instead of above
norm_resid = zscore(ols_sm_resid)
leverage = OLSInfluence(ols_sm_results).hat_matrix_diag
axes[1, 0].autoscale(enable=True, axis='y', tight=True)
axes[1, 0].scatter(norm_resid ** 2, leverage, alpha=0.5, color='red')
# studentized residual plot
axes[1, 1].scatter(y_pred, ols_sm_resid_stud, alpha=0.5, color='magenta')
axes[1, 1].axhline(0, ls=":", c=".2")
axes[1, 1].axhline(-3, ls=":", c=".6")
axes[1, 1].axhline(3, ls=":", c=".6")
axes[1, 1].set_ylim(-5, 5)
# annotate observations whose scaled residual falls outside +/- 3
outliers = np.abs(ols_sm_resid_stud) > 3
for i, xi, yi in zip(y_pred[outliers].index, y_pred[outliers], ols_sm_resid_stud[outliers]):
    axes[1, 1].annotate(i, xy=(xi, yi));
# observation with the highest leverage (0-indexed)
leverage.argmax(), leverage.max()
3.6.3 Multiple Linear Regression#
# predictors & dependent var
x_train = boston[['lstat', 'age']]
y_true = boston['medv']
# ols model with intercept added to predictor
ols_sm = sm.OLS(y_true, sm.add_constant(x_train))
# fitted model and summary
ols_sm_results = ols_sm.fit()
ols_sm_results.summary()
Dep. Variable: | medv | R-squared: | 0.551 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.549 |
Method: | Least Squares | F-statistic: | 309.0 |
Date: | Sat, 11 Nov 2023 | Prob (F-statistic): | 2.98e-88 |
Time: | 11:47:25 | Log-Likelihood: | -1637.5 |
No. Observations: | 506 | AIC: | 3281. |
Df Residuals: | 503 | BIC: | 3294. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 33.2228 | 0.731 | 45.458 | 0.000 | 31.787 | 34.659 |
lstat | -1.0321 | 0.048 | -21.416 | 0.000 | -1.127 | -0.937 |
age | 0.0345 | 0.012 | 2.826 | 0.005 | 0.011 | 0.059 |
Omnibus: | 124.288 | Durbin-Watson: | 0.945 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 244.026 |
Skew: | 1.362 | Prob(JB): | 1.02e-53 |
Kurtosis: | 5.038 | Cond. No. | 201. |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import statsmodels.formula.api as smf # R-style formula api
# ols model with intercept
ols_smf = smf.ols(formula='medv ~ lstat + age', data=boston)
# fitted model and summary
ols_smf_results = ols_smf.fit()
ols_smf_results.summary()
Dep. Variable: | medv | R-squared: | 0.551 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.549 |
Method: | Least Squares | F-statistic: | 309.0 |
Date: | Sat, 11 Nov 2023 | Prob (F-statistic): | 2.98e-88 |
Time: | 11:47:25 | Log-Likelihood: | -1637.5 |
No. Observations: | 506 | AIC: | 3281. |
Df Residuals: | 503 | BIC: | 3294. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
Intercept | 33.2228 | 0.731 | 45.458 | 0.000 | 31.787 | 34.659 |
lstat | -1.0321 | 0.048 | -21.416 | 0.000 | -1.127 | -0.937 |
age | 0.0345 | 0.012 | 2.826 | 0.005 | 0.011 | 0.059 |
Omnibus: | 124.288 | Durbin-Watson: | 0.945 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 244.026 |
Skew: | 1.362 | Prob(JB): | 1.02e-53 |
Kurtosis: | 5.038 | Cond. No. | 201. |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
def ols_formula(df, dependent_var, *excluded_cols):
'''
Generates the R style formula for statsmodels (patsy) given
the dataframe, dependent variable and optional excluded columns
as strings
'''
df_columns = list(df.columns.values)
df_columns.remove(dependent_var)
for col in excluded_cols:
df_columns.remove(col)
return dependent_var + ' ~ ' + ' + '.join(df_columns)
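A quick check of the helper, excluding a couple of columns (the resulting string is shown as a comment):
ols_formula(boston, 'medv', 'age', 'indus')
# 'medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + black + lstat'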
# ols model with intercept
ols_smf = smf.ols(formula=ols_formula(boston, 'medv'), data=boston)
# fitted model and summary
ols_smf_results = ols_smf.fit()
ols_smf_results.summary()
Dep. Variable: | medv | R-squared: | 0.741 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.734 |
Method: | Least Squares | F-statistic: | 108.1 |
Date: | Sat, 11 Nov 2023 | Prob (F-statistic): | 6.72e-135 |
Time: | 11:47:25 | Log-Likelihood: | -1498.8 |
No. Observations: | 506 | AIC: | 3026. |
Df Residuals: | 492 | BIC: | 3085. |
Df Model: | 13 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
Intercept | 36.4595 | 5.103 | 7.144 | 0.000 | 26.432 | 46.487 |
crim | -0.1080 | 0.033 | -3.287 | 0.001 | -0.173 | -0.043 |
zn | 0.0464 | 0.014 | 3.382 | 0.001 | 0.019 | 0.073 |
indus | 0.0206 | 0.061 | 0.334 | 0.738 | -0.100 | 0.141 |
chas | 2.6867 | 0.862 | 3.118 | 0.002 | 0.994 | 4.380 |
nox | -17.7666 | 3.820 | -4.651 | 0.000 | -25.272 | -10.262 |
rm | 3.8099 | 0.418 | 9.116 | 0.000 | 2.989 | 4.631 |
age | 0.0007 | 0.013 | 0.052 | 0.958 | -0.025 | 0.027 |
dis | -1.4756 | 0.199 | -7.398 | 0.000 | -1.867 | -1.084 |
rad | 0.3060 | 0.066 | 4.613 | 0.000 | 0.176 | 0.436 |
tax | -0.0123 | 0.004 | -3.280 | 0.001 | -0.020 | -0.005 |
ptratio | -0.9527 | 0.131 | -7.283 | 0.000 | -1.210 | -0.696 |
black | 0.0093 | 0.003 | 3.467 | 0.001 | 0.004 | 0.015 |
lstat | -0.5248 | 0.051 | -10.347 | 0.000 | -0.624 | -0.425 |
Omnibus: | 178.041 | Durbin-Watson: | 1.078 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 783.126 |
Skew: | 1.521 | Prob(JB): | 8.84e-171 |
Kurtosis: | 8.281 | Cond. No. | 1.51e+04 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.51e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
# variance inflation factors
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
# don't forget to add constant if the ols model includes intercept
boston_exog = sm.add_constant(boston.drop('medv', axis=1))
# pad short column names so the printed values line up
for i, col in enumerate(boston_exog.columns):
if col == 'const':
pass
elif len(col) > 6:
print(col, ':', "{0:.2f}".format(vif(boston_exog.values, i)))
else:
print(col, '\t:', "{0:.2f}".format(vif(boston_exog.values, i)))
crim : 1.79
zn : 2.30
indus : 3.99
chas : 1.07
nox : 4.39
rm : 1.93
age : 3.10
dis : 3.96
rad : 7.48
tax : 9.01
ptratio : 1.80
black : 1.35
lstat : 2.94
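The same numbers read more cleanly as a pandas Series; a sketch that skips the constant, whose VIF is not meaningful:
# one VIF per predictor, constant excluded
pd.Series([vif(boston_exog.values, i) for i in range(1, boston_exog.shape[1])],
          index=boston_exog.columns[1:]).round(2)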
# ols model with intercept
ols_smf = smf.ols(formula=ols_formula(boston, 'medv', 'age'), data=boston)
# fitted model and summary
ols_smf_results = ols_smf.fit()
ols_smf_results.summary()
Dep. Variable: | medv | R-squared: | 0.741 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.734 |
Method: | Least Squares | F-statistic: | 117.3 |
Date: | Sat, 11 Nov 2023 | Prob (F-statistic): | 6.08e-136 |
Time: | 11:47:25 | Log-Likelihood: | -1498.8 |
No. Observations: | 506 | AIC: | 3024. |
Df Residuals: | 493 | BIC: | 3079. |
Df Model: | 12 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
Intercept | 36.4369 | 5.080 | 7.172 | 0.000 | 26.456 | 46.418 |
crim | -0.1080 | 0.033 | -3.290 | 0.001 | -0.173 | -0.043 |
zn | 0.0463 | 0.014 | 3.404 | 0.001 | 0.020 | 0.073 |
indus | 0.0206 | 0.061 | 0.335 | 0.738 | -0.100 | 0.141 |
chas | 2.6890 | 0.860 | 3.128 | 0.002 | 1.000 | 4.378 |
nox | -17.7135 | 3.679 | -4.814 | 0.000 | -24.943 | -10.484 |
rm | 3.8144 | 0.408 | 9.338 | 0.000 | 3.012 | 4.617 |
dis | -1.4786 | 0.191 | -7.757 | 0.000 | -1.853 | -1.104 |
rad | 0.3058 | 0.066 | 4.627 | 0.000 | 0.176 | 0.436 |
tax | -0.0123 | 0.004 | -3.283 | 0.001 | -0.020 | -0.005 |
ptratio | -0.9522 | 0.130 | -7.308 | 0.000 | -1.208 | -0.696 |
black | 0.0093 | 0.003 | 3.481 | 0.001 | 0.004 | 0.015 |
lstat | -0.5239 | 0.048 | -10.999 | 0.000 | -0.617 | -0.430 |
Omnibus: | 178.343 | Durbin-Watson: | 1.078 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 786.386 |
Skew: | 1.523 | Prob(JB): | 1.73e-171 |
Kurtosis: | 8.294 | Cond. No. | 1.48e+04 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.48e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
3.6.4 Interaction Terms#
# ols model with intercept (in patsy, lstat * age expands to lstat + age + lstat:age)
ols_smf = smf.ols(formula='medv ~ lstat * age', data=boston)
# fitted model and summary
ols_smf_results = ols_smf.fit()
ols_smf_results.summary()
Dep. Variable: | medv | R-squared: | 0.556 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.553 |
Method: | Least Squares | F-statistic: | 209.3 |
Date: | Sat, 11 Nov 2023 | Prob (F-statistic): | 4.86e-88 |
Time: | 11:47:25 | Log-Likelihood: | -1635.0 |
No. Observations: | 506 | AIC: | 3278. |
Df Residuals: | 502 | BIC: | 3295. |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
Intercept | 36.0885 | 1.470 | 24.553 | 0.000 | 33.201 | 38.976 |
lstat | -1.3921 | 0.167 | -8.313 | 0.000 | -1.721 | -1.063 |
age | -0.0007 | 0.020 | -0.036 | 0.971 | -0.040 | 0.038 |
lstat:age | 0.0042 | 0.002 | 2.244 | 0.025 | 0.001 | 0.008 |
Omnibus: | 135.601 | Durbin-Watson: | 0.965 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 296.955 |
Skew: | 1.417 | Prob(JB): | 3.29e-65 |
Kurtosis: | 5.461 | Cond. No. | 6.88e+03 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.88e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
3.6.5 Non-linear Transformations of Predictors#
# ols model with intercept
ols_smf = smf.ols(formula='medv ~ lstat + np.power(lstat, 2)', data=boston)
# fitted model and summary
ols_smf_results = ols_smf.fit()
ols_smf_results.summary()
Dep. Variable: | medv | R-squared: | 0.641 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.639 |
Method: | Least Squares | F-statistic: | 448.5 |
Date: | Sat, 11 Nov 2023 | Prob (F-statistic): | 1.56e-112 |
Time: | 11:47:25 | Log-Likelihood: | -1581.3 |
No. Observations: | 506 | AIC: | 3169. |
Df Residuals: | 503 | BIC: | 3181. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
Intercept | 42.8620 | 0.872 | 49.149 | 0.000 | 41.149 | 44.575 |
lstat | -2.3328 | 0.124 | -18.843 | 0.000 | -2.576 | -2.090 |
np.power(lstat, 2) | 0.0435 | 0.004 | 11.628 | 0.000 | 0.036 | 0.051 |
Omnibus: | 107.006 | Durbin-Watson: | 0.921 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 228.388 |
Skew: | 1.128 | Prob(JB): | 2.55e-50 |
Kurtosis: | 5.397 | Cond. No. | 1.13e+03 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.13e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# anova of the two models: the near-zero p-value on the F-test says the quadratic term clearly improves on the linear fit
ols_smf = smf.ols(formula='medv ~ lstat', data=boston).fit()
ols_smf2 = smf.ols(formula='medv ~ lstat + np.power(lstat, 2)', data=boston).fit()
sm.stats.anova_lm(ols_smf, ols_smf2)
| | df_resid | ssr | df_diff | ss_diff | F | Pr(>F) |
---|---|---|---|---|---|---|
0 | 504.00 | 19,472.38 | 0.00 | NaN | NaN | NaN |
1 | 503.00 | 15,347.24 | 1.00 | 4,125.14 | 135.20 | 0.00 |
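As a sanity check, the F statistic can be recomputed by hand from the two residual sums of squares:
# F = (drop in SSR) / (MSE of the larger model); should print ~135.2, matching anova_lm
(ols_smf.ssr - ols_smf2.ssr) / (ols_smf2.ssr / ols_smf2.df_resid)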
f, axes = plt.subplots(4, 2, sharex=False, sharey=False)
f.set_figheight(20)
# polynomial fits of increasing order (seaborn >= 0.12 requires keyword x and y)
for axis, order in ((axes[0, 0], 1), (axes[0, 1], 2), (axes[1, 0], 3), (axes[1, 1], 5)):
    sns.regplot(x='lstat', y='medv', data=boston, ax=axis, order=order, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5})
# residual plots for the same polynomial orders
for axis, order in ((axes[2, 0], 1), (axes[2, 1], 2), (axes[3, 0], 3), (axes[3, 1], 5)):
    sns.residplot(x='lstat', y='medv', data=boston, ax=axis, order=order, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5})
# polynomial ols model with intercept
# numpy way
#ols_smf = smf.ols(formula='medv ~ lstat + np.power(lstat, 2) + np.power(lstat, 3) + np.power(lstat, 4) + np.power(lstat, 5)',
# data=boston)
# patsy way: I() protects ** so it is applied arithmetically rather than read as a formula operator
ols_smf = smf.ols(formula='medv ~ lstat + I(lstat**2) + I(lstat**3) + I(lstat**4) + I(lstat**5)',
data=boston)
# fitted model and summary
ols_smf_results = ols_smf.fit()
ols_smf_results.summary()
Dep. Variable: | medv | R-squared: | 0.682 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.679 |
Method: | Least Squares | F-statistic: | 214.2 |
Date: | Sat, 11 Nov 2023 | Prob (F-statistic): | 8.73e-122 |
Time: | 11:47:26 | Log-Likelihood: | -1550.6 |
No. Observations: | 506 | AIC: | 3113. |
Df Residuals: | 500 | BIC: | 3139. |
Df Model: | 5 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
Intercept | 67.6997 | 3.604 | 18.783 | 0.000 | 60.618 | 74.781 |
lstat | -11.9911 | 1.526 | -7.859 | 0.000 | -14.989 | -8.994 |
I(lstat ** 2) | 1.2728 | 0.223 | 5.703 | 0.000 | 0.834 | 1.711 |
I(lstat ** 3) | -0.0683 | 0.014 | -4.747 | 0.000 | -0.097 | -0.040 |
I(lstat ** 4) | 0.0017 | 0.000 | 4.143 | 0.000 | 0.001 | 0.003 |
I(lstat ** 5) | -1.632e-05 | 4.42e-06 | -3.692 | 0.000 | -2.5e-05 | -7.63e-06 |
Omnibus: | 144.085 | Durbin-Watson: | 0.987 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 494.545 |
Skew: | 1.292 | Prob(JB): | 4.08e-108 |
Kurtosis: | 7.096 | Cond. No. | 1.37e+08 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.37e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
# log-transformed predictor, ols model with intercept
ols_smf = smf.ols(formula='medv ~ np.log(rm)', data=boston)
# fitted model and summary
ols_smf_results = ols_smf.fit()
ols_smf_results.summary()
Dep. Variable: | medv | R-squared: | 0.436 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.435 |
Method: | Least Squares | F-statistic: | 389.3 |
Date: | Sat, 11 Nov 2023 | Prob (F-statistic): | 1.22e-64 |
Time: | 11:47:26 | Log-Likelihood: | -1695.4 |
No. Observations: | 506 | AIC: | 3395. |
Df Residuals: | 504 | BIC: | 3403. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -76.4878 | 5.028 | -15.213 | 0.000 | -86.366 | -66.610 |
np.log(rm) | 54.0546 | 2.739 | 19.732 | 0.000 | 48.672 | 59.437 |
Omnibus: | 117.102 | Durbin-Watson: | 0.681 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 584.336 |
Skew: | 0.916 | Prob(JB): | 1.30e-127 |
Kurtosis: | 7.936 | Cond. No. | 38.9 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
sns.regplot(x='rm',
            y='medv',
            data=boston,
            logx=True,
            line_kws={'color': 'gray'},
            scatter_kws={'alpha': 0.5});
3.6.6 Qualitative Predictors#
carseats = pd.read_csv('../datasets/Carseats.csv', index_col=0)
carseats
| | Sales | CompPrice | Income | Advertising | Population | Price | ShelveLoc | Age | Education | Urban | US |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 9.50 | 138 | 73 | 11 | 276 | 120 | Bad | 42 | 17 | Yes | Yes |
2 | 11.22 | 111 | 48 | 16 | 260 | 83 | Good | 65 | 10 | Yes | Yes |
3 | 10.06 | 113 | 35 | 10 | 269 | 80 | Medium | 59 | 12 | Yes | Yes |
4 | 7.40 | 117 | 100 | 4 | 466 | 97 | Medium | 55 | 14 | Yes | Yes |
5 | 4.15 | 141 | 64 | 3 | 340 | 128 | Bad | 38 | 13 | Yes | No |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
396 | 12.57 | 138 | 108 | 17 | 203 | 128 | Good | 33 | 14 | Yes | Yes |
397 | 6.14 | 139 | 23 | 3 | 37 | 120 | Medium | 55 | 11 | No | Yes |
398 | 7.41 | 162 | 26 | 12 | 368 | 159 | Medium | 40 | 18 | Yes | Yes |
399 | 5.94 | 100 | 79 | 7 | 284 | 95 | Bad | 50 | 12 | Yes | Yes |
400 | 9.71 | 134 | 37 | 0 | 27 | 120 | Good | 49 | 16 | Yes | Yes |
400 rows × 11 columns
# ols model with intercept
form = ols_formula(carseats, 'Sales', 'ShelveLoc') + ' + Income:Advertising + Price:Age + C(ShelveLoc)'
ols_smf = smf.ols(formula=form, data=carseats)
# fitted model and summary
ols_smf_results = ols_smf.fit()
ols_smf_results.summary()
Dep. Variable: | Sales | R-squared: | 0.876 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.872 |
Method: | Least Squares | F-statistic: | 210.0 |
Date: | Sat, 11 Nov 2023 | Prob (F-statistic): | 6.14e-166 |
Time: | 11:47:26 | Log-Likelihood: | -564.67 |
No. Observations: | 400 | AIC: | 1157. |
Df Residuals: | 386 | BIC: | 1213. |
Df Model: | 13 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
Intercept | 6.5756 | 1.009 | 6.519 | 0.000 | 4.592 | 8.559 |
Urban[T.Yes] | 0.1402 | 0.112 | 1.247 | 0.213 | -0.081 | 0.361 |
US[T.Yes] | -0.1576 | 0.149 | -1.058 | 0.291 | -0.450 | 0.135 |
C(ShelveLoc)[T.Good] | 4.8487 | 0.153 | 31.724 | 0.000 | 4.548 | 5.149 |
C(ShelveLoc)[T.Medium] | 1.9533 | 0.126 | 15.531 | 0.000 | 1.706 | 2.201 |
CompPrice | 0.0929 | 0.004 | 22.567 | 0.000 | 0.085 | 0.101 |
Income | 0.0109 | 0.003 | 4.183 | 0.000 | 0.006 | 0.016 |
Advertising | 0.0702 | 0.023 | 3.107 | 0.002 | 0.026 | 0.115 |
Population | 0.0002 | 0.000 | 0.433 | 0.665 | -0.001 | 0.001 |
Price | -0.1008 | 0.007 | -13.549 | 0.000 | -0.115 | -0.086 |
Age | -0.0579 | 0.016 | -3.633 | 0.000 | -0.089 | -0.027 |
Education | -0.0209 | 0.020 | -1.063 | 0.288 | -0.059 | 0.018 |
Income:Advertising | 0.0008 | 0.000 | 2.698 | 0.007 | 0.000 | 0.001 |
Price:Age | 0.0001 | 0.000 | 0.801 | 0.424 | -0.000 | 0.000 |
Omnibus: | 1.281 | Durbin-Watson: | 2.047 |
---|---|---|---|
Prob(Omnibus): | 0.527 | Jarque-Bera (JB): | 1.147 |
Skew: | 0.129 | Prob(JB): | 0.564 |
Kurtosis: | 3.050 | Cond. No. | 1.31e+05 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.31e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
from patsy.contrasts import Treatment
Treatment(reference=0).code_without_intercept(list(carseats['ShelveLoc'].unique())).matrix
array([[0., 0.],
[1., 0.],
[0., 1.]])
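patsy can also set the reference level inline in the formula; a minimal sketch with 'Good' as an arbitrary alternative baseline:
# dummies now measured relative to ShelveLoc == 'Good'
smf.ols(formula='Sales ~ C(ShelveLoc, Treatment(reference="Good"))', data=carseats).fit().params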
3.6.7 Writing Functions#
come on… we already wrote one above (ols_formula, in the Multiple Linear Regression section).
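Still, for completeness, a sketch mirroring the R lab's LoadLibraries() example:
def load_libraries():
    '''
    Imports the modules this lab depends on and confirms the call ran.
    '''
    import numpy, pandas, seaborn
    import statsmodels.api
    print('The libraries have been loaded.')
load_libraries()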