import os
from google.colab import files
MODULE = '04_data_analysis'
DATASET = 'vitamin_trial.csv'
BASE_PATH = '/content/data-analysis-projects'
MODULE_PATH = os.path.join(BASE_PATH, 'notebooks', MODULE)
DATASET_PATH = os.path.join('data', DATASET)

try:
    if not os.path.exists(BASE_PATH):
        !git clone https://github.com/ggkuhnle/data-analysis-projects.git
    os.chdir(MODULE_PATH)
    if os.path.exists(DATASET_PATH):
        print(f'Dataset found: {DATASET_PATH} ✅')
    else:
        raise FileNotFoundError('Dataset missing after clone.')
except Exception as e:
    print(f'Cloning failed: {e}')
    print('Upload dataset manually...')
    os.makedirs('data', exist_ok=True)
    uploaded = files.upload()
    if DATASET in uploaded:
        with open(DATASET_PATH, 'wb') as f:
            f.write(uploaded[DATASET])
📉 4.5 Regression Modelling
Regression models let us quantify relationships and make predictions. In this notebook we’ll move from simple OLS to multivariable, non-linear, and regularised models, add diagnostics, and finish with bootstrapping for robust uncertainty.
🎯 Objectives
- Build linear regression models (OLS, sklearn).
- Add categorical predictors and interactions.
- Extend to non-linear forms (poly, splines, random forest).
- Use regularisation (Ridge/Lasso) for shrinkage.
- Run diagnostics (residuals, heteroskedasticity, influence).
- Apply bootstrapping to quantify uncertainty.
Fun Fact
Regression is like a hippo guessing snack size: past patterns guide future meals 🦛
%pip install -q pandas numpy seaborn matplotlib statsmodels scikit-learn patsy
import pandas as pd, numpy as np
import seaborn as sns, matplotlib.pyplot as plt
import statsmodels.api as sm, statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.outliers_influence import variance_inflation_factor
sns.set_theme()
print('Regression environment ready.')
1) Load Data
We’ll use vitamin_trial.csv (simulated).
df = pd.read_csv('data/vitamin_trial.csv')
print(df.shape)
display(df.head())
2) Simple OLS Regression
Predict Vitamin D from Time using OLS.
m1 = smf.ols('Vitamin_D ~ Time', data=df).fit()
print(m1.summary())
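The summary reports coefficients; for actual predictions, statsmodels' get_prediction returns fitted values with intervals. A minimal sketch, assuming a few illustrative Time values (swap in values from your own data range):
new_times = pd.DataFrame({'Time': [1, 6, 12]})  # hypothetical time points, not from the dataset
pred = m1.get_prediction(new_times)             # works directly with the formula interface
display(pred.summary_frame(alpha=0.05))         # mean, confidence and prediction intervals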
3) Multivariable & Interactions
Add Group and its interaction with Time.
if {'Group', 'Time'}.issubset(df.columns):
    m2 = smf.ols('Vitamin_D ~ Time * C(Group)', data=df).fit()
    print(m2.summary())
4) sklearn Pipelines
Fit models with preprocessing pipelines.
X = df[['Time']]
y = df['Vitamin_D']
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
pipe = make_pipeline(StandardScaler(), LinearRegression())
pipe.fit(X_tr, y_tr)
print('R² test:', r2_score(y_te, pipe.predict(X_te)))
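A single train/test split can be noisy; since cross_val_score is already imported, a cross-validated check is a small extension. A sketch, assuming 5 folds are appropriate for this dataset's size:
cv_r2 = cross_val_score(make_pipeline(StandardScaler(), LinearRegression()),
                        X, y, cv=5, scoring='r2')  # 5-fold CV R² on the full data
print(f'CV R²: {cv_r2.mean():.3f} ± {cv_r2.std():.3f}')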
5) Non-linear Models
- Polynomial regression
- Splines (see the sketch after the random-forest example below)
- Random Forest
# Polynomial
poly = make_pipeline(PolynomialFeatures(3), LinearRegression())
poly.fit(X_tr, y_tr)
print('Poly R²:', r2_score(y_te, poly.predict(X_te)))
# Random forest
rf = RandomForestRegressor(n_estimators=200, random_state=1)
rf.fit(X_tr, y_tr)
print('RF R²:', r2_score(y_te, rf.predict(X_te)))
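For the splines bullet above, one option is a B-spline basis inside a statsmodels formula (patsy is already in the %pip install list). A minimal sketch, assuming 4 degrees of freedom is a reasonable default:
# Spline: patsy's bs() builds a B-spline basis for Time inside the formula
m_spline = smf.ols('Vitamin_D ~ bs(Time, df=4)', data=df).fit()
print('Spline R² (in-sample):', round(m_spline.rsquared, 3))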
6) Regularisation
Ridge and Lasso shrink coefficients, useful for multicollinearity.
Xd = pd.get_dummies(df[['Time', 'Group']], drop_first=True)
yd = df['Vitamin_D']

ridge = RidgeCV(alphas=[0.1, 1, 10]).fit(Xd, yd)
lasso = LassoCV(cv=5).fit(Xd, yd)
print('Ridge coeffs:', ridge.coef_)
print('Lasso coeffs:', lasso.coef_)
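Because Ridge and Lasso penalise coefficients on their raw scale, predictors are usually standardised first. A sketch using a pipeline (as a simplification it also scales the dummy column):
ridge_scaled = make_pipeline(StandardScaler(), RidgeCV(alphas=[0.1, 1, 10])).fit(Xd, yd)
print('Ridge coeffs (standardised predictors):', ridge_scaled.named_steps['ridgecv'].coef_)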
7) Diagnostics
- Residual plots
- Breusch–Pagan test
- VIF
- Influence (Cook’s D)
resid = m2.resid if 'm2' in globals() else m1.resid
fitted = m2.fittedvalues if 'm2' in globals() else m1.fittedvalues

plt.scatter(fitted, resid)
plt.axhline(0, color='red')
plt.title('Residuals vs Fitted')
plt.show()
lm, lm_p, fval, f_p = het_breuschpagan(resid, m2.model.exog if 'm2' in globals() else m1.model.exog)
print('Breusch–Pagan p=', lm_p)
# VIF
X_exog = m2.model.exog if 'm2' in globals() else m1.model.exog
vifs = [variance_inflation_factor(X_exog, i) for i in range(X_exog.shape[1])]
print('VIFs:', vifs)
# Influence
infl = m2.get_influence() if 'm2' in globals() else m1.get_influence()
print('Max Cook D:', infl.cooks_distance[0].max())
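The maximum alone doesn't say which rows matter; a sketch flagging observations above the common 4/n rule of thumb (a convention, not a strict cutoff):
cooks = infl.cooks_distance[0]
flagged = np.where(cooks > 4 / len(cooks))[0]  # row positions exceeding 4/n
print("Rows with Cook's D > 4/n:", flagged)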
8) Bootstrapping for Uncertainty
Pairs bootstrap for coefficient CIs and performance.
np.random.seed(42)
B = 500
params = []
for b in range(B):
    boot = df.sample(len(df), replace=True)
    try:
        m = smf.ols('Vitamin_D ~ Time + C(Group)', data=boot).fit()
        params.append(m.params)
    except Exception:
        continue

coef_df = pd.DataFrame(params)
ci = coef_df.quantile([0.025, 0.975]).T
est = smf.ols('Vitamin_D ~ Time + C(Group)', data=df).fit().params
boot_summary = pd.concat([est, ci], axis=1)
boot_summary.columns = ['estimate', 'low', 'high']
display(boot_summary)
r2s = []
for b in range(B):
    boot = df.sample(len(df), replace=True)
    Xb = pd.get_dummies(boot[['Time', 'Group']], drop_first=True)
    yb = boot['Vitamin_D']
    mdl = LinearRegression().fit(Xb, yb)
    r2s.append(mdl.score(Xb, yb))
print('Bootstrap R² CI:', np.percentile(r2s, [2.5, 97.5]))
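A caution: the loop above scores each model on the same rows it was fitted to, so the R² interval is optimistic. A variant sketch, assuming we instead score each bootstrap fit on its out-of-bag rows (those not drawn into the resample):
r2s_oob = []
for b in range(B):
    boot = df.sample(len(df), replace=True)
    oob = df.loc[~df.index.isin(boot.index)]  # rows not drawn into this resample
    if oob.empty:
        continue
    Xb = pd.get_dummies(boot[['Time', 'Group']], drop_first=True)
    Xo = pd.get_dummies(oob[['Time', 'Group']], drop_first=True).reindex(columns=Xb.columns, fill_value=0)
    mdl = LinearRegression().fit(Xb, boot['Vitamin_D'])
    r2s_oob.append(mdl.score(Xo, oob['Vitamin_D']))
print('Out-of-bag bootstrap R² CI:', np.percentile(r2s_oob, [2.5, 97.5]))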
✅ Conclusion
- OLS, multivariable and interactions, pipelines.
- Non-linear (poly, RF), regularisation.
- Diagnostics for assumptions.
- Bootstrapping for robust CIs and performance.
👉 Next: 4.6 Logistic Regression & Survival Analysis