# Colab setup: clone repo and locate data
import os
from google.colab import files
= '05_advanced'
MODULE = '/content/data-analysis-projects'
BASE_PATH = os.path.join(BASE_PATH, 'notebooks', MODULE)
MODULE_PATH
# We'll read hippo nutrients from the data-handling module
= '03_data_handling'
DATA_MODULE = os.path.join(BASE_PATH, 'notebooks', DATA_MODULE, 'data', 'hippo_nutrients.csv')
NUTRIENTS_PATH
try:
print('Attempting to clone repository...')
if not os.path.exists(BASE_PATH):
!git clone https://github.com/ggkuhnle/data-analysis-projects.git
print('Setting working directory...')
os.chdir(MODULE_PATH)if os.path.exists(NUTRIENTS_PATH):
print(f'Dataset found: {NUTRIENTS_PATH} ✅')
else:
raise FileNotFoundError('hippo_nutrients.csv missing after clone.')
except Exception as e:
print(f'Cloning failed: {e}')
print('You can upload hippo_nutrients.csv manually to notebooks/03_data_handling/data/.')
🧠 5.3 Advanced Bayesian Modelling
In this notebook we go beyond basic Bayes and build hierarchical (multilevel) models for nutrition data. You’ll see how partial pooling stabilises estimates across groups, how robust likelihoods handle outliers, and how to do prior/posterior predictive checks, diagnostics, and model comparison.
You will: - Fit hierarchical models with varying intercepts (and optional slopes). - Compare Normal vs Student-t likelihoods for robustness. - Run prior predictive and posterior predictive checks. - Inspect diagnostics (R-hat, ESS) and compare models via WAIC/LOO. - Use non-centred parameterisation for better sampling.
When to use hierarchical Bayes?
When you have repeated measures, sites/years/sexes/diets, or any natural grouping where you want to share strength across groups without assuming they’re identical.%pip install -q pandas numpy pymc arviz matplotlib
import pandas as pd, numpy as np, matplotlib.pyplot as plt
import pymc as pm, arviz as az
'display.max_columns', 40)
pd.set_option(print('Bayesian environment ready.')
📥 Load & Prepare Data
We’ll model Iron intake as a function of Sex (F/M) and Year (2024/2025), using partial pooling across groups (Sex × Year). Feel free to switch to Calcium/Vitamin_D later.
= '../03_data_handling/data/hippo_nutrients.csv' # relative to this notebook
NUTRIENTS_PATH = pd.read_csv(NUTRIENTS_PATH)
df = df.dropna(subset=['Nutrient', 'Value', 'Sex', 'Year'])
df = df[df['Nutrient']=='Iron'].copy()
df_iron 'Sex'] = df_iron['Sex'].astype('category')
df_iron['Year'] = df_iron['Year'].astype('category')
df_iron['group'] = (df_iron['Sex'].astype(str) + '_' + df_iron['Year'].astype(str)).astype('category')
df_iron[print(df_iron[['ID','Nutrient','Year','Sex','Value']].head())
print('\nGroups:', df_iron['group'].cat.categories.tolist())
print('Rows:', len(df_iron))
🎯 Prior Predictive Check
Before seeing data, do our priors imply plausible values? We’ll set weakly-informative priors and simulate from the prior predictive.
# Assumes df_iron exists with columns: 'Value' (numeric), 'group' (categorical)
= df_iron['Value'].values
y = df_iron['group'].cat.codes.values
group_idx = df_iron['group'].cat.categories.size
n_groups
with pm.Model() as prior_model:
= pm.Normal('mu_global', mu=8, sigma=2) # prior mean around ~8
mu_global = pm.HalfNormal('tau_group', sigma=1) # group SD
tau_group = pm.Normal('mu_group', mu=mu_global, sigma=tau_group, shape=n_groups)
mu_group = pm.HalfNormal('sigma', sigma=1)
sigma
# For prior predictive draws, DO NOT pass observed
= pm.Normal('y_like', mu=mu_group[group_idx], sigma=sigma, shape=y.shape[0])
y_like
= pm.sample_prior_predictive(
prior_pred =1000,
samples=42,
random_seed=["y_like"],
var_names=False, # return a plain dict for simple access
return_inferencedata
)
# prior_pred["y_like"] has shape (samples, n_obs); flatten for a pooled prior predictive density
= np.asarray(prior_pred["y_like"]).ravel()
ypp
='kde')
az.plot_dist(ypp, kind'Prior Predictive: Iron (Value)')
plt.title('Value')
plt.xlabel( plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[2], line 2 1 # Assumes df_iron exists with columns: 'Value' (numeric), 'group' (categorical) ----> 2 y = df_iron['Value'].values 3 group_idx = df_iron['group'].cat.codes.values 4 n_groups = df_iron['group'].cat.categories.size NameError: name 'df_iron' is not defined
🏗️ Hierarchical Model (Normal likelihood)
Partial pooling of group means (Sex×Year). This stabilises small groups by sharing information.
- Global mean
mu_global
- Group deviations
mu_group ~ Normal(mu_global, tau_group)
- Observation noise
sigma
Tip: non-centred parameterisation
We’ll use a non-centred form to help sampling when groups are weakly informed:mu_group = mu_global + z * tau_group
, z ~ Normal(0,1)
.
= df_iron['Value'].values
y_raw = y_raw.mean(), y_raw.std()
y_mean, y_std = (y_raw - y_mean) / y_std
y
= df_iron['group'].cat.codes.values
group_idx = {'group': df_iron['group'].cat.categories.tolist()}
coords
with pm.Model(coords=coords) as hier_normal:
'group_idx', group_idx)
pm.Data(
# On standardised scale:
= pm.Normal('mu_global', mu=0.0, sigma=1.0) # prior mean near 0 on z-scale
mu_global = pm.HalfNormal('tau_group', sigma=0.5) # between-group SD (z-scale)
tau_group = pm.Normal('z', mu=0.0, sigma=1.0, dims='group')
z = pm.Deterministic('mu_group', mu_global + z * tau_group, dims='group')
mu_group
= pm.HalfNormal('sigma', sigma=0.5) # within-group SD (z-scale)
sigma
= pm.Normal('y_obs', mu=mu_group[group_idx], sigma=sigma, observed=y)
y_obs
= pm.sample(
idata_normal =2000,
draws=3000,
tune=0.95, # smaller steps; reduce divergences
target_accept=4,
chains=42,
random_seed=True
return_inferencedata
)
= pm.sample_posterior_predictive(idata_normal, random_seed=42, var_names=['y_obs','mu_group'])
ppc_normal
# --- Diagnostics ---
print("Total divergences:", int(idata_normal.sample_stats['diverging'].sum()))
; plt.show()
az.plot_energy(idata_normal)=['mu_global','tau_group','sigma']); plt.show()
az.plot_rank(idata_normal, var_names
# --- Summary on z-scale ---
=['mu_global','tau_group','sigma','mu_group'], round_to=2))
display(az.summary(idata_normal, var_names
# --- Optional: back-transform key parameters to original units ---
# mu_global_back = mu_global_z * y_std + y_mean
= idata_normal.posterior['mu_global'].values * y_std + y_mean
mu_global_samples = idata_normal.posterior['sigma'].values * y_std
sigma_samples = idata_normal.posterior['tau_group'].values * y_std
tau_group_samples = idata_normal.posterior['mu_group'].values * y_std + y_mean
mu_group_samples
print("Posterior mean (original units):")
print(" mu_global ≈", mu_global_samples.mean())
print(" sigma ≈", sigma_samples.mean())
print(" tau_group ≈", tau_group_samples.mean())
Posterior Predictive Check (Normal)
Compare simulated y_rep
to observed y
. Look for systematic mismatches (spread, tails, skew).
with hier_normal:
= pm.sample_posterior_predictive(
ppc_normal
idata_normal,=["y_obs"],
var_names=42,
random_seed=True,
return_inferencedata
)
={"y_obs": "y_obs"})
az.plot_ppc(ppc_normal, data_pairs"PPC: Hierarchical Normal")
plt.title( plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[1], line 1 ----> 1 with hier_normal: 2 ppc_normal = pm.sample_posterior_predictive( 3 idata_normal, 4 var_names=["y_obs"], 5 random_seed=42, 6 return_inferencedata=True, 7 ) 9 az.plot_ppc(ppc_normal, data_pairs={"y_obs": "y_obs"}) NameError: name 'hier_normal' is not defined
🧱 Robust Hierarchical Model (Student-t likelihood)
Outliers/severe tails? Use Student-t with unknown degrees of freedom ν
.
y_obs ~ StudentT(ν, mu_group[group_idx], sigma)
ν
with weakly informative prior (e.g., Exponential) to allow heavy tails when needed.
Why t?
It down-weights extreme observations relative to Normal, often yielding more stable inferences in messy nutrition data.with pm.Model(coords=coords) as hier_t:
'group_idx', group_idx)
pm.Data(
= pm.Normal('mu_global', mu=8, sigma=2)
mu_global = pm.HalfNormal('tau_group', sigma=1)
tau_group = pm.Normal('z', mu=0, sigma=1, dims='group')
z = pm.Deterministic('mu_group', mu_global + z * tau_group, dims='group')
mu_group = pm.HalfNormal('sigma', sigma=1)
sigma = pm.Exponential('nu', lam=1/10) # mean 10, reasonably heavy tails allowed
nu
= pm.StudentT('y_obs', nu=nu, mu=mu_group[group_idx], sigma=sigma, observed=y)
y_obs
= pm.sample(1000, tune=1000, target_accept=0.9, chains=4, random_seed=42, return_inferencedata=True)
idata_t = pm.sample_posterior_predictive(idata_t, random_seed=42)
ppc_t
=['mu_global','tau_group','sigma','nu','mu_group'], round_to=2) az.summary(idata_t, var_names
with hier_t:
= pm.sample_posterior_predictive(
ppc_normal
idata_normal,=["y_obs"],
var_names=42,
random_seed=True,
return_inferencedata
)
={"y_obs": "y_obs"})
az.plot_ppc(ppc_normal, data_pairs"PPC: Hierarchical Normal")
plt.title( plt.show()
🧪 Diagnostics & Model Comparison
Check R-hat (~1.00), ESS (large), and compare models by WAIC/LOO. Lower is better (penalises complexity).
# --- helper: ensure log_likelihood is attached (PyMC≥5 API) ---
def ensure_loglik(idata, model):
if hasattr(idata, "log_likelihood"):
return idata
with model:
= pm.compute_log_likelihood(idata)
ll = idata.copy()
out
out.extend(ll)return out
# Attach log-likelihoods if needed
= ensure_loglik(idata_normal, hier_normal)
idata_normal_ll = ensure_loglik(idata_t, hier_t)
idata_t_ll
# Choose variables to report
= ["mu_global", "tau_group", "sigma", "mu_group"]
vars_normal = ["mu_global", "tau_group", "sigma", "nu", "mu_group"]
vars_t
# --- R-hat & ESS tables ---
print("R-hat & ESS — Normal model")
= az.summary(idata_normal_ll, var_names=vars_normal, round_to=2)
summ_norm "r_hat", "ess_bulk", "ess_tail"]])
display(summ_norm[[
print("\nR-hat & ESS — Student-t model")
= az.summary(idata_t_ll, var_names=vars_t, round_to=2)
summ_t "r_hat", "ess_bulk", "ess_tail"]])
display(summ_t[[
# If you want arrays directly:
# rhat_norm = az.rhat(idata_normal_ll)
# essb_norm = az.ess(idata_normal_ll, method="bulk")
# esst_norm = az.ess(idata_normal_ll, method="tail")
# --- Model comparison (lower is better) ---
print("\nModel comparison (WAIC, BB-pseudo-BMA)")
= az.compare(
cmp_waic "normal": idata_normal_ll, "student_t": idata_t_ll},
{="waic",
ic="BB-pseudo-BMA",
method
)
display(cmp_waic)
print("\nModel comparison (LOO, BB-pseudo-BMA)")
= az.compare(
cmp_loo "normal": idata_normal_ll, "student_t": idata_t_ll},
{="loo",
ic="BB-pseudo-BMA",
method
) display(cmp_loo)
🎯 Group Contrasts (Posterior Differences)
You often want contrasts like Treatment vs Control or F_2025 − F_2024. We’ll compute differences between selected mu_group
levels from the robust model.
= df_iron['group'].cat.categories.tolist()
groups print('Groups:', groups)
= {g:i for i,g in enumerate(groups)}
gmap
= idata_t.posterior['mu_group'] # dims: chain, draw, group
mu_post
# Example contrasts: (F_2025 - F_2024) and (M_2025 - M_2024) if present
def posterior_diff(mu, g1, g0):
return (mu.sel(group=g1) - mu.sel(group=g0)).stack(sample=('chain','draw')).values
= {}
contrast_results if 'F_2025' in groups and 'F_2024' in groups:
= posterior_diff(mu_post, 'F_2025', 'F_2024')
d 'F_2025 - F_2024'] = d
contrast_results[if 'M_2025' in groups and 'M_2024' in groups:
= posterior_diff(mu_post, 'M_2025', 'M_2024')
d 'M_2025 - M_2024'] = d
contrast_results[
for name, d in contrast_results.items():
=0)
az.plot_posterior(d, ref_valf'Posterior Contrast: {name}')
plt.title( plt.show()
📈 (Optional) Varying Intercepts & Slopes
If you have a continuous covariate (e.g., Age or BodyWeight_kg), allow group-specific slopes:
mu_group = mu_global + z_inter[group]*tau_inter
beta_group = beta_global + z_slope[group]*tau_slope
y ~ Normal(mu_group[group_idx] + beta_group[group_idx]*x, sigma)
Learn more
- PyMC docs: https://www.pymc.io/
- ArviZ model comparison: https://python.arviz.org/
🧩 Exercises
- Switch nutrient: Fit the same hierarchical models for Calcium. Compare WAIC/LOO to decide between Normal vs Student-t.
- Add covariate: If your dataset has
Age
orBodyWeight_kg
, build a varying-slopes model. Does partial pooling shrink extreme slopes? - Prior sensitivity: Widen/narrow
tau_group
prior and re-run. How do group means change? - Predict new group: Add a new year with few observations and inspect how partial pooling stabilises its estimate.
✅ Wrap-up
You built hierarchical models, used robust likelihoods, checked priors/posteriors, validated diagnostics, and compared models. These tools generalise well to multi-site nutrition studies, repeated measures, and small-sample subgroups.
Further reading
- Gelman et al., Bayesian Data Analysis
- PyMC docs: https://www.pymc.io/
- ArviZ: https://python.arviz.org/