# Setup for Google Colab: fetch dataset automatically or allow manual upload
import os
from google.colab import files

MODULE = '04_data_analysis'
DATASET = 'vitamin_trial.csv'
BASE_PATH = '/content/data-analysis-projects'
MODULE_PATH = os.path.join(BASE_PATH, 'notebooks', MODULE)
DATASET_PATH = os.path.join('data', DATASET)

try:
    print('Attempting to clone repository...')
    if not os.path.exists(BASE_PATH):
        !git clone https://github.com/ggkuhnle/data-analysis-projects.git
    print('Setting working directory...')
    os.chdir(MODULE_PATH)
    if os.path.exists(DATASET_PATH):
        print(f'Dataset found: {DATASET_PATH} ✅')
    else:
        raise FileNotFoundError('Dataset missing after clone.')
except Exception as e:
    print(f'Cloning failed: {e}')
    print('Falling back to manual upload...')
    os.makedirs('data', exist_ok=True)
    uploaded = files.upload()
    if DATASET in uploaded:
        with open(DATASET_PATH, 'wb') as f:
            f.write(uploaded[DATASET])
        print(f'Successfully uploaded {DATASET} ✅')
    else:
        raise FileNotFoundError(f'Upload failed. Please upload {DATASET}.')
🧪 4.4 Statistical Testing – From Questions to Evidence
In this notebook you'll practice hypothesis testing on vitamin_trial.csv. We'll start with assumption checks, then run parametric and non-parametric tests, include effect sizes with confidence intervals, and handle multiple comparisons.
🎯 Objectives
- Choose the right test for a research question (one-sample, two-sample, paired, one-way ANOVA).
- Check assumptions (approx. normality, variance homogeneity) and use non-parametric alternatives when needed.
- Report p-values and effect sizes (Cohen's d, Hedges' g) with 95% CI.
- Handle multiple comparisons (Holm or FDR/Benjamini–Hochberg).
Quick refresher
- One-sample: compare a mean to a reference value.
- Two-sample (independent): compare means of two groups (Welch default for unequal variances).
- Paired: compare means of before/after within the same ID.
- One-way ANOVA: compare means across ≥3 groups (plus post-hoc with multiplicity control).
- Non-parametric: Wilcoxon signed-rank (paired), Mann–Whitney U (independent), Kruskal–Wallis (≥3 groups).
Each of these maps onto a single scipy.stats call; see the sketch below.
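A minimal sketch of that mapping, on simulated placeholder data rather than the trial dataset (the group means and sizes are invented purely for illustration):

```python
# Orientation only: each design above mapped to its scipy.stats call,
# run on simulated data (NOT vitamin_trial.csv)
import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
x = rng.normal(12, 3, size=30)                 # e.g., a 'Control' arm
y = rng.normal(14, 4, size=28)                 # e.g., a 'Treatment' arm
z = rng.normal(13, 3, size=25)                 # a third group for ANOVA
pre, post = x, x + rng.normal(1, 1, size=30)   # within-ID before/after

print(stats.ttest_1samp(x, popmean=12.5))       # one-sample
print(stats.ttest_ind(x, y, equal_var=False))   # two-sample, Welch
print(stats.ttest_rel(pre, post))               # paired
print(stats.f_oneway(x, y, z))                  # one-way ANOVA
print(stats.mannwhitneyu(x, y))                 # non-parametric, independent
print(stats.wilcoxon(post - pre))               # non-parametric, paired
print(stats.kruskal(x, y, z))                   # non-parametric, ≥3 groups
```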
%pip install -q pandas numpy scipy statsmodels seaborn
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multitest import multipletests
pd.set_option('display.max_columns', 50)
sns.set_theme()
print('Statistical testing environment ready.')
1) Load & Inspect
We load the dataset and check what we have. Typical columns: ID, Group (Control/Treatment), Vitamin_D (numeric), Time (e.g., 0/1), and possibly Outcome.
df = pd.read_csv('data/vitamin_trial.csv')
print('Shape:', df.shape)
display(df.head())
display(df.describe(include='all'))
2) Assumption Checks (lightweight)
Parametric tests (t-tests/ANOVA) typically assume roughly normal residuals and equal variances (for some tests). We inspect normality with Shapiro–Wilk and equality of variances with Levene.
Good practice
- Use visuals (see 4.1/4.2) first; formal tests can be overly sensitive with large n. A sketch of a quick visual check follows this list.
- If assumptions look shaky, lean on Welch t-test (default below) or non-parametrics.
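Following the visuals-first advice, here is a small sketch of a visual check (histogram plus normal Q-Q plot per group). It assumes the Vitamin_D and Group columns used throughout this notebook and the imports from the setup cell:

```python
# Visual checks before formal tests: histogram + normal Q-Q plot per group
if {'Vitamin_D', 'Group'}.issubset(df.columns):
    for g, sub in df.groupby('Group', observed=True):
        y = pd.to_numeric(sub['Vitamin_D'], errors='coerce').dropna()
        if len(y) < 3:
            continue
        fig, axes = plt.subplots(1, 2, figsize=(8, 3))
        sns.histplot(y, kde=True, ax=axes[0])
        axes[0].set_title(f'Group={g}: distribution')
        stats.probplot(y, dist='norm', plot=axes[1])  # points near the line suggest normality
        axes[1].set_title(f'Group={g}: Q-Q plot')
        plt.tight_layout()
        plt.show()
else:
    print('Columns Vitamin_D and/or Group not available for visual checks.')
```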
# Example: normality of Vitamin_D by Group (if present)
if {'Vitamin_D','Group'}.issubset(df.columns):
    for g, sub in df.groupby('Group', observed=True):
        y = pd.to_numeric(sub['Vitamin_D'], errors='coerce').dropna()
        if len(y) >= 3:
            stat, p = stats.shapiro(y)
            print(f'Shapiro–Wilk Vitamin_D in Group={g}: W={stat:.3f}, p={p:.3g}, n={len(y)}')
else:
    print('Columns Vitamin_D and/or Group not available for normality demo.')
# Example: equality of variances across groups with Levene (median-centered, robust-ish)
if {'Vitamin_D','Group'}.issubset(df.columns):
    groups = [pd.to_numeric(sub['Vitamin_D'], errors='coerce').dropna()
              for _, sub in df.groupby('Group', observed=True)]
    if len(groups) >= 2 and all(len(g) >= 2 for g in groups):
        stat, p = stats.levene(*groups, center='median')
        print(f'Levene (Vitamin_D ~ Group): W={stat:.3f}, p={p:.3g}')
3) Helper functions – effect sizes & CIs
We'll compute Cohen's d / Hedges' g, plus 95% CIs for mean differences using Welch's t framework. For paired designs, we compute effect sizes on the difference scores.
def cohens_d_independent(x, y):
    """Cohen's d for independent samples (pooled SD)."""
    x = pd.Series(x).dropna().astype(float)
    y = pd.Series(y).dropna().astype(float)
    nx, ny = len(x), len(y)
    if nx < 2 or ny < 2:
        return np.nan
    sx2, sy2 = x.var(ddof=1), y.var(ddof=1)
    sp = np.sqrt(((nx-1)*sx2 + (ny-1)*sy2) / (nx+ny-2))
    if sp == 0:
        return np.nan
    d = (x.mean() - y.mean()) / sp
    return d
def hedges_g(d, nx, ny):
    """Small-sample corrected Cohen's d (Hedges' g)."""
    # J correction factor
    df = nx + ny - 2
    if df <= 0 or np.isnan(d):
        return np.nan
    J = 1 - (3 / (4*df - 1))
    return d * J
def mean_diff_ci_welch(x, y, alpha=0.05):
    """95% CI for mean difference (x - y) using Welch's t."""
    x = pd.Series(x).dropna().astype(float)
    y = pd.Series(y).dropna().astype(float)
    nx, ny = len(x), len(y)
    mx, my = x.mean(), y.mean()
    vx, vy = x.var(ddof=1), y.var(ddof=1)
    if nx == 0 or ny == 0:
        return (np.nan, np.nan, np.nan)
    se = np.sqrt(vx/nx + vy/ny)
    # Welch–Satterthwaite degrees of freedom
    df_w = (vx/nx + vy/ny)**2 / ((vx**2)/((nx**2)*(nx-1)) + (vy**2)/((ny**2)*(ny-1))) if nx > 1 and ny > 1 else np.nan
    if se == 0 or not np.isfinite(df_w):
        return (mx - my, np.nan, np.nan)
    tcrit = stats.t.ppf(1 - alpha/2, df=df_w)
    lo, hi = (mx - my) - tcrit*se, (mx - my) + tcrit*se
    return (mx - my, lo, hi)
def cohens_d_paired(x_pre, x_post):
    """Cohen's d for paired samples: mean(diff) / sd(diff)."""
    pre = pd.Series(x_pre).astype(float)
    post = pd.Series(x_post).astype(float)
    # align by index (IDs)
    m = pre.notna() & post.notna()
    d = post[m] - pre[m]
    if len(d) < 2:
        return np.nan
    return d.mean() / d.std(ddof=1) if d.std(ddof=1) != 0 else np.nan
def mean_diff_ci_paired(x_pre, x_post, alpha=0.05):
    """95% CI for the mean of paired differences (post - pre)."""
    pre = pd.Series(x_pre).astype(float)
    post = pd.Series(x_post).astype(float)
    m = pre.notna() & post.notna()
    d = post[m] - pre[m]
    if len(d) < 2:
        return (np.nan, np.nan, np.nan)
    md = d.mean()
    se = d.std(ddof=1) / np.sqrt(len(d))
    tcrit = stats.t.ppf(1 - alpha/2, df=len(d)-1)
    return (md, md - tcrit*se, md + tcrit*se)
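Before applying the helpers to the trial data, a quick sanity check on simulated data with a known standardised effect is cheap insurance. The numbers below are simulated, not from vitamin_trial.csv:

```python
# Sanity check: with a true standardised difference of 0.5,
# the helpers should recover d near 0.5 and a CI excluding 0
rng = np.random.default_rng(1)
a = rng.normal(0.0, 1.0, size=200)
b = rng.normal(0.5, 1.0, size=200)
d = cohens_d_independent(b, a)
g = hedges_g(d, len(b), len(a))
md, lo, hi = mean_diff_ci_welch(b, a)
print(f'd={d:.2f}, g={g:.2f}, mean_diff={md:.2f} [{lo:.2f}, {hi:.2f}]')
```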
4) One-sample tests
Question: Is mean Vitamin D (overall) different from a reference value (e.g., 12.5 µg)?
Notes
- Parametric: one-sample t-test on the mean.
- Non-parametric: sign test or Wilcoxon signed-rank against a hypothesised median; scipy's wilcoxon can be applied to the shifted values (y - mu0), as sketched after the next cell. We stick with t in the cell below for brevity.
if 'Vitamin_D' in df.columns:
    y = pd.to_numeric(df['Vitamin_D'], errors='coerce').dropna()
    mu0 = 12.5  # reference mean (change to your scientific target)
    tstat, pval = stats.ttest_1samp(y, popmean=mu0)
    print(f'One-sample t-test: H0: mean={mu0}')
    print(f'  t={tstat:.3f}, p={pval:.3g}, n={len(y)}; mean={y.mean():.3f}')
else:
    print('Vitamin_D not found for one-sample test.')
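As noted above, the signed-rank test can serve as the non-parametric one-sample alternative by shifting the data by the hypothesised value. A minimal sketch (it assumes the differences are roughly symmetric, which Wilcoxon requires):

```python
# Non-parametric one-sample alternative: Wilcoxon signed-rank on (y - mu0)
if 'Vitamin_D' in df.columns:
    y = pd.to_numeric(df['Vitamin_D'], errors='coerce').dropna()
    mu0 = 12.5  # same reference value as above
    w, p_w = stats.wilcoxon(y - mu0)
    print(f'Wilcoxon signed-rank vs {mu0}: W={w:.1f}, p={p_w:.3g}; median={y.median():.3f}')
```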
5) Two-sample (independent) tests
Question: Do Control vs Treatment differ in mean Vitamin D?
- Parametric: Welch's t-test (robust to unequal variances; default here).
- Non-parametric: Mann–Whitney U.
- Report effect sizes + 95% CI for mean difference.
if {'Vitamin_D','Group'}.issubset(df.columns):
    g_levels = df['Group'].dropna().unique().tolist()
    if len(g_levels) == 2:
        g1, g2 = sorted(g_levels)
        x = pd.to_numeric(df.loc[df['Group']==g1, 'Vitamin_D'], errors='coerce').dropna()
        y = pd.to_numeric(df.loc[df['Group']==g2, 'Vitamin_D'], errors='coerce').dropna()
        # Welch t-test
        t, p = stats.ttest_ind(x, y, equal_var=False)
        d = cohens_d_independent(x, y)
        g = hedges_g(d, len(x), len(y))
        md, lo, hi = mean_diff_ci_welch(x, y)
        print(f'Welch t-test ({g1} vs {g2}): t={t:.3f}, p={p:.3g}')
        print(f'  mean_diff={md:.3f} [{lo:.3f}, {hi:.3f}] (x - y)')
        print(f'  Cohen d={d:.3f}, Hedges g={g:.3f}')
        # Mann–Whitney U
        u, p_u = stats.mannwhitneyu(x, y, alternative='two-sided')
        print(f'Mann–Whitney U: U={u:.1f}, p={p_u:.3g}')
    else:
        print('Two-sample demo expects exactly 2 groups in Group.')
else:
    print('Need Vitamin_D and Group columns for two-sample tests.')
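The Mann–Whitney result above is reported without an effect size. A common rank-based companion (not part of the original notebook) is the rank-biserial correlation; the sketch below uses one common convention, r = 2·U1/(n_x·n_y) − 1, where U1 is the U statistic for the first sample:

```python
# Rank-biserial effect size to accompany Mann-Whitney U
# (convention used here: r = 2*U1/(n_x*n_y) - 1; positive r means x tends larger)
def rank_biserial(x, y):
    x = pd.Series(x).dropna().astype(float)
    y = pd.Series(y).dropna().astype(float)
    if len(x) < 2 or len(y) < 2:
        return np.nan
    u1, _ = stats.mannwhitneyu(x, y, alternative='two-sided')  # U for the first sample
    return 2 * u1 / (len(x) * len(y)) - 1

if {'Vitamin_D', 'Group'}.issubset(df.columns) and df['Group'].nunique() == 2:
    g1, g2 = sorted(df['Group'].dropna().unique())
    x = pd.to_numeric(df.loc[df['Group'] == g1, 'Vitamin_D'], errors='coerce').dropna()
    y = pd.to_numeric(df.loc[df['Group'] == g2, 'Vitamin_D'], errors='coerce').dropna()
    print(f'Rank-biserial r ({g1} vs {g2}): {rank_biserial(x, y):.3f}')
```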
6) Paired tests (within-ID before/after)
If your dataset has repeated measurements per ID (e.g., Time=0 and Time=1), use paired tests:
- Parametric: paired t-test on the difference.
- Non-parametric: Wilcoxon signed-rank on the difference.
We'll try to form pairs from Time==0 vs Time==1 within each ID for Vitamin_D. If not available, we skip gracefully.
if {'ID','Time','Vitamin_D'}.issubset(df.columns):
    # Wide-style pairing: each row is an ID×Time; pivot to columns Time0, Time1
    tmp = df[['ID','Time','Vitamin_D']].dropna()
    # Only keep IDs that have BOTH time points 0 and 1
    have_both = tmp.groupby('ID')['Time'].nunique() >= 2
    ids = have_both[have_both].index
    sub = tmp[tmp['ID'].isin(ids)].copy()
    wide = sub.pivot_table(index='ID', columns='Time', values='Vitamin_D', aggfunc='mean')
    if 0 in wide.columns and 1 in wide.columns:
        pre = wide[0]; post = wide[1]
        # Paired t-test (note: ttest_rel(pre, post) tests pre - post,
        # so its t has the opposite sign to mean_diff = post - pre below)
        m = pre.notna() & post.notna()
        if m.sum() >= 2:
            t, p = stats.ttest_rel(pre[m], post[m])
            d_pair = cohens_d_paired(pre[m], post[m])
            md, lo, hi = mean_diff_ci_paired(pre[m], post[m])
            print(f'Paired t-test (Time 0 vs 1): t={t:.3f}, p={p:.3g}, n={m.sum()}')
            print(f'  mean_diff={md:.3f} [{lo:.3f}, {hi:.3f}] (post - pre)')
            print(f'  Cohen d (paired)={d_pair:.3f}')
            # Wilcoxon signed-rank
            w, p_w = stats.wilcoxon(post[m] - pre[m])
            print(f'Wilcoxon signed-rank: W={w:.1f}, p={p_w:.3g}')
        else:
            print('Not enough paired (non-missing) observations for paired tests.')
    else:
        print('Could not find both Time=0 and Time=1 columns after pivot.')
else:
    print('Need ID, Time, Vitamin_D for paired tests.')
7) ≥3 groups – ANOVA and Kruskal–Wallis
Compare means across three or more groups:
- Parametric: one-way ANOVA (we'll use OLS via statsmodels and a Type II ANOVA table).
- Non-parametric: Kruskal–Wallis (rank-based).
For post-hoc, we'll do pairwise tests with multiplicity control.
Post-hoc choices
- Parametric: Tukey HSD is classic (assumes equal variances / balanced design). If the design is unbalanced or heteroscedastic, many prefer pairwise Welch t with Holm correction (sketched after the cell below).
- Non-parametric: pairwise Mann–Whitney U with Holm correction is a pragmatic alternative (Dunn's test is another option but needs extra packages).
if {'Vitamin_D','Group'}.issubset(df.columns):
    # Require ≥3 groups with some data
    counts = df.groupby('Group', observed=True)['Vitamin_D'].apply(
        lambda s: pd.to_numeric(s, errors='coerce').notna().sum())
    valid_groups = counts[counts >= 2].index.tolist()
    if len(valid_groups) >= 3:
        dfa = df[df['Group'].isin(valid_groups)].copy()
        dfa['Vitamin_D'] = pd.to_numeric(dfa['Vitamin_D'], errors='coerce')
        dfa = dfa.dropna(subset=['Vitamin_D'])
        # OLS + ANOVA table
        model = smf.ols('Vitamin_D ~ C(Group)', data=dfa).fit()
        aov = sm.stats.anova_lm(model, typ=2)
        print('One-way ANOVA (Type II):')
        display(aov)
        # Tukey HSD (if desired)
        try:
            tuk = pairwise_tukeyhsd(endog=dfa['Vitamin_D'], groups=dfa['Group'], alpha=0.05)
            print('\nTukey HSD (parametric, equal-variance assumption):')
            display(pd.DataFrame(tuk.summary().data[1:], columns=tuk.summary().data[0]))
        except Exception as e:
            print('Tukey HSD not available:', e)
        # Kruskal–Wallis
        arrays = [pd.to_numeric(dfa.loc[dfa['Group']==g, 'Vitamin_D'], errors='coerce').dropna()
                  for g in valid_groups]
        H, pH = stats.kruskal(*arrays)
        print(f'\nKruskal–Wallis: H={H:.3f}, p={pH:.3g}')
        # Pairwise Mann–Whitney with Holm correction
        pairs, pvals = [], []
        for i in range(len(valid_groups)):
            for j in range(i+1, len(valid_groups)):
                g1, g2 = valid_groups[i], valid_groups[j]
                x = pd.to_numeric(dfa.loc[dfa['Group']==g1, 'Vitamin_D'], errors='coerce').dropna()
                y = pd.to_numeric(dfa.loc[dfa['Group']==g2, 'Vitamin_D'], errors='coerce').dropna()
                if len(x) >= 2 and len(y) >= 2:
                    u, p = stats.mannwhitneyu(x, y, alternative='two-sided')
                    pairs.append((g1, g2)); pvals.append(p)
        if pvals:
            rej, p_adj, _, _ = multipletests(pvals, alpha=0.05, method='holm')
            out = pd.DataFrame({'pair': pairs, 'p_raw': pvals, 'p_holm': p_adj, 'reject_0.05': rej})
            print('\nPairwise Mann–Whitney U with Holm correction:')
            display(out.sort_values('p_holm'))
    else:
        print('Need ≥3 groups with data for ANOVA/Kruskal–Wallis.')
else:
    print('Need Vitamin_D and Group for ANOVA/Kruskal–Wallis.')
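The "Post-hoc choices" note mentions pairwise Welch t-tests with Holm correction as the parametric option for unbalanced or heteroscedastic designs. Here is a minimal sketch; it reuses dfa and valid_groups from the cell above, so run that cell first:

```python
# Parametric post-hoc without the equal-variance assumption:
# pairwise Welch t-tests with Holm adjustment (reuses dfa/valid_groups from above)
if 'dfa' in globals() and len(valid_groups) >= 3:
    pairs, pvals = [], []
    for i in range(len(valid_groups)):
        for j in range(i + 1, len(valid_groups)):
            g1, g2 = valid_groups[i], valid_groups[j]
            x = dfa.loc[dfa['Group'] == g1, 'Vitamin_D']
            y = dfa.loc[dfa['Group'] == g2, 'Vitamin_D']
            t, p = stats.ttest_ind(x, y, equal_var=False)  # Welch per pair
            pairs.append((g1, g2)); pvals.append(p)
    if pvals:
        rej, p_adj, _, _ = multipletests(pvals, alpha=0.05, method='holm')
        out = pd.DataFrame({'pair': pairs, 'p_raw': pvals, 'p_holm': p_adj, 'reject_0.05': rej})
        display(out.sort_values('p_holm'))
else:
    print('Run the ANOVA cell above first (needs dfa and ≥3 valid groups).')
```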
8) Reporting results
When reporting, include:
- The test used (and why),
- The estimate (mean difference or effect size) with 95% CI,
- The test statistic and p-value,
- Any assumption checks or transformations,
- How multiple testing was handled (if applicable).
Template sentences
- "Mean Vitamin D differed between Treatment and Control (Welch's t(≈df)=t, p=p). The mean difference was md [lo, hi] µg; Cohen's d = d (Hedges' g = g)."
- "Across the three groups, ANOVA indicated a difference in means (F(df1, df2)=F, p=p). Tukey HSD post-hoc found X>Y (p_adj=…)."
- "Assumptions appeared [reasonable/not met]; therefore we used [Welch/non-parametric] tests."
A small helper below fills the first template programmatically.
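To keep wording consistent across analyses, a formatting helper like the following can fill the first template. The function and the numbers in the demo call are purely illustrative, not outputs from this dataset:

```python
# Hypothetical helper that fills the Welch t-test template sentence;
# the demo numbers are placeholders, NOT results from vitamin_trial.csv
def report_welch(g1, g2, t, df_w, p, md, lo, hi, d, g, unit='µg'):
    return (f'Mean Vitamin D differed between {g1} and {g2} '
            f'(Welch t({df_w:.1f})={t:.2f}, p={p:.3g}). '
            f'The mean difference was {md:.2f} [{lo:.2f}, {hi:.2f}] {unit}; '
            f"Cohen's d = {d:.2f} (Hedges' g = {g:.2f}).")

print(report_welch('Treatment', 'Control', 2.31, 47.8, 0.025, 1.80, 0.23, 3.37, 0.55, 0.54))
```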
🧪 Exercises
- Set a one-sample target
  - Choose a clinically meaningful Vitamin_D target (e.g., 12.5 µg).
  - Test whether the overall mean differs; report the mean ± 95% CI.
- Two-sample with assumptions
  - Compare groups (Control vs Treatment).
  - Show Welch t-test and Mann–Whitney; report d, g, mean difference CI.
  - State which you'd trust if variances differ or distributions are skewed.
- Paired change
  - If Time=0/1 exist per ID, test within-ID change (paired t and Wilcoxon).
  - Report mean change ± 95% CI and Cohen's d (paired).
- ≥3 groups
  - Run one-way ANOVA and Kruskal–Wallis across Group levels.
  - Perform post-hoc comparisons with multiplicity control (Tukey or Holm-adjusted pairwise tests).
  - Which groups differ meaningfully? Back up with CIs.
- Multiplicity
  - If you compare several endpoints (or many pairs), apply Holm or FDR to control false positives.
  - Explain your choice briefly (a minimal Holm vs FDR sketch follows).
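For the multiplicity exercise, switching between Holm and Benjamini–Hochberg is a one-argument change in multipletests. A minimal sketch on placeholder p-values (invented for illustration):

```python
# Holm vs Benjamini-Hochberg (FDR) on the same placeholder p-values
pvals = [0.001, 0.012, 0.030, 0.045, 0.200]   # e.g., from several endpoints
for method in ('holm', 'fdr_bh'):
    rej, p_adj, _, _ = multipletests(pvals, alpha=0.05, method=method)
    print(f'{method}: adjusted={np.round(p_adj, 3)}, reject={rej}')
```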
✅ Conclusion
You selected appropriate tests, verified assumptions, used non-parametric alternatives when needed, and reported both p-values and effect sizes with CIs. These practices make your findings more robust and interpretable.
Further reading
- SciPy Stats: https://docs.scipy.org/doc/scipy/reference/stats.html
- Statsmodels ANOVA & OLS: https://www.statsmodels.org/
- Multiple testing: https://www.statsmodels.org/stable/generated/statsmodels.stats.multitest.multipletests.html
- Effect sizes (overview): many concise guides are available; pick one you like and stick to a consistent reporting style.