# Setup for Google Colab: Fetch datasets automatically or manually
import os
from google.colab import files
MODULE = '04_data_analysis'
DATASET = 'vitamin_trial.csv'
BASE_PATH = '/content/data-analysis-projects'
MODULE_PATH = os.path.join(BASE_PATH, 'notebooks', MODULE)
DATASET_PATH = os.path.join('data', DATASET)

try:
    print('Attempting to clone repository...')
    if not os.path.exists(BASE_PATH):
        !git clone https://github.com/ggkuhnle/data-analysis-projects.git
    print('Setting working directory...')
    os.chdir(MODULE_PATH)
    if os.path.exists(DATASET_PATH):
        print(f'Dataset found: {DATASET_PATH} ✅')
    else:
        raise FileNotFoundError('Dataset missing after clone.')
except Exception as e:
    print(f'Cloning failed: {e}')
    print('Falling back to manual upload...')
    os.makedirs('data', exist_ok=True)
    uploaded = files.upload()
    if DATASET in uploaded:
        with open(DATASET_PATH, 'wb') as f:
            f.write(uploaded[DATASET])
        print(f'Successfully uploaded {DATASET} ✅')
    else:
        raise FileNotFoundError(f'Upload failed. Please upload {DATASET}.')
📈 4.3 Correlation & Association
In this notebook we quantify associations in vitamin_trial.csv. You’ll compute classic correlations (Pearson, Spearman, Kendall), handle binary vs numeric variables with the point-biserial correlation, and add bootstrap confidence intervals, a permutation test, a simple partial correlation (controlling for a categorical factor such as Group), and non-linear association metrics (Mutual Information, Distance Correlation).
🎯 Objectives
- Compute Pearson (linear), Spearman (rank/monotonic), Kendall (rank/robust) correlations.
- Use point-biserial for numeric ↔︎ binary associations.
- Compare correlations with and without log transforms (when skewed).
- Quantify uncertainty with bootstrap CIs and significance with a permutation test.
- Estimate a partial correlation by controlling for a categorical factor.
- Assess non-linear association via Mutual Information and Distance Correlation.
👉 Visual overviews (pair plots, heatmaps) live in 4.2 EDA. Normality/log-shape diagnostics live in 4.1 Distributions.
Friendly reminders
- Correlation ≠ causation: confounding and directionality matter.
- Linearity & outliers: Pearson assumes linearity and is outlier-sensitive; Spearman/Kendall are more robust to outliers and monotonic nonlinearity.
- Missingness: correlations silently drop NA pairs; patterned missingness can bias results (see the tiny demo below).
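A tiny illustration of that last point on made-up numbers (toy data, not the trial dataset): pandas’ .corr() quietly uses only complete pairs, so patterned missingness changes which rows it sees.

import pandas as pd
import numpy as np

# Toy frame: 'b' is missing exactly where 'a' is largest (patterned missingness).
toy = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0, 5.0],
                    'b': [2.1, 3.9, 6.2, np.nan, np.nan]})
print(toy.corr())                                    # computed from complete pairs only
print('pairs used:', toy.dropna().shape[0], 'of', len(toy))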
%pip install -q pandas numpy seaborn scipy statsmodels scikit-learn
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr, kendalltau, pointbiserialr
import statsmodels.api as sm
from sklearn.feature_selection import mutual_info_regression
pd.set_option('display.max_columns', 50)
sns.set_theme()
print('Correlation environment ready.')
1) Load & Select Variables
We’ll load the dataset, identify numeric columns, and prepare a clean subset for correlation calculations.
df = pd.read_csv('data/vitamin_trial.csv')
print('Shape:', df.shape)
display(df.head())

# Identify numeric columns (we will use these for correlation matrices if needed)
num = df.select_dtypes(include=[np.number]).copy()
print('Numeric columns:', num.columns.tolist())

# For the demos below, we use Vitamin_D and Time if present
has_vit = 'Vitamin_D' in df.columns
has_time = 'Time' in df.columns and np.issubdtype(df['Time'].dtype, np.number)
2) Core correlations (pairwise)
We start with two variables (e.g., Vitamin_D and Time) and compute Pearson, Spearman, and Kendall.
def corr_all(x, y):
    x, y = pd.Series(x).astype(float), pd.Series(y).astype(float)
    m = x.notna() & y.notna()
    x, y = x[m], y[m]
    if len(x) < 2:
        return {'pearson_r': np.nan, 'pearson_p': np.nan,
                'spearman_r': np.nan, 'spearman_p': np.nan,
                'kendall_tau': np.nan, 'kendall_p': np.nan, 'n': len(x)}
    r_p, p_p = pearsonr(x, y)
    r_s, p_s = spearmanr(x, y)
    r_k, p_k = kendalltau(x, y)
    return {'pearson_r': r_p, 'pearson_p': p_p,
            'spearman_r': r_s, 'spearman_p': p_s,
            'kendall_tau': r_k, 'kendall_p': p_k, 'n': len(x)}

if has_vit and has_time:
    res = corr_all(df['Vitamin_D'], df['Time'])
    print('Vitamin_D vs Time:')
    for k, v in res.items():
        print(f'  {k}: {v:.4g}' if isinstance(v, (float, np.floating)) else f'  {k}: {v}')
else:
    print('Need a numeric Time column and Vitamin_D for the pairwise demo.')
2.1 Sensitivity to log-transform (if skewed)
If Vitamin_D is right-skewed (see 4.1), a log transform may clarify linear associations. We compare Pearson on raw vs log(Vitamin_D).
if has_vit and has_time:
    eps = 1e-6
    m = df['Vitamin_D'].notna() & df['Time'].notna()  # scipy's pearsonr does not drop NAs itself
    x, t = df.loc[m, 'Vitamin_D'].astype(float), df.loc[m, 'Time'].astype(float)
    r_raw, p_raw = pearsonr(x, t)
    r_log, p_log = pearsonr(np.log(x + eps), t)
    print(f'Pearson (raw): r={r_raw:.3f}, p={p_raw:.2e}')
    print(f'Pearson (log Vitamin_D): r={r_log:.3f}, p={p_log:.2e}')
else:
    print('Skipping log sensitivity: need Vitamin_D and numeric Time.')
3) Numeric ↔︎ Binary: Point-biserial correlation
If you have a binary categorical variable (e.g., Outcome with two levels, or a yes/no flag), use point-biserial to measure its association with a numeric variable (e.g., Vitamin_D). Point-biserial is simply Pearson applied to a 0/1 coding, as the check after the cell shows.
# Try Outcome (binary) vs Vitamin_D
if has_vit and 'Outcome' in df.columns:
    levels = pd.Series(df['Outcome']).dropna().unique().tolist()
    if len(levels) == 2:
        levels_sorted = sorted(levels)
        bin_map = {levels_sorted[0]: 0, levels_sorted[1]: 1}
        y_bin = pd.Series(df['Outcome']).map(bin_map)
        m = y_bin.notna() & pd.Series(df['Vitamin_D']).notna()
        r, p = pointbiserialr(y_bin[m].astype(int), df.loc[m, 'Vitamin_D'].astype(float))
        print(f'Point-biserial (Outcome vs Vitamin_D): r={r:.3f}, p={p:.2e}')
        print(f'Outcome mapping: {levels_sorted[0]}→0, {levels_sorted[1]}→1')
    else:
        print('Outcome is not binary here; point-biserial skipped.')
else:
    print('Need Vitamin_D and Outcome for point-biserial.')
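A quick check of the equivalence claimed above, on synthetic data (the variables g and v are made up for the demo):

# Equivalence check: pointbiserialr == pearsonr on a 0/1 coded variable.
rng = np.random.default_rng(0)
g = rng.integers(0, 2, size=100)              # synthetic binary group (0/1)
v = 50 + 5 * g + rng.normal(0, 3, size=100)   # synthetic numeric variable, higher in group 1
r_pb, p_pb = pointbiserialr(g, v)
r_pe, p_pe = pearsonr(g, v)
print(f'point-biserial r={r_pb:.4f}, Pearson r={r_pe:.4f}')  # identical up to floating point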
4) Bootstrap confidence interval for a correlation
We’ll estimate a 95% CI for Pearson r via nonparametric bootstrap (resampling rows with replacement). This is useful when analytic CIs are awkward or assumptions are questionable.
What you get
- A distribution of r-values across bootstrap resamples.
- Percentile CI (e.g., 2.5th and 97.5th percentiles).
def bootstrap_corr(x, y, B=2000, seed=42):
    rng = np.random.default_rng(seed)
    x, y = pd.Series(x).astype(float), pd.Series(y).astype(float)
    m = x.notna() & y.notna()
    x, y = x[m].values, y[m].values
    n = len(x)
    if n < 2:
        return np.array([])
    out = np.empty(B, dtype=float)
    for b in range(B):
        idx = rng.integers(0, n, size=n)  # resample row indices with replacement
        out[b] = pearsonr(x[idx], y[idx])[0]
    return out

if has_vit and has_time:
    boots = bootstrap_corr(df['Vitamin_D'], df['Time'], B=2000, seed=1)
    if boots.size:
        lo, hi = np.percentile(boots, [2.5, 97.5])
        print(f'Bootstrap Pearson r 95% CI: [{lo:.3f}, {hi:.3f}]')
        plt.figure(figsize=(6, 3))
        sns.histplot(boots, bins=30, kde=True)
        plt.title('Bootstrap distribution of r (Vitamin_D vs Time)')
        plt.xlabel('Pearson r')
        plt.tight_layout()
        plt.show()
else:
    print('Skipping bootstrap CI: need Vitamin_D and numeric Time.')
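For comparison with the percentile interval, the classic analytic CI uses the Fisher z-transform (reasonable for roughly bivariate-normal data); a minimal sketch on the same pair, dropping incomplete pairs first:

# Analytic 95% CI via Fisher z: z = atanh(r), SE = 1/sqrt(n - 3)
if has_vit and has_time:
    m = df['Vitamin_D'].notna() & df['Time'].notna()
    n = int(m.sum())
    if n > 3:
        r, _ = pearsonr(df.loc[m, 'Vitamin_D'].astype(float), df.loc[m, 'Time'].astype(float))
        z, se = np.arctanh(r), 1 / np.sqrt(n - 3)
        lo_z, hi_z = z - 1.96 * se, z + 1.96 * se
        print(f'Fisher z 95% CI: [{np.tanh(lo_z):.3f}, {np.tanh(hi_z):.3f}] (n={n})')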
5) Permutation test for correlation (non-parametric p-value)
Under the null hypothesis of no association, the observed correlation should look like the correlations between x and randomly permuted copies of y. We compute a permutation p-value as the fraction of permuted |r| values at least as large as the observed |r| (formula below).
When is this useful?
- When the parametric assumptions behind Pearson p-values are doubtful.
- As a quick, assumption-light check (though computationally heavier).
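In symbols, with B permutations and add-one smoothing (the observed arrangement counts as one permutation, so the estimate is never exactly zero):

$$\hat{p} = \frac{1 + \#\{\, b : |r_b| \ge |r_{\mathrm{obs}}| \,\}}{B + 1}$$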
def perm_test_corr(x, y, B=5000, seed=123):
    rng = np.random.default_rng(seed)
    x, y = pd.Series(x).astype(float), pd.Series(y).astype(float)
    m = x.notna() & y.notna()
    x, y = x[m].values, y[m].values
    if len(x) < 2:
        return np.nan, np.nan
    r_obs = pearsonr(x, y)[0]
    cnt = 0
    for b in range(B):
        y_perm = rng.permutation(y)
        r_b = pearsonr(x, y_perm)[0]
        if abs(r_b) >= abs(r_obs):
            cnt += 1
    pval = (cnt + 1) / (B + 1)  # add-one smoothing
    return r_obs, pval

if has_vit and has_time:
    r_obs, p_perm = perm_test_corr(df['Vitamin_D'], df['Time'], B=2000, seed=2)
    print(f'Permutation test: r_obs={r_obs:.3f}, p_perm={p_perm:.4f}')
else:
    print('Skipping permutation test: need Vitamin_D and numeric Time.')
6) Partial correlation (control for a categorical confounder)
We estimate the association between Vitamin_D and Time controlling for Group by residualising both variables on Group (dummy-coded) and then correlating the residuals (see the formula sketch after the notes).
Notes
- This is a simple, common approach; for more complex designs, consider regression or SEM.
- If Time is not present or Group is missing, we skip this step.
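For a single numeric covariate $z$, the residual approach reproduces the textbook partial-correlation formula; dummy-coding Group simply extends $z$ to several indicator columns:

$$r_{xy \cdot z} = \frac{r_{xy} - r_{xz}\, r_{yz}}{\sqrt{(1 - r_{xz}^{2})(1 - r_{yz}^{2})}}$$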
def residualize(y, X):
    X_ = sm.add_constant(X, has_constant='add')
    model = sm.OLS(y, X_, missing='drop')
    res = model.fit()
    resid = pd.Series(np.nan, index=y.index)
    resid.loc[res.fittedvalues.index] = y.loc[res.fittedvalues.index] - res.fittedvalues
    return resid

if has_vit and has_time and 'Group' in df.columns:
    G = pd.get_dummies(df['Group'], drop_first=True).astype(float)  # float dummies keep OLS happy
    y1 = pd.to_numeric(df['Vitamin_D'], errors='coerce')
    y2 = pd.to_numeric(df['Time'], errors='coerce')
    r1 = residualize(y1, G)
    r2 = residualize(y2, G)
    m = r1.notna() & r2.notna()
    if m.sum() > 1:
        r_pc, p_pc = pearsonr(r1[m], r2[m])
        print(f'Partial correlation (Vitamin_D ~ Time | Group dummies): r={r_pc:.3f}, p={p_pc:.2e}, n={m.sum()}')
    else:
        print('Not enough paired residuals to compute partial correlation.')
else:
    print('Need Vitamin_D, Time, and Group for partial correlation.')
7) Non-linear Association
Linear correlations miss curved or heteroscedastic relationships. Two widely used measures that capture general dependence are:
- Mutual Information (MI): information shared between variables (0 = independent).
- Distance Correlation (dCor): 0 if and only if the variables are independent (given finite variance), sensitive to many forms of nonlinearity.
We compute MI with scikit-learn (k-NN estimator) and implement a small, self-contained dCor, adding permutation p-values for both; a synthetic sanity check follows the demo.
from sklearn.utils import check_random_state

def mutual_info_xy(x, y, n_neighbors=3, seed=42):
    """Mutual information (continuous-continuous) using k-NN estimator (scikit-learn)."""
    x = pd.Series(x).astype(float)
    y = pd.Series(y).astype(float)
    m = x.notna() & y.notna()
    X = x[m].to_numpy().reshape(-1, 1)
    Y = y[m].to_numpy()
    if len(X) < 5:
        return np.nan
    mi = mutual_info_regression(X, Y, n_neighbors=n_neighbors, random_state=check_random_state(seed))
    return float(mi[0])

def _double_center(D):
    """Double-center a distance matrix."""
    row_mean = D.mean(axis=1, keepdims=True)
    col_mean = D.mean(axis=0, keepdims=True)
    grand = D.mean()
    return D - row_mean - col_mean + grand

def distance_correlation(x, y):
    """Distance correlation (biased version, adequate for EDA)."""
    x = pd.Series(x).astype(float)
    y = pd.Series(y).astype(float)
    m = x.notna() & y.notna()
    x = x[m].to_numpy()
    y = y[m].to_numpy()
    if x.size < 2:
        return np.nan
    X = np.abs(x[:, None] - x[None, :])
    Y = np.abs(y[:, None] - y[None, :])
    A = _double_center(X)
    B = _double_center(Y)
    dcov2 = (A * B).mean()
    dvarx = (A * A).mean()
    dvary = (B * B).mean()
    if dvarx <= 0 or dvary <= 0:
        return 0.0
    return float(np.sqrt(max(dcov2, 0)) / np.sqrt(np.sqrt(dvarx * dvary)))

def perm_pvalue_stat(x, y, stat_fn, B=2000, seed=123):
    rng = np.random.default_rng(seed)
    x = pd.Series(x).astype(float)
    y = pd.Series(y).astype(float)
    m = x.notna() & y.notna()
    x = x[m].to_numpy()
    y = y[m].to_numpy()
    if len(x) < 5:
        return np.nan, np.nan
    s_obs = stat_fn(x, y)
    cnt = 0
    for b in range(B):
        y_perm = rng.permutation(y)
        s_b = stat_fn(x, y_perm)
        if s_b >= s_obs:  # right-tailed (both MI and dCor are >= 0)
            cnt += 1
    pval = (cnt + 1) / (B + 1)
    return s_obs, pval

if has_vit and has_time:
    mi, p_mi = perm_pvalue_stat(df['Vitamin_D'], df['Time'],
                                lambda a, b: mutual_info_xy(a, b, n_neighbors=5, seed=1),
                                B=1000, seed=2)
    dcor, p_dc = perm_pvalue_stat(df['Vitamin_D'], df['Time'], distance_correlation, B=1000, seed=3)
    print(f'Mutual Information (kNN): MI={mi:.4f}, perm p={p_mi:.4f}')
    print(f'Distance Correlation: dCor={dcor:.4f}, perm p={p_dc:.4f}')
else:
    print('Need Vitamin_D and numeric Time for non-linear measures.')
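As a sanity check that these measures capture what Pearson misses, here is a small synthetic demo (made-up data, not from the trial): a noiseless parabola is strongly dependent but linearly uncorrelated.

# Synthetic check: y = x^2 gives Pearson r ≈ 0 but clearly non-zero MI and dCor.
rng = np.random.default_rng(0)
x_demo = rng.uniform(-1, 1, size=300)
y_demo = x_demo ** 2
print(f'Pearson r: {pearsonr(x_demo, y_demo)[0]:.3f}')           # ≈ 0
print(f'MI (kNN):  {mutual_info_xy(x_demo, y_demo):.3f}')        # > 0
print(f'dCor:      {distance_correlation(x_demo, y_demo):.3f}')  # > 0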
8) (Optional) Tiny correlation heatmap
Use 4.2 for full heatmaps/pairplots. Here’s a tiny demo with the numeric columns (plus log(Vitamin_D) if present).
num2 = df.select_dtypes(include=[np.number]).copy()
if 'Vitamin_D' in num2.columns:
    num2['Vitamin_D_log'] = np.log(num2['Vitamin_D'] + 1e-6)
if num2.shape[1] >= 2:
    corr_pear = num2.corr(numeric_only=True, method='pearson')
    plt.figure(figsize=(5, 4))
    sns.heatmap(corr_pear, annot=True, fmt='.2f', vmin=-1, vmax=1, cmap='coolwarm')
    plt.title('Pearson correlations (tiny demo)')
    plt.tight_layout()
    plt.show()
else:
    print('Not enough numeric variables for a small heatmap. See 4.2 for richer visuals.')
🧪 Exercises
- Pearson vs Spearman vs Kendall
  - Pick two numeric variables (e.g., Vitamin_D and Time).
  - Compute all three correlations. Which is largest? Are the p-values consistent?
- Point-biserial
  - If you have a binary subgroup (e.g., a dichotomised Outcome), compute the point-biserial correlation with Vitamin_D.
  - Interpret the sign (which group tends to be higher?).
- Bootstrap CI + Permutation test
  - Build a bootstrap CI for Pearson r between two variables of your choice.
  - Run a permutation test; compare its p-value with pearsonr’s parametric p-value.
- Partial correlation
  - Control for Group when relating Vitamin_D and Time.
  - Does controlling for Group change the strength noticeably?
- Non-linear
  - Compute MI and dCor for Vitamin_D vs Time (or another numeric pair).
  - Use permutation p-values; compare to the Pearson/Spearman conclusions.
🔗 Need distribution checks and log reasoning? See 4.1.
🔗 Need broad visual overviews (pair plots/heatmaps)? See 4.2.
✅ Conclusion
You quantified associations via Pearson/Spearman/Kendall, handled binary–numeric with point-biserial, and added uncertainty (bootstrap CI), a non-parametric significance check (permutation), a simple partial correlation, and nonlinear association metrics (MI, dCor). You’re ready to proceed to inferential comparisons and models.
Further reading
- SciPy stats: https://docs.scipy.org/doc/scipy/reference/stats.html
- Statsmodels regression & diagnostics: https://www.statsmodels.org/
- scikit-learn mutual information: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_regression.html
- Distance correlation (Székely & Rizzo): overview articles and tutorials