import os
from google.colab import files

MODULE = '06_qualitative'
DATASET = 'food_preferences.txt'
BASE_PATH = '/content/data-analysis-projects'
MODULE_PATH = os.path.join(BASE_PATH, 'notebooks', MODULE)
DATASET_PATH = os.path.join(MODULE_PATH, 'data', DATASET)

try:
    if not os.path.exists(BASE_PATH):
        print('Cloning repository...')
        !git clone https://github.com/ggkuhnle/data-analysis-projects.git
    os.chdir(MODULE_PATH)
    if not os.path.exists(DATASET_PATH):
        raise FileNotFoundError('Dataset missing after clone.')
    print('Dataset ready ✅')
except Exception as e:
    print('Setup fallback: upload file...')
    os.makedirs('data', exist_ok=True)
    uploaded = files.upload()
    if DATASET in uploaded:
        with open(os.path.join('data', DATASET), 'wb') as f:
            f.write(uploaded[DATASET])
        print('Uploaded dataset ✅')
    else:
        raise FileNotFoundError('Upload food_preferences.txt to continue.')
🎛️ 6.3 Thematic Coding & Reliability
Build a transparent coding workflow: define a codebook, code excerpts, review co-occurrence patterns, and (optionally) compute inter-coder reliability.
We’ll start from the qual_coding_sheet.csv exported in 6.2, but you can also paste your own.
🎯 Objectives
- Create and iterate a codebook (labels, definitions, examples).
- Populate codes for each response (single or multi-label).
- Summarise code frequencies and co-occurrence.
- Calculate Cohen’s κ for two coders on a subset.
- Map codes → themes and export a brief thematic summary.
%pip install -q pandas numpy scikit-learn seaborn matplotlib networkx
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt, networkx as nx
from sklearn.metrics import cohen_kappa_score
sns.set_theme()
print('Environment ready.')
📥 Load coding sheet
If you haven’t created one, run 6.2 to export qual_coding_sheet.csv.
from pathlib import Path

sheet = Path('qual_coding_sheet.csv')
if not sheet.exists():
    # Minimal fallback: create from raw file
    raw = Path('data') / 'food_preferences.txt'
    responses = [r.strip() for r in raw.read_text(encoding='utf-8').splitlines() if r.strip()]
    df = pd.DataFrame({'response_id': range(1, len(responses) + 1), 'text': responses})
    df['initial_code'] = ''
    df['notes'] = ''
    df.to_csv(sheet, index=False)
df = pd.read_csv(sheet)
df.head(6)
📚 Codebook (living document)
Start small, iterate. Keep labels short; include inclusion/exclusion rules and examples.
codebook = pd.DataFrame({
    'code': ['Preference:Fruit', 'Preference:Carrot', 'Preference:Grass', 'Texture:Crisp',
             'Taste:Sweet', 'Barrier:Access', 'Context:Social'],
    'definition': [
        'Expressed liking for fruit (any type)',
        'Specific mention of carrots as preferred',
        'Preference or mention of grass as staple',
        'Mentions of crisp/crunchy texture as desirable',
        'Mentions of sweetness as desirable',
        'Mentions of access/availability/cost barriers',
        'Mentions of others/social influence (family/herd)'
    ],
    'example': [
        '“Fruit is preferred on hot days.”',
        '“I enjoy crunchy carrots.”',
        '“Grass is acceptable…”',
        '“I like crunchy snacks.”',
        '“Sweet foods are better.”',
        '“Hard to find fresh produce.”',
        '“Friends influence what I eat.”'
    ]
})
codebook
🏷️ Apply codes (single or multiple)
For quick demos, we’ll auto-suggest codes via keyword rules, then you can edit manually. In practice, codes should be applied by trained coders reading each excerpt.
Auto-suggest rules
- If text contains fruit → Preference:Fruit
- carrot → Preference:Carrot (and Texture:Crisp if crunchy present)
- grass → Preference:Grass
- crunchy|crisp → Texture:Crisp
- sweet → Taste:Sweet
- expensive|access|hard to find|cost → Barrier:Access
- friend|family|herd|group → Context:Social
import re

def suggest_codes(text: str):
    t = text.lower()
    codes = []
    if 'fruit' in t: codes.append('Preference:Fruit')
    if 'carrot' in t: codes.append('Preference:Carrot')
    if 'grass' in t: codes.append('Preference:Grass')
    if re.search(r'crunchy|crisp', t): codes.append('Texture:Crisp')
    if 'sweet' in t: codes.append('Taste:Sweet')
    if re.search(r'expensive|access|hard to find|cost', t): codes.append('Barrier:Access')
    if re.search(r'friend|family|herd|group', t): codes.append('Context:Social')
    return sorted(set(codes))

df['codes'] = df['text'].apply(suggest_codes)
df[['response_id', 'text', 'codes']].head(8)
✍️ Manual editing
Export to CSV, edit in Sheets/Excel (add/remove codes, add a theme column if you like), then re-import to continue.
df.to_csv('qual_coded_autosuggest.csv', index=False)
print('Wrote qual_coded_autosuggest.csv — edit if desired and re-load.')
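If you edit the file outside the notebook, the codes column comes back as text when re-imported. Here is a minimal re-load sketch, assuming you kept the filename qual_coded_autosuggest.csv and left the codes column in its Python-list-style format (e.g. ['Preference:Fruit']):

import ast

# Re-load the edited sheet (assumed filename; adjust if you renamed it)
df = pd.read_csv('qual_coded_autosuggest.csv')

# 'codes' is stored as text such as "['Preference:Fruit']";
# ast.literal_eval turns it back into a Python list (blank cells become [])
df['codes'] = df['codes'].fillna('[]').apply(ast.literal_eval)
df[['response_id', 'text', 'codes']].head()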
📊 Code frequencies & co-occurrence
from itertools import combinations

def explode_codes(df):
    dd = df[['response_id', 'codes']].explode('codes').dropna()
    return dd

exploded = explode_codes(df)
freq = exploded['codes'].value_counts().rename_axis('code').reset_index(name='count')
display(freq.head(10))

# Co-occurrence matrix
pairs = []
for _, row in df.iterrows():
    cs = sorted(set(row['codes']))
    for a, b in combinations(cs, 2):
        pairs.append((a, b))
co = pd.DataFrame(pairs, columns=['code_a', 'code_b']).value_counts().reset_index(name='n')
co.head(10)
# Build symmetric matrix for heatmap
codes = sorted(freq['code'].tolist())
mat = pd.DataFrame(0, index=codes, columns=codes, dtype=int)
for _, r in co.iterrows():
    mat.loc[r['code_a'], r['code_b']] += r['n']
    mat.loc[r['code_b'], r['code_a']] += r['n']
plt.figure(figsize=(7, 6))
sns.heatmap(mat, annot=False, cmap='Blues')
plt.title('Code co-occurrence'); plt.show()
🕸️ Optional: co-occurrence network
Edges weighted by co-occurrence counts (thicker = stronger).
# Ensure expected columns exist
assert {'code', 'count'}.issubset(freq.columns), "freq must have ['code','count']"
assert {'code_a', 'code_b', 'n'}.issubset(co.columns), "co must have ['code_a','code_b','n']"

# Codes universe (in case 'codes' isn't defined elsewhere)
codes = sorted(
    set(freq['code']).union(co['code_a']).union(co['code_b'])
)

# Map of code -> frequency (default 1 if unseen)
freq_map = freq.groupby('code', as_index=True)['count'].sum().to_dict()

G = nx.Graph()

# Add nodes with sizes
for c in codes:
    size_val = int(freq_map.get(c, 1))  # extract a plain scalar to avoid FutureWarning
    G.add_node(c, size=size_val)

# Add edges (merge duplicates, skip self-edges)
for a, b, w in co[['code_a', 'code_b', 'n']].itertuples(index=False, name=None):
    if a == b:
        continue
    w = int(w) if np.isfinite(w) else 0
    if w <= 0:
        continue
    if G.has_edge(a, b):
        G[a][b]['weight'] += w
    else:
        G.add_edge(a, b, weight=w)

# --- Visual scaling ---
# Node sizes: sqrt scaling for readability
node_sizes_raw = np.array([G.nodes[n]['size'] for n in G.nodes], dtype=float)
node_sizes = 200 * (1.0 + np.sqrt(node_sizes_raw))  # tweak 200 to taste

# Edge widths: min–max scaled to ~0.5–3.0
edge_weights_raw = np.array([G.edges[e]['weight'] for e in G.edges], dtype=float)
if edge_weights_raw.size:
    span = np.ptp(edge_weights_raw) or 1.0  # use np.ptp instead of .ptp()
    edge_widths = 0.5 + 2.5 * (edge_weights_raw - edge_weights_raw.min()) / span
else:
    edge_widths = []

# Layout + draw
pos = nx.spring_layout(G, seed=2)  # deterministic layout
plt.figure(figsize=(7, 6))
nx.draw_networkx_nodes(G, pos, node_size=node_sizes)
nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.6)
nx.draw_networkx_labels(G, pos, font_size=9)
plt.title('Code co-occurrence network')
plt.axis('off')
plt.show()
🤝 Inter-coder reliability (Cohen’s κ)
For a subset of responses, two coders independently assign a single dominant code. We’ll demo κ; use it judiciously (it fits some designs better than others—e.g., structured codebooks).
# Simulate coder labels for a subset (replace with your real labels)
subset = df.sample(min(30, len(df)), random_state=1).copy()

def dominant_code(codes_list):
    return codes_list[0] if isinstance(codes_list, list) and len(codes_list) > 0 else 'None'

# Coder A uses first auto-suggest; coder B mimics with noise
subset['coderA'] = subset['codes'].apply(dominant_code)

np.random.seed(1)
def jitter(label):
    if np.random.rand() < 0.15: return 'None'
    return label

subset['coderB'] = subset['coderA'].apply(jitter)

kappa = cohen_kappa_score(subset['coderA'], subset['coderB'])
print("Cohen's κ (demo):", round(kappa, 3))
subset[['response_id', 'text', 'coderA', 'coderB']].head(8)
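For intuition, κ compares observed agreement p_o with the agreement p_e expected by chance from each coder's label frequencies: κ = (p_o - p_e) / (1 - p_e). A minimal sketch (reusing the coderA/coderB columns above) computes it by hand and checks against scikit-learn:

# Observed agreement: proportion of responses where both coders assign the same code
p_o = (subset['coderA'] == subset['coderB']).mean()

# Chance agreement: sum over labels of P(coderA = label) * P(coderB = label)
pA = subset['coderA'].value_counts(normalize=True)
pB = subset['coderB'].value_counts(normalize=True)
p_e = sum(pA.get(label, 0) * pB.get(label, 0) for label in set(pA.index) | set(pB.index))

kappa_manual = (p_o - p_e) / (1 - p_e)
print(round(kappa_manual, 3), round(cohen_kappa_score(subset['coderA'], subset['coderB']), 3))  # should match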
🧩 Codes → Themes
Group codes into broader themes. Keep a table mapping to justify boundaries. This is where you explain patterns with excerpt evidence.
theme_map = {
    'Preference:Fruit': 'Taste & Freshness',
    'Preference:Carrot': 'Taste & Freshness',
    'Preference:Grass': 'Habit & Staple Foods',
    'Texture:Crisp': 'Sensory Qualities',
    'Taste:Sweet': 'Sensory Qualities',
    'Barrier:Access': 'Access & Environment',
    'Context:Social': 'Social Influence'
}

exploded = df[['response_id', 'text', 'codes']].explode('codes').dropna()
exploded['theme'] = exploded['codes'].map(theme_map).fillna('Other')
theme_counts = exploded.groupby('theme')['response_id'].nunique().sort_values(ascending=False).reset_index(name='n_responses')
display(theme_counts)

plt.figure(figsize=(7, 4))
sns.barplot(data=theme_counts, x='n_responses', y='theme')
plt.title('Theme coverage (responses with ≥1 code in theme)')
plt.xlabel('Responses'); plt.ylabel('Theme'); plt.tight_layout(); plt.show()
📤 Export thematic summary pack
- qual_coded_autosuggest.csv (or your edited file) — coded excerpts
- codebook.csv — code labels/definitions
- theme_counts.csv — quick coverage table
Use these to write your results with excerpts that illustrate each theme.
codebook.to_csv('codebook.csv', index=False)
theme_counts.to_csv('theme_counts.csv', index=False)
print('Wrote codebook.csv and theme_counts.csv')
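To pair each theme with an illustrative excerpt for your write-up, a small sketch could pull one quote per theme from the exploded table above (the extra theme_examples.csv file is a hypothetical output, not part of the standard pack):

# One example excerpt per theme (first occurrence; swap in the most telling quote by hand)
examples = (exploded.drop_duplicates(subset=['theme'])
                    [['theme', 'response_id', 'text']]
                    .reset_index(drop=True))
display(examples)
examples.to_csv('theme_examples.csv', index=False)  # hypothetical extra export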
🧩 Exercises
- Refine the codebook: add 1–2 exclusion rules per code; add 1 more code.
- Dual coding: have a second coder label 20 responses; compute κ on your dominant codes.
- Theme memo: write 3–5 lines defining each theme + 1 illustrative quote (anonymised).
✅ Conclusion
You built a transparent qualitative pipeline: codebook → coding → co-occurrence → reliability (optional) → themes. This supports credible, well-documented qualitative findings that complement your quantitative work.
Further reading
- Coding manuals and reflexive thematic analysis guides
- Reporting standards for qualitative research