import os
from google.colab import files
MODULE = '06_qualitative'
DATASET = 'food_preferences.txt'
BASE_PATH = '/content/data-analysis-projects'
MODULE_PATH = os.path.join(BASE_PATH, 'notebooks', MODULE)
DATASET_PATH = os.path.join(MODULE_PATH, 'data', DATASET)
try:
    if not os.path.exists(BASE_PATH):
        print('Cloning repository...')
        !git clone https://github.com/ggkuhnle/data-analysis-projects.git
    os.chdir(MODULE_PATH)
    if not os.path.exists(DATASET_PATH):
        raise FileNotFoundError('Dataset missing after clone.')
    print('Dataset ready ✅')
except Exception as e:
    print('Setup fallback: upload file...')
    os.makedirs('data', exist_ok=True)
    uploaded = files.upload()
    if DATASET in uploaded:
        with open(os.path.join('data', DATASET), 'wb') as f:
            f.write(uploaded[DATASET])
        print('Uploaded dataset ✅')
    else:
        raise FileNotFoundError('Upload food_preferences.txt to continue.')
📝 6.2 Text Analysis for Qualitative Research
We’ll prepare text for human-led coding (cleaning, tokenisation, light structure) and add small helper summaries (frequencies, n-grams) that support—not replace—interpretation.
This notebook keeps the qualitative lens front-and-centre while giving you just enough NLP to work efficiently.
🎯 Objectives
- Clean and tokenise open-ended responses with NLTK.
- Lemmatise, remove stopwords/punctuation, handle case.
- Build n-grams (bigrams, trigrams) to surface phrases.
- Optional: POS tags and (careful) sentiment as exploratory aids (a brief sketch appears after the n-grams section).
- Export a tidy table ready for manual coding or 6.3.
%pip install -q pandas nltk matplotlib seaborn scikit-learn wordcloud
import pandas as pd
import nltk
# Handle tokenizers/taggers across NLTK versions
for pkg in [
    "punkt", "punkt_tab",
    "stopwords", "wordnet",
    "averaged_perceptron_tagger_eng", "averaged_perceptron_tagger",
    "vader_lexicon"
]:
    try:
        nltk.download(pkg, quiet=True)
    except Exception:
        pass
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
from pathlib import Path
txt = Path('data') / 'food_preferences.txt'
responses = [r.strip() for r in txt.read_text(encoding='utf-8').splitlines() if r.strip()]
df = pd.DataFrame({'response_id': range(1, len(responses) + 1), 'text': responses})
print('N responses:', len(df))
df.head(5)
print("Text/NLP environment ready.")
🧼 Preprocessing pipeline
We’ll lowercase, tokenise, remove stopwords/punctuation, and lemmatise (carrots→carrot). This supports coding by removing noise.
stop = set(stopwords.words('english')).union({'hippo', 'h1', 'h2', 'h3'})
lem = WordNetLemmatizer()

def clean_tokens(text: str):
    words = word_tokenize(text.lower())
    words = [w for w in words if w.isalpha()]   # drop punctuation/numbers
    words = [w for w in words if w not in stop]
    words = [lem.lemmatize(w) for w in words]
    return words

df['tokens'] = df['text'].apply(clean_tokens)
df[['response_id', 'text', 'tokens']].head(6)
📊 Frequencies & word cloud (orientation only)
all_tokens = [t for row in df['tokens'] for t in row]
freq = Counter(all_tokens).most_common(15)
pd.DataFrame(freq, columns=['word', 'count'])

wc = WordCloud(width=800, height=400, background_color='white').generate(' '.join(all_tokens))
plt.figure(figsize=(10, 4)); plt.imshow(wc); plt.axis('off'); plt.title('Word Cloud'); plt.show()
🔗 N-grams (bigrams & trigrams)
Short phrases can reveal food pairings (e.g., fresh fruit, crunchy carrot).
from nltk.util import ngrams

def ngram_counts(tokens_list, n=2, top=15):
    ng = Counter()
    for toks in tokens_list:
        ng.update(ngrams(toks, n))
    return pd.DataFrame(ng.most_common(top), columns=[f'{n}-gram', 'count'])

bigrams = ngram_counts(df['tokens'], n=2, top=15)
trigrams = ngram_counts(df['tokens'], n=3, top=10)
display(bigrams); display(trigrams)
📤 Export a coding-ready table
We create a simple structure that supports manual coding (e.g., in Excel/Sheets or in 6.3).
out = df[['response_id', 'text', 'tokens']].copy()
out['initial_code'] = ''   # analyst will fill codes
out['notes'] = ''          # memo/comments
out_path = 'qual_coding_sheet.csv'
out.to_csv(out_path, index=False)
print('Wrote:', out_path)
🧩 Exercises
- Stoplist tuning: add domain-specific stopwords (e.g., like, really, very) and see how the top words change (a starter sketch for this and the phrase-mining exercise follows the list).
- Phrase mining: examine bigrams containing fruit or carrot; collect example quotes.
- Coding sheet: add 2–4 provisional initial codes per 10 responses (keep them short & action-oriented).
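A hedged starting point for the first two exercises: the extra stopwords and the fruit/carrot filter below are illustrative choices, not part of the notebook, and the code reuses stop, lem, df, and bigrams from earlier cells.
# Stoplist tuning (illustrative): extend the stoplist, re-tokenise, and compare top words
extra_stop = {'like', 'really', 'very'}
stop_tuned = stop.union(extra_stop)
tokens_tuned = df['text'].apply(
    lambda t: [lem.lemmatize(w) for w in word_tokenize(t.lower())
               if w.isalpha() and w not in stop_tuned]
)
print(Counter(t for row in tokens_tuned for t in row).most_common(15))

# Phrase mining (illustrative): bigrams mentioning fruit or carrot, with the source quotes
target = bigrams[bigrams['2-gram'].apply(lambda bg: 'fruit' in bg or 'carrot' in bg)]
display(target)
print(df[df['text'].str.contains('fruit|carrot', case=False)]['text'].head(5).to_list())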
✅ Conclusion
You prepared text for analysis and produced a coding-ready table. Next: formal coding & thematic analysis with reliability checks (6.3).
More
- NLTK docs (tokenisation, POS, stopwords)
- Practical theming workflows