# Setup for Google Colab: clone the repo so we can read data
import os
from google.colab import files

MODULE = '05_advanced'
DATASET = 'large_food_log.csv'
BASE_PATH = '/content/data-analysis-projects'
MODULE_PATH = os.path.join(BASE_PATH, 'notebooks', MODULE)
DATASET_PATH = os.path.join(MODULE_PATH, 'data', DATASET)

try:
    print('Attempting to clone repository...')
    if not os.path.exists(BASE_PATH):
        !git clone https://github.com/ggkuhnle/data-analysis-projects.git
    os.chdir(MODULE_PATH)
    if os.path.exists(DATASET_PATH):
        print(f'Dataset found: {DATASET_PATH} ✅')
    else:
        raise FileNotFoundError('Dataset missing after clone.')
except Exception as e:
    print(f'Cloning failed: {e}')
    print('Falling back to manual upload...')
    os.makedirs('data', exist_ok=True)
    uploaded = files.upload()
    if DATASET in uploaded:
        with open(os.path.join('data', DATASET), 'wb') as f:
            f.write(uploaded[DATASET])
        print(f'Successfully uploaded {DATASET} ✅')
    else:
        raise FileNotFoundError(f'Upload {DATASET} to continue.')
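The cell above only works on Colab (the `google.colab` import fails elsewhere). If you run the notebook locally instead, a minimal sketch of the equivalent setup, assuming you have already cloned the repository (the clone location below is illustrative, adjust to taste):

# Local (non-Colab) alternative: point at an existing clone of the repo.
import os
from pathlib import Path

BASE_PATH = Path.home() / 'data-analysis-projects'   # hypothetical clone location
MODULE_PATH = BASE_PATH / 'notebooks' / '05_advanced'
DATASET_PATH = MODULE_PATH / 'data' / 'large_food_log.csv'

os.chdir(MODULE_PATH)  # raises if the path does not exist
assert DATASET_PATH.exists(), f'Expected dataset at {DATASET_PATH}'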
🗄️ 5.4 Databases and SQL
In this notebook you'll load a nutrition dataset into SQLite, write SQL to summarise and join tables, and round it off with window functions and views.
You will:
- Create a SQLite DB and load `large_food_log.csv`.
- Define a clean schema (types, constraints) and add indexes.
- Write core SQL (filter, group, aggregate), joins, and window functions.
- Use parameterised queries from Python safely.
- Build views for reusable analytics.
Why SQL for nutrition?
Food-logging datasets can be large and tidy. SQL lets you filter, aggregate, and join efficiently, then hand clean subsets to pandas or modelling code.

%pip install -q pandas
import pandas as pd
import numpy as np
import sqlite3  # stdlib – no pip install needed
from pathlib import Path

pd.set_option('display.max_columns', 40)
print('SQL environment ready.')
📥 Load CSV and Inspect
We'll parse `Date` as a proper date and peek at a few rows.
csv_path = Path('data') / 'large_food_log.csv'
df = pd.read_csv(csv_path)

# Try to parse date if present
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

display(df.head())
print('\nDtypes:')
print(df.dtypes)
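Before loading, it's also worth checking the CSV against the constraints we're about to declare: the schema below requires a non-NULL, non-negative `Amount`, and SQLite stores a NaN REAL as NULL, which would then violate `NOT NULL`. A minimal sketch, assuming the column names shown above:

# Quick data-quality check before the rows go into SQLite.
if 'Amount' in df.columns:
    print('Missing Amount values:', df['Amount'].isna().sum())
    print('Negative Amount values:', (df['Amount'] < 0).sum())
    df = df.dropna(subset=['Amount'])  # rows with NaN could not satisfy NOT NULL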
🧱 Create SQLite DB + Clean Schema
We'll create a typed table instead of relying on default `to_sql` types. This is more explicit and robust.
db_path = Path('nutrition.db')
if db_path.exists():
    # start fresh for reproducibility
    db_path.unlink()

conn = sqlite3.connect(db_path)
cur = conn.cursor()

# Drop if exists & create schema with constraints
cur.executescript("""
DROP TABLE IF EXISTS food_log;
CREATE TABLE food_log (
    ID TEXT NOT NULL,
    Meal TEXT NOT NULL,
    Nutrient TEXT NOT NULL,
    Amount REAL NOT NULL,
    Date TEXT,  -- ISO8601 string (YYYY-MM-DD)
    CHECK (Amount >= 0)
);
""")
conn.commit()

# Insert rows using executemany for speed and explicit typing
records = []
for _, row in df.iterrows():
    date_str = None
    if 'Date' in df.columns and pd.notna(row['Date']):
        # store as ISO string for SQLite (TEXT)
        date_str = pd.to_datetime(row['Date']).date().isoformat()
    records.append((str(row.get('ID', '')), str(row.get('Meal', '')),
                    str(row.get('Nutrient', '')), float(row.get('Amount', 0.0)), date_str))

cur.executemany(
    "INSERT INTO food_log (ID, Meal, Nutrient, Amount, Date) VALUES (?, ?, ?, ?, ?)",
    records,
)
conn.commit()

# Indexes for speed (filtering/grouping columns)
cur.executescript("""
CREATE INDEX IF NOT EXISTS idx_food_log_meal ON food_log(Meal);
CREATE INDEX IF NOT EXISTS idx_food_log_nutrient ON food_log(Nutrient);
CREATE INDEX IF NOT EXISTS idx_food_log_date ON food_log(Date);
""")
conn.commit()
print('Database ready:', db_path.resolve())
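For contrast, pandas can create and fill a table in one `to_sql` call; we avoided that above because it infers column types and adds no `NOT NULL` or `CHECK` constraints. A sketch of the shortcut, written to a throwaway table so it doesn't disturb the typed one:

# Quick alternative: let pandas infer the schema (no constraints).
df_auto = df.copy()
if 'Date' in df_auto.columns:
    df_auto['Date'] = df_auto['Date'].astype(str)  # keep TEXT, as in the typed schema
df_auto.to_sql('food_log_auto', conn, if_exists='replace', index=False)
print(pd.read_sql_query(
    "SELECT sql FROM sqlite_master WHERE name = 'food_log_auto';", conn).iloc[0, 0])
cur.execute("DROP TABLE food_log_auto;")  # tidy up; we keep the hand-built table
conn.commit()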
🔍 Core SQL Queries
Aggregate by Meal × Nutrient, then limit to the top few rows for display.
= """
q SELECT Meal, Nutrient,
COUNT(*) AS n,
ROUND(AVG(Amount), 2) AS avg_amount,
ROUND(SUM(Amount), 2) AS total_amount
FROM food_log
GROUP BY Meal, Nutrient
ORDER BY Meal, Nutrient
LIMIT 10;
"""
pd.read_sql_query(q, conn)
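WHERE filters rows before grouping; to filter on the aggregates themselves you need HAVING. A sketch that keeps only Meal × Nutrient groups with at least five observations (the threshold of 5 is arbitrary):

q = """
SELECT Meal, Nutrient, COUNT(*) AS n, ROUND(AVG(Amount), 2) AS avg_amount
FROM food_log
GROUP BY Meal, Nutrient
HAVING COUNT(*) >= 5   -- condition on the aggregate, not the raw rows
ORDER BY n DESC
LIMIT 10;
"""
pd.read_sql_query(q, conn)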
📅 Working with Dates
SQLite stores dates as TEXT here; we can still filter by ISO strings, or cast to date with `DATE()` where needed.
= """
q SELECT Date, Nutrient, ROUND(SUM(Amount),2) AS total_amount
FROM food_log
WHERE Date BETWEEN '2024-01-01' AND '2024-03-31'
GROUP BY Date, Nutrient
ORDER BY Date ASC, Nutrient ASC
LIMIT 12;
"""
pd.read_sql_query(q, conn)
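Because the dates are ISO-8601 text, SQLite's date functions also work on them directly. As one illustration, strftime() can truncate the strings to calendar months for a monthly roll-up:

q = """
SELECT strftime('%Y-%m', Date) AS month, Nutrient,
       ROUND(SUM(Amount), 2) AS total_amount
FROM food_log
WHERE Date IS NOT NULL
GROUP BY month, Nutrient
ORDER BY month, Nutrient
LIMIT 12;
"""
pd.read_sql_query(q, conn)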
🔗 Joins (dim tables)
Let's fabricate a tiny dimension table (nutrient units) and join it to `food_log`.
cur.executescript("""
DROP TABLE IF EXISTS dim_nutrient;
CREATE TABLE dim_nutrient (
    Nutrient TEXT PRIMARY KEY,
    Unit TEXT NOT NULL
);
INSERT INTO dim_nutrient (Nutrient, Unit) VALUES
    ('Iron','mg'), ('Calcium','mg'), ('Vitamin_D','µg'), ('Protein','g');
""")
conn.commit()
= """
q SELECT f.Meal, f.Nutrient, d.Unit,
ROUND(AVG(f.Amount),2) AS avg_amount
FROM food_log f
LEFT JOIN dim_nutrient d USING(Nutrient)
GROUP BY f.Meal, f.Nutrient
ORDER BY f.Meal, f.Nutrient
LIMIT 12;
"""
pd.read_sql_query(q, conn)
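The LEFT JOIN keeps `food_log` rows even when `dim_nutrient` has no match, so a NULL `Unit` flags gaps in the dimension table. A quick anti-join sketch to list any such nutrients:

q = """
SELECT DISTINCT f.Nutrient
FROM food_log f
LEFT JOIN dim_nutrient d USING(Nutrient)
WHERE d.Unit IS NULL;   -- logged nutrients missing from dim_nutrient
"""
pd.read_sql_query(q, conn)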
🪟 Window Functions
Rank daily totals per nutrient, and compute a rolling-like moving average by partition. SQLite supports window functions (3.25+).
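If you want to confirm that the SQLite your Python links against is new enough, a quick check:

import sqlite3
print('SQLite version:', sqlite3.sqlite_version)
assert sqlite3.sqlite_version_info >= (3, 25, 0), 'Window functions need SQLite 3.25+'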
= """
q WITH daily AS (
SELECT Date, Nutrient, SUM(Amount) AS total_amount
FROM food_log
WHERE Date IS NOT NULL
GROUP BY Date, Nutrient
)
SELECT *,
RANK() OVER (PARTITION BY Nutrient ORDER BY total_amount DESC) AS rank_in_nutrient,
ROUND(AVG(total_amount) OVER (
PARTITION BY Nutrient ORDER BY Date
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
), 2) AS moving_avg_3
FROM daily
ORDER BY Nutrient, Date
LIMIT 20;
"""
pd.read_sql_query(q, conn)
🔒 Parameterised Queries (safe!)
Never build SQL strings with user input. Use parameters to avoid SQL injection and parsing bugs.
nutrient = 'Protein'
start, end = '2024-01-01', '2024-01-15'
q = (
    "SELECT Date, SUM(Amount) AS total_amount "
    "FROM food_log WHERE Nutrient = ? AND Date BETWEEN ? AND ? "
    "GROUP BY Date ORDER BY Date"
)
pd.read_sql_query(q, conn, params=(nutrient, start, end))
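sqlite3 also accepts named placeholders (:name), which stay readable as the parameter list grows; pd.read_sql_query then takes a dict instead of a tuple. The same query in that style, as a sketch:

q = (
    "SELECT Date, SUM(Amount) AS total_amount "
    "FROM food_log WHERE Nutrient = :nutrient AND Date BETWEEN :start AND :end "
    "GROUP BY Date ORDER BY Date"
)
pd.read_sql_query(q, conn, params={'nutrient': nutrient, 'start': start, 'end': end})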
📊 Views (Reusable SQL)
Create a view for daily nutrient totals, then query it like a table.
cur.executescript("""
DROP VIEW IF EXISTS vw_daily_nutrient_totals;
CREATE VIEW vw_daily_nutrient_totals AS
SELECT Date, Nutrient, SUM(Amount) AS total_amount
FROM food_log
WHERE Date IS NOT NULL
GROUP BY Date, Nutrient;
""")
conn.commit()

pd.read_sql_query("SELECT * FROM vw_daily_nutrient_totals ORDER BY Date, Nutrient LIMIT 10;", conn)
🧪 Sanity Checks
Quick PRAGMA calls to confirm the schema and indexes, plus a row count for the main table. Use `PRAGMA table_info(table)` and `PRAGMA index_list(table)`.
print('Schema:')
display(pd.read_sql_query("PRAGMA table_info(food_log);", conn))
print('\nIndexes:')
display(pd.read_sql_query("PRAGMA index_list(food_log);", conn))
print('\nRow count:')
display(pd.read_sql_query("SELECT COUNT(*) AS n FROM food_log;", conn))
🧩 Exercises
1. **Protein by Date**: Write SQL to compute total `Amount` for Protein per `Date` and show the top 10 dates by total.
2. **Meal Mix**: For each `Meal`, compute the share of each `Nutrient` (nutrient total / meal total) on that date. (Hint: use a CTE with totals, then join.)
3. **Rolling 7-day**: In `vw_daily_nutrient_totals`, compute a 7-day moving average of `total_amount` per `Nutrient` with a window function.
4. **Parameterised**: From Python, write a parameterised query to get Calcium totals between two dates supplied as variables.
✅ Conclusion
You created a SQLite database, defined a clean schema with indexes, and wrote SQL to summarise, join, and rank nutrition data. You also built views and parameterised queries to keep your analyses safe and reusable.
Further reading
- SQLite docs: https://www.sqlite.org/docs.html
- SQLite window functions: https://www.sqlite.org/windowfunctions.html
- pandas + SQL: https://pandas.pydata.org/docs/reference/api/pandas.read_sql_query.html