timeseries_visualization / scripts /download_fred_data.py
fmegahed's picture
Cleaning up the app
789e257
"""Download real FRED datasets for the time-series visualization app.
Uses only ``urllib.request`` (stdlib) — no extra dependencies.
Series
------
* OHUR — Ohio Unemployment Rate (%, Monthly, 1976–present)
* OHMFG — Ohio Manufacturing Employment (1000s, SA, Monthly)
* INMFG — Indiana Manufacturing Employment
* MIMFG — Michigan Manufacturing Employment
* TXMFG — Texas Manufacturing Employment
* CAMFG — California Manufacturing Employment
Run once locally, then commit the resulting CSVs::
python scripts/download_fred_data.py
"""
from __future__ import annotations
import io
import urllib.request
from pathlib import Path
import pandas as pd
# Resolve paths relative to the project root (parent of scripts/)
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Output directory for the CSV files written by this script.
DATA_DIR = PROJECT_ROOT / "data"
# NOTE: this runs at import time, so importing the module is enough to create
# the directory; ``exist_ok=True`` makes repeated runs harmless.
DATA_DIR.mkdir(parents=True, exist_ok=True)
# URL template for a single-series CSV download; ``{series_id}`` is filled in
# with ``str.format``.
_FRED_CSV_URL = "https://fred.stlouisfed.org/graph/fredgraph.csv?id={series_id}"
def _fetch_fred(series_id: str) -> pd.DataFrame:
    """Download a single FRED series and return a two-column DataFrame.

    Parameters
    ----------
    series_id:
        FRED series identifier (e.g. ``"OHUR"``). It is substituted into
        ``_FRED_CSV_URL`` and also used as the name of the value column.

    Returns
    -------
    pd.DataFrame
        Columns ``date`` (parsed with ``pd.to_datetime``) and ``series_id``
        (parsed with ``pd.to_numeric``). Rows with a missing date or value
        are dropped; the original row index is kept as-is.

    Raises
    ------
    ValueError
        If the downloaded payload does not parse into exactly two columns.

    Errors raised by ``urllib.request.urlopen`` or ``pd.read_csv`` are not
    caught here and propagate to the caller.
    """
    url = _FRED_CSV_URL.format(series_id=series_id)
    print(f" Downloading {series_id} …")
    with urllib.request.urlopen(url, timeout=30) as resp:  # noqa: S310
        raw = resp.read().decode("utf-8")

    df = pd.read_csv(io.StringIO(raw))

    # The columns are renamed by position below, so check the shape first and
    # fail with a message that names the series instead of pandas' generic
    # "Length mismatch" error.
    if df.shape[1] != 2:
        raise ValueError(
            f"Expected 2 columns (date, value) for FRED series {series_id!r}, "
            f"got {df.shape[1]}: {list(df.columns)!r}"
        )
    df.columns = ["date", series_id]

    # FRED uses "." for missing values — coerce to NaN
    df[series_id] = pd.to_numeric(df[series_id], errors="coerce")
    df["date"] = pd.to_datetime(df["date"])
    return df.dropna()
# ---------------------------------------------------------------------------
# 1. Single series: Ohio Unemployment Rate
# ---------------------------------------------------------------------------
def build_ohio_unemployment() -> pd.DataFrame:
    """Fetch the ``OHUR`` series and relabel its value column.

    Returns
    -------
    pd.DataFrame
        The frame returned by ``_fetch_fred("OHUR")`` with the ``OHUR``
        column renamed to ``unemployment_rate``.
    """
    column_labels = {"OHUR": "unemployment_rate"}
    return _fetch_fred("OHUR").rename(columns=column_labels)
# ---------------------------------------------------------------------------
# 2. Multi-series wide: Manufacturing Employment by State
# ---------------------------------------------------------------------------
# Maps each FRED manufacturing-employment series ID to the state name that
# ``build_manufacturing_wide`` uses as that series' column label. Iteration
# order here determines the column order of the wide table.
_MFG_SERIES: dict[str, str] = {
    "OHMFG": "Ohio",
    "INMFG": "Indiana",
    "MIMFG": "Michigan",
    "TXMFG": "Texas",
    "CAMFG": "California",
}
def build_manufacturing_wide() -> pd.DataFrame:
    """Assemble one wide table with a ``date`` column plus one column per state.

    Every series listed in ``_MFG_SERIES`` is downloaded with ``_fetch_fred``
    and its value column is renamed to the mapped state name.

    Returns
    -------
    pd.DataFrame
        The per-state frames inner-joined on ``date``, sorted by ``date``,
        with a fresh 0..n-1 index.
    """
    per_state = [
        _fetch_fred(series_id).rename(columns={series_id: label})
        for series_id, label in _MFG_SERIES.items()
    ]

    # An inner join keeps only the dates present in every state's series,
    # so all columns cover the same date range.
    merged = per_state[0]
    for frame in per_state[1:]:
        merged = merged.merge(frame, on="date", how="inner")

    return merged.sort_values("date").reset_index(drop=True)
# ---------------------------------------------------------------------------
# 3. Multi-series long: same data melted
# ---------------------------------------------------------------------------
def build_manufacturing_long(wide: pd.DataFrame) -> pd.DataFrame:
    """Melt the wide manufacturing table into long (tidy) form.

    Parameters
    ----------
    wide:
        Frame with a ``date`` column; every other column is treated as one
        state's series.

    Returns
    -------
    pd.DataFrame
        Columns ``date``, ``state`` and ``manufacturing_employment``, sorted
        by ``date`` then ``state``, with a fresh 0..n-1 index.
    """
    melted = wide.melt(
        id_vars="date",
        var_name="state",
        value_name="manufacturing_employment",
    )
    ordered = melted.sort_values(by=["date", "state"])
    return ordered.reset_index(drop=True)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """Download every demo dataset and write each one to ``DATA_DIR`` as CSV.

    Three files are produced, in this order: the Ohio unemployment series,
    the wide manufacturing table, and the long (melted) form of that table.
    Progress is reported on stdout.
    """

    def _save(frame: pd.DataFrame, filename: str) -> None:
        # Write one dataset without its index and report row count and path.
        target = DATA_DIR / filename
        frame.to_csv(target, index=False)
        print(f" -> {len(frame)} rows => {target}\n")

    print("=== Downloading FRED data ===\n")

    # 1. Ohio Unemployment
    _save(build_ohio_unemployment(), "demo_ohio_unemployment.csv")

    # 2. Manufacturing wide (kept in a variable: the long table reuses it)
    wide_table = build_manufacturing_wide()
    _save(wide_table, "demo_manufacturing_wide.csv")

    # 3. Manufacturing long
    _save(build_manufacturing_long(wide_table), "demo_manufacturing_long.csv")

    print("Done.")
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()