Reorganize project into scripts/, docs/, data/, output/ directories

Move all Python scripts to scripts/, documentation to docs/, raw input data to data/, and generated HTML/CSV outputs to output/. Update path references in 8 scripts to use Path(__file__).parent.parent as project root so they work correctly from the new location. Update README links and quick-start commands accordingly. Notebooks remain at root. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 21:57:22 -07:00
parent a2e295d95b
commit ee5856661a
40 changed files with 31 additions and 30 deletions
--- a/scripts/ingest_legiscan.py
+++ b/scripts/ingest_legiscan.py
@@ -0,0 +1,686 @@
+#!/usr/bin/env python3
+"""
+Ingest LegiScan legislative datasets for all US states (2016-2026) into PostgreSQL.
+
+Fetches all state session datasets from the LegiScan API, parses bill JSONs from
+each ZIP archive, and loads them into the data_centers PostgreSQL database. Bills are
+tagged with relevance categories (data_center, large_load, ratepayer_protection, etc.).
+
+Usage:
+    python ingest_legiscan.py [--all | --setup-db | --fetch | --load | --tag]
+                              [--state XX] [--year-start YYYY] [--dry-run] [--verbose]
+
+Options:
+    --all           Run all phases in sequence
+    --setup-db      Create/update database tables and indexes
+    --fetch         Download dataset ZIPs for all states (uses hash caching)
+    --load          Parse cached ZIPs and insert/update bills in DB
+    --tag           (Re)apply relevance tagging to all loaded bills
+    --state XX      Restrict to one state (e.g., CA)
+    --year-start N  Earliest session year to include (default: 2016)
+    --dry-run       Print what would be done; no API calls or DB writes
+    --verbose       Extra progress output
+
+Environment:
+    LEGISCAN_API_KEY                  Required
+    PGWEB_HOST, PGWEB_PORT,
+    PGWEB_USER, PGWEB_PASSWORD        PostgreSQL connection (DB: data_centers)
+"""
+
+import argparse
+import base64
+import io
+import json
+import logging
+import os
+import sys
+import time
+import zipfile
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import psycopg2
+import psycopg2.extras
+import requests
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+
+DB_NAME = "data_centers"
+API_KEY = os.environ.get("LEGISCAN_API_KEY")
+API_BASE = "https://api.legiscan.com/"
+CACHE_DIR = Path("data/legiscan_cache")
+MIN_YEAR_DEFAULT = 2016
+RATE_LIMIT_DELAY = 0.5  # seconds between API calls
+
+# Keyword categories for relevance tagging.
+# Keys become the tag values stored in legiscan_bills.relevance_tags[].
+RELEVANCE_KEYWORDS: dict[str, list[str]] = {
+    "data_center": [
+        "data center", "data centre", "hyperscale", "colocation", "colo facility",
+        "server farm", "cloud computing facility", "internet exchange",
+        "carrier hotel", "artificial intelligence facility", "ai campus",
+        "ai data center", "gpu cluster", "compute facility",
+        "high performance computing", "hpc facility", "data hall",
+        "network access point", "data warehousing facility",
+    ],
+    "large_load": [
+        "large load", "large power consumer", "large electricity consumer",
+        "high electricity consumption", "high power consumption",
+        "megawatt load", "gigawatt load", "cryptocurrency mining",
+        "bitcoin mining", "blockchain mining", "crypto mining",
+        "digital asset mining", "proof of work", "electric arc furnace",
+        "large industrial customer", "high-density load", "new large load",
+        "load growth", "extraordinary load",
+    ],
+    "ratepayer_protection": [
+        "ratepayer", "rate payer", "cost shift", "cost shifting",
+        "cost allocation", "cross-subsidy", "cross subsidy",
+        "rate design", "rate structure", "electricity rate",
+        "electric rate", "utility rate", "rate increase", "rate burden",
+        "rate base", "stranded cost", "rate class", "customer protection",
+        "consumer protection", "electric customer", "residential customer",
+        "demand charge", "transmission cost", "grid upgrade cost",
+        "interconnection cost", "cost recovery", "rate relief",
+        "affordability", "energy burden",
+    ],
+    "grid_impact": [
+        "grid reliability", "grid stability", "grid congestion",
+        "grid modernization", "grid infrastructure", "electric grid",
+        "power grid", "electricity grid", "transmission upgrade",
+        "transmission expansion", "interconnection queue",
+        "interconnection study", "demand response", "curtailment",
+        "grid capacity", "system reliability", "capacity expansion",
+        "electric system", "power system reliability", "grid resilience",
+        "grid planning", "integrated resource plan",
+    ],
+    "water_use": [
+        "water consumption", "cooling water", "water efficiency",
+        "water use effectiveness", "evaporative cooling",
+        "water withdrawal", "water discharge", "water impact",
+        "water footprint", "cooling tower", "water-cooled",
+        "once-through cooling", "recycled water", "water stress",
+        "water scarcity",
+    ],
+    "tax_incentive": [
+        "tax credit", "tax exemption", "tax abatement", "tax incentive",
+        "sales tax exemption", "property tax exemption", "tax break",
+        "tax relief", "enterprise zone", "economic incentive",
+        "business incentive", "investment credit", "job creation credit",
+        "economic development incentive", "opportunity zone",
+        "tax subsidy",
+    ],
+    "energy_policy": [
+        "renewable energy", "clean energy", "energy efficiency",
+        "power purchase agreement", " ppa ", "green tariff",
+        "clean power", "carbon neutral", "net zero", "decarbonization",
+        "energy procurement", "24/7 clean energy", "carbon-free",
+        "clean electricity", "energy storage", "virtual power plant",
+        "net metering", "green power",
+    ],
+    "siting_permitting": [
+        "conditional use permit", "special use permit", "land use permit",
+        "zoning", "facility siting", "environmental review",
+        "environmental impact", "noise ordinance", "setback requirement",
+        "building permit", "construction permit", "site approval",
+        "local approval", "permit requirement", "permitting process",
+        "local control", "preemption",
+    ],
+}
+
+# Status code labels (LegiScan)
+STATUS_LABELS = {
+    0: "N/A", 1: "Introduced", 2: "Engrossed", 3: "Enrolled",
+    4: "Passed", 5: "Vetoed", 6: "Failed", 7: "Override",
+    8: "Chaptered", 9: "Referred", 10: "Report Pass",
+    11: "Report DNP", 12: "Draft",
+}
+
+# ---------------------------------------------------------------------------
+# Logging
+# ---------------------------------------------------------------------------
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s  %(message)s",
+    datefmt="%H:%M:%S",
+)
+log = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Database
+# ---------------------------------------------------------------------------
+
+def get_db_connection():
+    return psycopg2.connect(
+        host=os.environ["PGWEB_HOST"],
+        port=os.environ["PGWEB_PORT"],
+        user=os.environ["PGWEB_USER"],
+        password=os.environ["PGWEB_PASSWORD"],
+        dbname=DB_NAME,
+    )
+
+
+DDL = """
+CREATE TABLE IF NOT EXISTS legiscan_sessions (
+    session_id      INTEGER PRIMARY KEY,
+    state_id        INTEGER NOT NULL,
+    state_abbr      VARCHAR(2) NOT NULL,
+    year_start      INTEGER NOT NULL,
+    year_end        INTEGER NOT NULL,
+    session_title   TEXT,
+    session_tag     TEXT,
+    is_special      BOOLEAN DEFAULT FALSE,
+    is_prior        BOOLEAN DEFAULT FALSE,
+    dataset_hash    VARCHAR(32),
+    dataset_date    DATE,
+    dataset_size_mb FLOAT,
+    bill_count      INTEGER DEFAULT 0,
+    imported_at     TIMESTAMPTZ
+);
+
+CREATE TABLE IF NOT EXISTS legiscan_bills (
+    bill_id         INTEGER PRIMARY KEY,
+    session_id      INTEGER REFERENCES legiscan_sessions(session_id),
+    state           VARCHAR(2) NOT NULL,
+    bill_number     VARCHAR(50),
+    bill_type       VARCHAR(10),
+    title           TEXT,
+    description     TEXT,
+    status          INTEGER,
+    status_date     DATE,
+    completed       INTEGER DEFAULT 0,
+    body            VARCHAR(10),
+    url             TEXT,
+    state_link      TEXT,
+    change_hash     VARCHAR(32),
+    subjects        TEXT[],
+    sponsor_count   INTEGER DEFAULT 0,
+    vote_count      INTEGER DEFAULT 0,
+    text_count      INTEGER DEFAULT 0,
+    is_relevant     BOOLEAN DEFAULT FALSE,
+    relevance_tags  TEXT[],
+    imported_at     TIMESTAMPTZ DEFAULT NOW()
+);
+
+CREATE INDEX IF NOT EXISTS idx_ls_bills_state        ON legiscan_bills(state);
+CREATE INDEX IF NOT EXISTS idx_ls_bills_session      ON legiscan_bills(session_id);
+CREATE INDEX IF NOT EXISTS idx_ls_bills_status       ON legiscan_bills(status);
+CREATE INDEX IF NOT EXISTS idx_ls_bills_relevant     ON legiscan_bills(is_relevant) WHERE is_relevant;
+CREATE INDEX IF NOT EXISTS idx_ls_bills_subjects     ON legiscan_bills USING gin(subjects);
+CREATE INDEX IF NOT EXISTS idx_ls_bills_rtags        ON legiscan_bills USING gin(relevance_tags);
+CREATE INDEX IF NOT EXISTS idx_ls_bills_fts ON legiscan_bills
+    USING gin(to_tsvector('english',
+        COALESCE(title, '') || ' ' || COALESCE(description, '')));
+"""
+
+
+def setup_db(conn):
+    with conn.cursor() as cur:
+        cur.execute(DDL)
+    conn.commit()
+    log.info("Database tables and indexes ready.")
+
+
+# ---------------------------------------------------------------------------
+# LegiScan API helpers
+# ---------------------------------------------------------------------------
+
+def _api_get(params: dict, timeout: int = 120) -> dict:
+    """Make one LegiScan API call and return the parsed JSON."""
+    params["key"] = API_KEY
+    resp = requests.get(API_BASE, params=params, timeout=timeout)
+    resp.raise_for_status()
+    data = resp.json()
+    if data.get("status") != "OK":
+        raise RuntimeError(f"LegiScan API error: {data}")
+    return data
+
+
+def get_all_dataset_metadata(year_start: int, state_filter: Optional[str] = None) -> list[dict]:
+    """Fetch full dataset list (one API call), filter to year_start+."""
+    log.info("Fetching dataset list from LegiScan…")
+    data = _api_get({"op": "getDatasetList"})
+    sessions = data["datasetlist"]
+    log.info(f"  Total sessions across all states: {len(sessions)}")
+    sessions = [s for s in sessions if s["year_start"] >= year_start]
+    if state_filter:
+        # Need to map state abbr → state_id.  Derive from a quick per-state call.
+        log.info(f"  Filtering to state {state_filter}…")
+        state_data = _api_get({"op": "getDatasetList", "state": state_filter})
+        valid_ids = {s["session_id"] for s in state_data["datasetlist"]}
+        sessions = [s for s in sessions if s["session_id"] in valid_ids]
+    log.info(f"  Sessions matching filters: {len(sessions)}")
+    return sessions
+
+
+def download_dataset_zip(session: dict, dry_run: bool = False) -> tuple[Optional[bytes], bool]:
+    """Download a dataset ZIP via the API; cache to disk.
+    Returns (zip_bytes, api_call_made) — api_call_made is True only when the
+    network was actually hit so the caller can rate-limit appropriately."""
+    session_id = session["session_id"]
+    dataset_hash = session["dataset_hash"]
+    access_key = session["access_key"]
+
+    CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    cache_path = CACHE_DIR / f"{session_id}_{dataset_hash}.zip"
+
+    if cache_path.exists():
+        log.debug(f"  Cache hit: {cache_path.name}")
+        return cache_path.read_bytes(), False
+
+    if dry_run:
+        log.info(f"  [dry-run] Would download session {session_id} ({session['dataset_size'] / 1e6:.1f} MB)")
+        return None, False
+
+    log.info(f"  Downloading session {session_id} ({session['dataset_size'] / 1e6:.1f} MB)…")
+    data = _api_get({"op": "getDataset", "access_key": access_key, "id": session_id})
+    zip_bytes = base64.b64decode(data["dataset"]["zip"])
+    cache_path.write_bytes(zip_bytes)
+    log.info(f"  Cached → {cache_path.name}")
+    return zip_bytes, True
+    return zip_bytes
+
+
+# ---------------------------------------------------------------------------
+# Relevance tagging
+# ---------------------------------------------------------------------------
+
+def score_relevance(title: str, description: str, subjects: list[str]) -> tuple[bool, list[str]]:
+    """Return (is_relevant, list_of_matched_tags)."""
+    haystack = " ".join([
+        (title or "").lower(),
+        (description or "").lower(),
+        " ".join(s.lower() for s in subjects),
+    ])
+    tags = []
+    for tag, keywords in RELEVANCE_KEYWORDS.items():
+        if any(kw in haystack for kw in keywords):
+            tags.append(tag)
+    return bool(tags), tags
+
+
+# ---------------------------------------------------------------------------
+# ZIP processing and DB loading
+# ---------------------------------------------------------------------------
+
+def _state_abbr_from_zip(zf: zipfile.ZipFile) -> str:
+    """Extract the state abbreviation from the ZIP's path structure."""
+    for name in zf.namelist():
+        parts = name.split("/")
+        if len(parts) >= 1 and len(parts[0]) == 2:
+            return parts[0]
+    return "??"
+
+
+def process_dataset(
+    session: dict,
+    zip_bytes: bytes,
+    conn,
+    state_abbr: Optional[str] = None,
+    dry_run: bool = False,
+    verbose: bool = False,
+) -> int:
+    """Parse all bill JSONs from a ZIP and upsert into legiscan_bills. Returns count."""
+    session_id = session["session_id"]
+
+    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
+        if not state_abbr:
+            state_abbr = _state_abbr_from_zip(zf)
+        bill_files = [n for n in zf.namelist() if "/bill/" in n and n.endswith(".json")]
+
+        if not bill_files:
+            log.warning(f"  Session {session_id}: no bill files found in ZIP.")
+            return 0
+
+        rows = []
+        for fname in bill_files:
+            try:
+                raw = json.loads(zf.read(fname))
+                b = raw.get("bill", raw)
+            except Exception as e:
+                log.warning(f"  Could not parse {fname}: {e}")
+                continue
+
+            subjects = [s["subject_name"] for s in (b.get("subjects") or []) if s.get("subject_name")]
+            is_rel, tags = score_relevance(
+                b.get("title", ""),
+                b.get("description", ""),
+                subjects,
+            )
+
+            status_date = b.get("status_date") or None
+            rows.append((
+                b["bill_id"],
+                session_id,
+                b.get("state", state_abbr),
+                b.get("bill_number"),
+                b.get("bill_type"),
+                b.get("title"),
+                b.get("description"),
+                b.get("status"),
+                status_date,
+                b.get("completed", 0),
+                b.get("body"),
+                b.get("url"),
+                b.get("state_link"),
+                b.get("change_hash"),
+                subjects or None,
+                len(b.get("sponsors") or []),
+                len(b.get("votes") or []),
+                len(b.get("texts") or []),
+                is_rel,
+                tags or None,
+            ))
+
+    if dry_run:
+        log.info(f"  [dry-run] Session {session_id} ({state_abbr}): would insert/update {len(rows)} bills")
+        return len(rows)
+
+    UPSERT = """
+        INSERT INTO legiscan_bills (
+            bill_id, session_id, state, bill_number, bill_type,
+            title, description, status, status_date, completed,
+            body, url, state_link, change_hash, subjects,
+            sponsor_count, vote_count, text_count,
+            is_relevant, relevance_tags, imported_at
+        ) VALUES %s
+        ON CONFLICT (bill_id) DO UPDATE SET
+            change_hash    = EXCLUDED.change_hash,
+            status         = EXCLUDED.status,
+            status_date    = EXCLUDED.status_date,
+            completed      = EXCLUDED.completed,
+            subjects       = EXCLUDED.subjects,
+            sponsor_count  = EXCLUDED.sponsor_count,
+            vote_count     = EXCLUDED.vote_count,
+            text_count     = EXCLUDED.text_count,
+            is_relevant    = EXCLUDED.is_relevant,
+            relevance_tags = EXCLUDED.relevance_tags,
+            imported_at    = NOW()
+        WHERE legiscan_bills.change_hash IS DISTINCT FROM EXCLUDED.change_hash
+    """
+
+    template = "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,NOW())"
+
+    with conn.cursor() as cur:
+        psycopg2.extras.execute_values(cur, UPSERT, rows, template=template, page_size=500)
+        count = cur.rowcount
+
+    # Update session bill_count
+    with conn.cursor() as cur:
+        cur.execute(
+            "UPDATE legiscan_sessions SET bill_count = %s, imported_at = NOW() WHERE session_id = %s",
+            (len(rows), session_id),
+        )
+    conn.commit()
+
+    if verbose:
+        relevant = sum(1 for r in rows if r[18])
+        log.info(f"  Session {session_id} ({state_abbr}): {len(rows)} bills, {relevant} relevant, {count} upserted")
+    return len(rows)
+
+
+def upsert_session(session: dict, state_abbr: str, conn, dry_run: bool = False):
+    """Insert or update a session record."""
+    if dry_run:
+        return
+    with conn.cursor() as cur:
+        cur.execute("""
+            INSERT INTO legiscan_sessions
+                (session_id, state_id, state_abbr, year_start, year_end,
+                 session_title, session_tag, is_special, is_prior,
+                 dataset_hash, dataset_date, dataset_size_mb)
+            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
+            ON CONFLICT (session_id) DO UPDATE SET
+                dataset_hash   = EXCLUDED.dataset_hash,
+                dataset_date   = EXCLUDED.dataset_date,
+                dataset_size_mb = EXCLUDED.dataset_size_mb,
+                session_title  = EXCLUDED.session_title
+        """, (
+            session["session_id"],
+            session["state_id"],
+            state_abbr,
+            session["year_start"],
+            session["year_end"],
+            session.get("session_title"),
+            session.get("session_tag"),
+            bool(session.get("special")),
+            bool(session.get("prior")),
+            session.get("dataset_hash"),
+            session.get("dataset_date"),
+            session.get("dataset_size", 0) / 1e6,
+        ))
+    conn.commit()
+
+
+def needs_import(session: dict, conn) -> bool:
+    """Return True if this session's dataset_hash differs from what's in the DB."""
+    with conn.cursor() as cur:
+        cur.execute(
+            "SELECT dataset_hash FROM legiscan_sessions WHERE session_id = %s",
+            (session["session_id"],),
+        )
+        row = cur.fetchone()
+    if row is None:
+        return True
+    return row[0] != session["dataset_hash"]
+
+
+# ---------------------------------------------------------------------------
+# Retag phase
+# ---------------------------------------------------------------------------
+
+def retag_all_bills(conn, dry_run: bool = False, verbose: bool = False):
+    """Re-score relevance for every bill already in the DB."""
+    log.info("Re-tagging all bills…")
+    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+        cur.execute("SELECT bill_id, title, description, subjects FROM legiscan_bills")
+        rows = cur.fetchall()
+
+    log.info(f"  Scoring {len(rows)} bills…")
+    updates = []
+    for row in rows:
+        is_rel, tags = score_relevance(
+            row["title"] or "",
+            row["description"] or "",
+            row["subjects"] or [],
+        )
+        updates.append((is_rel, tags or None, row["bill_id"]))
+
+    if dry_run:
+        relevant = sum(1 for u in updates if u[0])
+        log.info(f"  [dry-run] Would tag {relevant}/{len(updates)} bills as relevant")
+        return
+
+    with conn.cursor() as cur:
+        psycopg2.extras.execute_values(
+            cur,
+            "UPDATE legiscan_bills SET is_relevant = data.is_rel, relevance_tags = data.tags "
+            "FROM (VALUES %s) AS data(is_rel, tags, bill_id) "
+            "WHERE legiscan_bills.bill_id = data.bill_id::integer",
+            updates,
+            template="(%s, %s::text[], %s)",
+        )
+    conn.commit()
+
+    relevant = sum(1 for u in updates if u[0])
+    log.info(f"  Tagged {relevant}/{len(updates)} bills as relevant.")
+
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+
+def print_summary(conn):
+    queries = {
+        "Total sessions":  "SELECT COUNT(*) FROM legiscan_sessions",
+        "Total bills":     "SELECT COUNT(*) FROM legiscan_bills",
+        "Relevant bills":  "SELECT COUNT(*) FROM legiscan_bills WHERE is_relevant",
+        "States covered":  "SELECT COUNT(DISTINCT state) FROM legiscan_bills",
+    }
+    print("\n--- LegiScan ingestion summary ---")
+    with conn.cursor() as cur:
+        for label, sql in queries.items():
+            cur.execute(sql)
+            print(f"  {label}: {cur.fetchone()[0]:,}")
+
+    # Top relevance tags
+    with conn.cursor() as cur:
+        cur.execute("""
+            SELECT tag, COUNT(*) AS n
+            FROM legiscan_bills, unnest(relevance_tags) AS tag
+            GROUP BY tag ORDER BY n DESC
+        """)
+        rows = cur.fetchall()
+    if rows:
+        print("\n  Relevant bills by tag:")
+        for tag, n in rows:
+            print(f"    {tag:<30} {n:>6,}")
+
+    # Top states for relevant bills
+    with conn.cursor() as cur:
+        cur.execute("""
+            SELECT state, COUNT(*) AS n
+            FROM legiscan_bills WHERE is_relevant
+            GROUP BY state ORDER BY n DESC LIMIT 15
+        """)
+        rows = cur.fetchall()
+    if rows:
+        print("\n  Top states by relevant bill count:")
+        for state, n in rows:
+            print(f"    {state}  {n:>5,}")
+    print()
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def parse_args():
+    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--all",       action="store_true", help="Run setup-db + fetch + load + tag")
+    p.add_argument("--setup-db",  action="store_true", help="Create/update DB tables")
+    p.add_argument("--fetch",     action="store_true", help="Download dataset ZIPs")
+    p.add_argument("--load",      action="store_true", help="Load cached ZIPs into DB")
+    p.add_argument("--tag",       action="store_true", help="Retag all bills for relevance")
+    p.add_argument("--state",     default=None, metavar="XX", help="Limit to one state")
+    p.add_argument("--year-start", type=int, default=MIN_YEAR_DEFAULT, dest="year_start")
+    p.add_argument("--dry-run",   action="store_true")
+    p.add_argument("--verbose",   action="store_true")
+    return p.parse_args()
+
+
+def main():
+    args = parse_args()
+    if args.verbose:
+        log.setLevel(logging.DEBUG)
+
+    if not API_KEY:
+        log.error("LEGISCAN_API_KEY is not set.")
+        sys.exit(1)
+
+    do_setup = args.all or args.setup_db
+    do_fetch = args.all or args.fetch
+    do_load  = args.all or args.load
+    do_tag   = args.all or args.tag
+
+    if not any([do_setup, do_fetch, do_load, do_tag]):
+        log.error("Specify at least one phase: --all, --setup-db, --fetch, --load, --tag")
+        sys.exit(1)
+
+    conn = None if args.dry_run else get_db_connection()
+
+    # ── Setup ──────────────────────────────────────────────────────────────
+    if do_setup:
+        if args.dry_run:
+            log.info("[dry-run] Would create legiscan_sessions and legiscan_bills tables.")
+        else:
+            setup_db(conn)
+
+    # ── Fetch + Load (interleaved per session for memory efficiency) ────────
+    if do_fetch or do_load:
+        sessions = get_all_dataset_metadata(args.year_start, state_filter=args.state)
+        total = len(sessions)
+        log.info(f"Processing {total} sessions (year_start ≥ {args.year_start})…")
+
+        total_bills = 0
+        skipped = 0
+
+        for i, session in enumerate(sessions, 1):
+            session_id = session["session_id"]
+            state_id   = session["state_id"]
+            year_start = session["year_start"]
+            title      = session.get("session_title", "")
+
+            # Check if import needed
+            if do_load and not args.dry_run and conn and not needs_import(session, conn):
+                log.debug(f"  [{i}/{total}] Session {session_id} ({title}) — hash unchanged, skipping.")
+                skipped += 1
+                continue
+
+            log.info(f"[{i}/{total}] Session {session_id}: {title}")
+
+            # Download
+            zip_bytes = None
+            if do_fetch:
+                try:
+                    zip_bytes, api_called = download_dataset_zip(session, dry_run=args.dry_run)
+                    if api_called:
+                        time.sleep(RATE_LIMIT_DELAY)
+                except Exception as e:
+                    log.error(f"  Download failed for session {session_id}: {e}")
+                    continue
+            elif do_load:
+                # Load from cache only
+                cache_path = CACHE_DIR / f"{session_id}_{session['dataset_hash']}.zip"
+                if not cache_path.exists():
+                    log.warning(f"  Cache miss for session {session_id} — run --fetch first.")
+                    continue
+                zip_bytes = cache_path.read_bytes()
+
+            # Derive state abbreviation from ZIP structure
+            state_abbr = args.state
+            if zip_bytes and not state_abbr:
+                try:
+                    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
+                        state_abbr = _state_abbr_from_zip(zf)
+                except Exception:
+                    state_abbr = "??"
+
+            # Upsert session record
+            if do_load and not args.dry_run and conn and state_abbr:
+                upsert_session(session, state_abbr, conn, dry_run=args.dry_run)
+
+            # Load bills
+            if do_load and zip_bytes:
+                try:
+                    n = process_dataset(
+                        session, zip_bytes, conn,
+                        state_abbr=state_abbr,
+                        dry_run=args.dry_run,
+                        verbose=args.verbose,
+                    )
+                    total_bills += n
+                except Exception as e:
+                    log.error(f"  Load failed for session {session_id}: {e}")
+                    if conn:
+                        conn.rollback()
+
+        log.info(f"Fetch/load complete. Bills processed: {total_bills:,}. Skipped (up-to-date): {skipped}.")
+
+    # ── Tag ────────────────────────────────────────────────────────────────
+    if do_tag and not (do_fetch or do_load):
+        if args.dry_run or conn:
+            retag_all_bills(conn, dry_run=args.dry_run, verbose=args.verbose)
+
+    # ── Summary ────────────────────────────────────────────────────────────
+    if conn and not args.dry_run:
+        print_summary(conn)
+        conn.close()
+
+
+if __name__ == "__main__":
+    main()