Reorganize project into scripts/, docs/, data/, output/ directories

Move all Python scripts to scripts/, documentation to docs/, raw input data to data/, and generated HTML/CSV outputs to output/. Update path references in 8 scripts to use Path(__file__).parent.parent as the project root so they work correctly from the new location. Update README links and quick-start commands accordingly. Notebooks remain at root.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 21:57:22 -07:00
parent a2e295d95b
commit ee5856661a
40 changed files with 31 additions and 30 deletions
--- a/scripts/analyze_cables_concentration.py
+++ b/scripts/analyze_cables_concentration.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+"""Quick statistical analysis: are US data centers spatially tied to submarine
+cables, and does the resulting pattern look like concentrated costs / dispersed
+benefits?
+"""
+import math
+import os
+import statistics
+from collections import Counter
+
+import psycopg2
+
+
+def connect():
+    """Open a psycopg2 connection to the ``data_centers`` database.
+
+    Host, port, user and password come from the PGWEB_HOST, PGWEB_PORT,
+    PGWEB_USER and PGWEB_PASSWORD environment variables; a missing variable
+    raises ``KeyError``.
+    """
+    env = os.environ
+    settings = {
+        "host": env["PGWEB_HOST"],
+        "port": env["PGWEB_PORT"],
+        "user": env["PGWEB_USER"],
+        "password": env["PGWEB_PASSWORD"],
+        "dbname": "data_centers",
+    }
+    return psycopg2.connect(**settings)
+
+
+def quantiles(xs, qs=(0.1, 0.25, 0.5, 0.75, 0.9)):
+    """Return ``{q: value}`` for each requested quantile of *xs*.
+
+    Values are linearly interpolated between the two nearest order
+    statistics. Every quantile maps to ``None`` when *xs* is empty.
+    """
+    ordered = sorted(xs)
+    count = len(ordered)
+    if count == 0:
+        return {q: None for q in qs}
+
+    result = {}
+    for q in qs:
+        # Fractional index of the quantile in the sorted sample.
+        pos = (count - 1) * q
+        below = math.floor(pos)
+        above = math.ceil(pos)
+        if below == above:
+            result[q] = ordered[int(pos)]
+            continue
+        span = ordered[above] - ordered[below]
+        result[q] = ordered[below] + span * (pos - below)
+    return result
+
+
+def gini(values):
+    """Return the Gini coefficient of the non-negative entries of *values*.
+
+    ``None`` and negative entries are dropped first. Returns ``None`` when
+    nothing is left or the remaining values sum to zero.
+    """
+    ordered = sorted(x for x in values if x is not None and x >= 0)
+    count = len(ordered)
+    total = sum(ordered)
+    if count == 0 or total == 0:
+        return None
+    # Rank-weighted sum with 1-based ranks over the ascending values.
+    weighted = sum((rank * x for rank, x in enumerate(ordered, 1)), 0.0)
+    return (2 * weighted) / (count * total) - (count + 1) / count
+
+
+def hhi(shares):
+    """Return the Herfindahl-Hirschman Index: the sum of squared shares.
+
+    For shares that sum to ~1 the result lies in [0, 1].
+    """
+    squared = (share * share for share in shares)
+    return sum(squared)
+
+
+def mann_whitney_u_z(xs, ys):
+    """Approximate Mann-Whitney U test (normal approximation, large n).
+
+    Tied values receive the average of the ranks they span, and the variance
+    of U is reduced by the usual tie correction, so heavily tied samples do
+    not get an understated |z|. No continuity correction is applied.
+
+    Returns:
+        ``(U, z, p_two_sided)`` where U is the statistic for *xs*. When the
+        variance is zero (an empty sample, or every value tied) z is 0.0 and
+        p is 1.0.
+    """
+    combined = [(v, 0) for v in xs] + [(v, 1) for v in ys]
+    combined.sort(key=lambda t: t[0])
+    n = len(combined)
+    ranks = [0.0] * n
+    tie_term = 0  # sum of (t**3 - t) over groups of t tied values
+    i = 0
+    while i < n:
+        # Extend j to the last index holding the same value as index i.
+        j = i
+        while j + 1 < n and combined[j + 1][0] == combined[i][0]:
+            j += 1
+        avg_rank = (i + j) / 2 + 1
+        for k in range(i, j + 1):
+            ranks[k] = avg_rank
+        t = j - i + 1
+        tie_term += t ** 3 - t
+        i = j + 1
+    r1 = sum(r for r, (_, g) in zip(ranks, combined) if g == 0)
+    n1, n2 = len(xs), len(ys)
+    U1 = r1 - n1 * (n1 + 1) / 2
+    mu = n1 * n2 / 2
+    if n > 1:
+        # With no ties tie_term is 0 and this is n1*n2*(n+1)/12.
+        variance = n1 * n2 * ((n + 1) - tie_term / (n * (n - 1))) / 12
+    else:
+        variance = 0.0
+    sigma = math.sqrt(max(variance, 0.0))
+    z = (U1 - mu) / sigma if sigma else 0.0
+    # Two-sided p via the complementary error function.
+    p = math.erfc(abs(z) / math.sqrt(2))
+    return U1, z, p
+
+
+def _fmt3(value):
+    """Format *value* to three decimals; return "n/a" when it is None."""
+    return "n/a" if value is None else f"{value:.3f}"
+
+
+def main():
+    """Run the database queries and print the cable-proximity report.
+
+    Reads distances from data centers and cities to the nearest submarine
+    cable, data-center counts per state and IP counts per city, then prints
+    summary statistics. Empty result sets are reported instead of raising.
+    """
+    conn = connect()
+    try:
+        cur = conn.cursor()
+
+        # --- 1. Distance from each US data center to nearest submarine cable ---
+        cur.execute(
+            """
+            with cables_union as (
+                select ST_Union(geom)::geography as g from public.internet_cables
+            )
+            select ST_Distance(
+                ST_SetSRID(ST_MakePoint(dc.longitude, dc.latitude), 4326)::geography,
+                cu.g
+            ) / 1000.0  -- meters -> km
+            from public.us_dc_sample_geocoded dc, cables_union cu
+            where dc.longitude is not null and dc.latitude is not null
+              and (dc.country = 'United States' or dc.country is null)
+            """
+        )
+        # A NULL distance (e.g. no cable geometry) cannot be converted; skip it.
+        dc_km = [float(r[0]) for r in cur.fetchall() if r[0] is not None]
+
+        # --- 2. Distance from US city-dominance points to nearest cable ---
+        cur.execute(
+            """
+            with cables_union as (
+                select ST_Union(geom)::geography as g from public.internet_cables
+            )
+            select ST_Distance(
+                ST_SetSRID(ST_MakePoint(c.longitude, c.latitude), 4326)::geography,
+                cu.g
+            ) / 1000.0
+            from public.internet_city_dominance c, cables_union cu
+            where c.country = 'US' and c.geom is not null
+            """
+        )
+        city_km = [float(r[0]) for r in cur.fetchall() if r[0] is not None]
+
+        # --- 3. DC distribution by state (cost concentration) ---
+        # NOTE(review): unlike query 1 this has no country filter, so the
+        # state totals can cover more rows than the distance sample; confirm
+        # that is intended.
+        cur.execute(
+            """
+            select coalesce(nullif(state_code, ''), 'UNK') as st, count(*)
+            from public.us_dc_sample_geocoded
+            where longitude is not null and latitude is not null
+            group by 1
+            """
+        )
+        state_counts = dict(cur.fetchall())
+
+        # --- 4. IP distribution across US cities (benefit dispersion proxy) ---
+        cur.execute(
+            """
+            select city, coalesce(logical_dominance_ips, 0)
+            from public.internet_city_dominance
+            where country = 'US' and logical_dominance_ips is not null
+            """
+        )
+        city_ips = [(r[0], int(r[1])) for r in cur.fetchall()]
+
+        cur.close()
+    finally:
+        # Release the connection even when a query fails.
+        conn.close()
+
+    total_dc = sum(state_counts.values())
+    state_shares = (
+        {k: v / total_dc for k, v in state_counts.items()} if total_dc else {}
+    )
+    total_ips = sum(v for _, v in city_ips)
+    ip_shares = [v / total_ips for _, v in city_ips] if total_ips else []
+
+    # --- 5. Where do the people-with-IPs LIVE relative to the DCs? ---
+    # Top-N US dominance cities, share of national IPs each captures.
+    top_ip_cities = sorted(city_ips, key=lambda t: -t[1])[:10]
+
+    # ======= report =======
+    print("=" * 70)
+    print("ARE US DATA CENTERS TIED TO SUBMARINE CABLE LANDINGS?")
+    print("=" * 70)
+    print(f"\nN data centers analyzed: {len(dc_km):,}")
+    print(f"N US city-dominance pts: {len(city_km):,}")
+
+    def fmt_q(label, xs):
+        print(f"\n{label}:")
+        if not xs:
+            # mean() and the share computation below need at least one value.
+            print("  (no rows)")
+            return
+        q = quantiles(xs)
+        print(f"  mean = {statistics.mean(xs):,.1f} km")
+        print(f"  median (p50) = {q[0.5]:,.1f} km")
+        print(f"  p10 / p25 / p75 / p90 = "
+              f"{q[0.1]:,.1f} / {q[0.25]:,.1f} / {q[0.75]:,.1f} / {q[0.9]:,.1f} km")
+        for thr in (10, 50, 100, 250):
+            frac = sum(1 for x in xs if x <= thr) / len(xs)
+            print(f"  share within {thr:>3} km of a cable: {frac*100:5.1f}%")
+
+    fmt_q("Distance: US DATA CENTERS -> nearest submarine cable", dc_km)
+    fmt_q("Distance: US POPULATION CITIES -> nearest submarine cable", city_km)
+
+    if dc_km and city_km:
+        U, z, p = mann_whitney_u_z(dc_km, city_km)
+        print(f"\nMann-Whitney U (DCs vs. cities, two-sided): U={U:,.0f}, z={z:.2f}, "
+              f"p≈{p:.2e}")
+        dc_median = statistics.median(dc_km)
+        city_median = statistics.median(city_km)
+        if dc_median < city_median:
+            diff = city_median - dc_median
+            print(f"  -> DC median is {diff:,.1f} km CLOSER to cables than city median.")
+        else:
+            print("  -> DCs are not closer to cables than cities.")
+    else:
+        print("\nMann-Whitney U skipped: one of the two samples is empty.")
+
+    print("\n" + "=" * 70)
+    print("CONCENTRATION OF COSTS (data centers by state)")
+    print("=" * 70)
+    g_dc = gini(list(state_counts.values()))
+    h_dc = hhi(list(state_shares.values()))
+    print(f"States covered: {len(state_counts)}")
+    print(f"Gini of DC counts across states: {_fmt3(g_dc)}  (0=even, 1=one state takes all)")
+    print(f"HHI of state shares:              {h_dc:.3f}  (0.18+ = highly concentrated)")
+    top_states = sorted(state_shares.items(), key=lambda t: -t[1])[:8]
+    cum = 0.0
+    print("\nTop states by share of US data centers:")
+    for st, s in top_states:
+        cum += s
+        print(f"  {st}: {s*100:5.1f}%  ({state_counts[st]:>4} DCs)  cum={cum*100:5.1f}%")
+
+    print("\n" + "=" * 70)
+    print("DISPERSION OF BENEFITS (US IPs across cities)")
+    print("=" * 70)
+    g_ip = gini([v for _, v in city_ips])
+    h_ip = hhi(ip_shares)
+    print(f"US cities with IP data: {len(city_ips):,}")
+    print(f"Gini of IPs across cities: {_fmt3(g_ip)}")
+    print(f"HHI of IP shares:          {h_ip:.3f}")
+    cum = 0.0
+    print("\nTop US cities by share of national IPs:")
+    for city, ips in top_ip_cities:
+        # total_ips is 0 when every city reports zero IPs.
+        s = ips / total_ips if total_ips else 0.0
+        cum += s
+        print(f"  {city:<30}  {s*100:5.2f}%  ({ips:>11,} IPs)  cum={cum*100:5.2f}%")
+
+    print("\n" + "=" * 70)
+    print("INTERPRETATION")
+    print("=" * 70)
+    print(f"""
+Cost concentration (DCs across states):   Gini={_fmt3(g_dc)}  HHI={h_dc:.3f}
+Benefit dispersion (IPs across cities):   Gini={_fmt3(g_ip)}  HHI={h_ip:.3f}
+
+A "concentrated costs / dispersed benefits" pattern requires:
+  (1) DCs cluster in a few places (high state-level Gini/HHI).
+  (2) Users they serve span many places (low city-level Gini/HHI, ideally).
+  (3) That clustering is plausibly tied to fixed infrastructure (cables).
+
+Check signs above:
+  - DC location vs cable proximity: see Mann-Whitney result.
+  - Cost concentration: compare DC HHI to a "competitive" benchmark (~0.10).
+  - Benefit dispersion: an IP-share Gini/HHI well BELOW the DC-state Gini/HHI
+    would corroborate the asymmetry (benefits more evenly distributed than
+    costs); a higher IP-share Gini means benefits are the MORE concentrated
+    side.
+""")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/analyze_dc_tract_concentration.py
+++ b/scripts/analyze_dc_tract_concentration.py
@@ -0,0 +1,320 @@
+#!/usr/bin/env python3
+"""Tract-level analysis of the 'concentrated costs / dispersed benefits' frame
+for US data-center siting.
+
+Cost-bearing universe = tracts that host at least one DC
+  (public.data_center_census_tracts_2024)
+Comparison universe = ACS 2024 5-yr tracts in the selected states
+  (census_tract_acs_2024_selected_states.csv)
+"""
+import csv
+import math
+import os
+import statistics
+from collections import Counter
+
+import psycopg2
+
+
+_CSV_NAME = "census_tract_acs_2024_selected_states.csv"
+
+
+def _resolve_csv_path(name=_CSV_NAME):
+    """Locate the ACS tract CSV regardless of the current working directory.
+
+    A file named *name* in the working directory wins, which is the original
+    behavior. Otherwise the project root (the parent of this script's
+    directory) and its ``data/`` and ``output/`` folders are searched. When
+    nothing is found the bare name is returned, so opening it fails exactly
+    as it did before.
+
+    NOTE(review): which folder the CSV was moved to is not visible from this
+    script; trim the candidate list once that is confirmed.
+    """
+    if os.path.exists(name):
+        return name
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    for subdir in ("data", "output", ""):
+        candidate = os.path.join(project_root, subdir, name)
+        if os.path.exists(candidate):
+            return candidate
+    return name
+
+
+CSV_PATH = _resolve_csv_path()
+
+
+def connect():
+    """Return a new psycopg2 connection to the ``data_centers`` database.
+
+    Connection settings are taken from the PGWEB_HOST, PGWEB_PORT, PGWEB_USER
+    and PGWEB_PASSWORD environment variables (``KeyError`` if one is unset).
+    """
+    host = os.environ["PGWEB_HOST"]
+    port = os.environ["PGWEB_PORT"]
+    user = os.environ["PGWEB_USER"]
+    password = os.environ["PGWEB_PASSWORD"]
+    return psycopg2.connect(
+        host=host, port=port, user=user, password=password, dbname="data_centers"
+    )
+
+
+def gini(values):
+    """Gini coefficient of the non-negative, non-None entries of *values*.
+
+    Returns ``None`` when no usable values remain or when they sum to zero.
+    """
+    kept = sorted(x for x in values if x is not None and x >= 0)
+    size = len(kept)
+    if size == 0:
+        return None
+    total = sum(kept)
+    if total == 0:
+        return None
+    # Accumulate rank * value with 1-based ranks over the ascending values.
+    weighted = 0
+    for rank, value in enumerate(kept, 1):
+        weighted += rank * value
+    return (2 * weighted) / (size * total) - (size + 1) / size
+
+
+def hhi(shares):
+    """Return the Herfindahl-Hirschman Index, i.e. the sum of squared shares."""
+    return sum(map(lambda share: share * share, shares))
+
+
+def median(xs):
+    """Median of the non-None entries of *xs*, or None when there are none."""
+    present = list(filter(lambda v: v is not None, xs))
+    if not present:
+        return None
+    return statistics.median(present)
+
+
+def mean(xs):
+    """Arithmetic mean of the non-None entries of *xs*, or None if none exist."""
+    present = list(filter(lambda v: v is not None, xs))
+    if not present:
+        return None
+    return statistics.mean(present)
+
+
+def wmean(xs, ws):
+    """Weighted mean of *xs* using weights *ws*.
+
+    Pairs are skipped when the value or the weight is None, or when the
+    weight is not positive. Returns None when no pair survives.
+    """
+    weight_sum = 0
+    weighted_sum = 0
+    used = 0
+    for value, weight in zip(xs, ws):
+        if value is None or weight is None or not weight > 0:
+            continue
+        weight_sum += weight
+        weighted_sum += value * weight
+        used += 1
+    if used == 0:
+        return None
+    return weighted_sum / weight_sum
+
+
+def to_float(s):
+    """Convert *s* to float; return None when it cannot be converted."""
+    converted = None
+    try:
+        converted = float(s)
+    except (TypeError, ValueError):
+        pass
+    return converted
+
+
+def to_int(s):
+    """Convert *s* to int by way of float, truncating any fractional part.
+
+    Returns None when *s* is None, not numeric, NaN, or infinite.
+    """
+    try:
+        return int(float(s))
+    except (TypeError, ValueError, OverflowError):
+        # int(float("inf")) raises OverflowError; int(float("nan")) ValueError.
+        return None
+
+
+def _fmt_or_na(value, spec):
+    """Format *value* with format spec *spec*; return "n/a" when it is None."""
+    return "n/a" if value is None else format(value, spec)
+
+
+def _direction(a, b, if_greater, otherwise):
+    """Return *if_greater* when a > b, *otherwise* when not, or "indeterminate" if either is None."""
+    if a is None or b is None:
+        return "indeterminate"
+    return if_greater if a > b else otherwise
+
+
+def main():
+    """Print the tract-level concentrated-costs / dispersed-benefits report.
+
+    Loads DC-hosting tracts and their distance to the nearest cable from the
+    database, loads the comparison universe from CSV_PATH, and writes the
+    report to stdout. Missing or empty inputs are reported as "n/a" rather
+    than raising.
+    """
+    conn = connect()
+    try:
+        cur = conn.cursor()
+
+        # DC-hosting tracts (the cost-bearing universe) ----------------------
+        cur.execute(
+            """
+            select
+                geoid,
+                statefp,
+                data_center_count,
+                population,
+                households,
+                broadband_subscription_pct,
+                median_household_income,
+                per_capita_income,
+                poverty_rate,
+                non_hispanic_white_pct,
+                non_hispanic_black_pct,
+                hispanic_latino_pct,
+                non_hispanic_asian_pct,
+                primary_industry,
+                land_area_sqm,
+                industry_information_workers,
+                industry_total_workers
+            from public.data_center_census_tracts_2024
+            """
+        )
+        dc_tracts = []
+        for r in cur.fetchall():
+            dc_tracts.append(
+                {
+                    "geoid": r[0],
+                    "statefp": r[1],
+                    "dc_count": r[2] or 0,
+                    "pop": r[3],
+                    "hh": r[4],
+                    "broadband_pct": float(r[5]) if r[5] is not None else None,
+                    "mhi": r[6],
+                    "pci": r[7],
+                    "poverty": float(r[8]) if r[8] is not None else None,
+                    "white_pct": float(r[9]) if r[9] is not None else None,
+                    "black_pct": float(r[10]) if r[10] is not None else None,
+                    "hisp_pct": float(r[11]) if r[11] is not None else None,
+                    "asian_pct": float(r[12]) if r[12] is not None else None,
+                    "primary_industry": r[13],
+                    "land_sqm": r[14],
+                    "info_workers": r[15],
+                    "total_workers": r[16],
+                }
+            )
+
+        # Distance from each DC tract to nearest cable (km) ----------------
+        cur.execute(
+            """
+            with cables as (select ST_Union(geom)::geography g from public.internet_cables)
+            select t.geoid,
+                   ST_Distance(ST_Centroid(t.geom)::geography, c.g) / 1000.0
+            from public.data_center_census_tracts_2024 t, cables c
+            """
+        )
+        # A NULL distance cannot be converted; such tracts get dist_km=None.
+        dist_by_geoid = {
+            r[0]: float(r[1]) for r in cur.fetchall() if r[1] is not None
+        }
+        cur.close()
+    finally:
+        # Release the connection even when a query fails.
+        conn.close()
+
+    for t in dc_tracts:
+        t["dist_km"] = dist_by_geoid.get(t["geoid"])
+
+    # Comparison universe from the wider ACS CSV ------------------------
+    universe = []
+    with open(CSV_PATH, newline="", encoding="utf-8") as f:
+        for row in csv.DictReader(f):
+            universe.append(
+                {
+                    "geoid": row["geoid"],
+                    "statefp": row["statefp"],
+                    "pop": to_int(row["population"]),
+                    "broadband_pct": to_float(row["broadband_subscription_pct"]),
+                    "mhi": to_int(row["median_household_income"]),
+                    "pci": to_int(row["per_capita_income"]),
+                    "poverty": to_float(row["poverty_rate"]),
+                    "white_pct": to_float(row["non_hispanic_white_pct"]),
+                    "black_pct": to_float(row["non_hispanic_black_pct"]),
+                    "hisp_pct": to_float(row["hispanic_latino_pct"]),
+                    "asian_pct": to_float(row["non_hispanic_asian_pct"]),
+                }
+            )
+
+    dc_geoids = {t["geoid"] for t in dc_tracts}
+    non_dc = [u for u in universe if u["geoid"] not in dc_geoids]
+
+    # Restrict comparison to states actually represented in the DC sample
+    dc_states = {t["statefp"] for t in dc_tracts}
+    universe_in_dc_states = [u for u in universe if u["statefp"] in dc_states]
+    non_dc_in_dc_states = [u for u in non_dc if u["statefp"] in dc_states]
+
+    # ============== report ==============
+    print("=" * 72)
+    print("TRACT-LEVEL CONCENTRATED COSTS / DISPERSED BENEFITS ANALYSIS")
+    print("=" * 72)
+
+    total_dc = sum(t["dc_count"] for t in dc_tracts)
+    print(f"\nDC-hosting tracts:                  {len(dc_tracts):,}")
+    print(f"Data centers in those tracts:       {total_dc:,}")
+    print(f"ACS universe (selected states):     {len(universe):,} tracts")
+    print(f"States represented in DC sample:    {len(dc_states)}")
+    print(f"Universe restricted to DC states:   {len(universe_in_dc_states):,} tracts")
+
+    if not total_dc:
+        # Every share below divides by total_dc.
+        print("\nNo data centers found; nothing to analyze.")
+        return
+
+    # --- Cost concentration at the tract level ---
+    print("\n" + "-" * 72)
+    print("1. COST CONCENTRATION (DCs across tracts)")
+    print("-" * 72)
+    counts = [t["dc_count"] for t in dc_tracts]
+    shares = [c / total_dc for c in counts]
+    g_dc = gini(counts)
+    h_dc = hhi(shares)
+    print(f"Gini of DC counts across DC-hosting tracts:       {_fmt_or_na(g_dc, '.3f')}")
+    print(f"HHI of DC shares across DC-hosting tracts:        {h_dc:.4f}")
+    # Top 1% / 5% / 20% of tracts share (always at least one tract)
+    top1 = max(1, len(counts) // 100)
+    top5 = max(1, len(counts) // 20)
+    top20 = max(1, len(counts) // 5)
+    s = sorted(counts, reverse=True)
+    top1_pct = sum(s[:top1]) / total_dc * 100
+    top5_pct = sum(s[:top5]) / total_dc * 100
+    top20_pct = sum(s[:top20]) / total_dc * 100
+    print(f"Top  1% of DC-hosting tracts ({top1:>3} tracts) hold "
+          f"{top1_pct:5.1f}% of all DCs")
+    print(f"Top  5% of DC-hosting tracts ({top5:>3} tracts) hold "
+          f"{top5_pct:5.1f}% of all DCs")
+    print(f"Top 20% of DC-hosting tracts ({top20:>3} tracts) hold "
+          f"{top20_pct:5.1f}% of all DCs")
+
+    # How small a fraction of population lives in a DC tract?
+    pop_dc = sum(t["pop"] or 0 for t in dc_tracts)
+    pop_universe = sum(u["pop"] or 0 for u in universe_in_dc_states)
+    pop_share = pop_dc / pop_universe * 100 if pop_universe else None
+    print(f"\nPopulation living in a DC-hosting tract:          {pop_dc:>11,}")
+    print(f"Total population (DC-states ACS universe):        {pop_universe:>11,}")
+    if pop_share is not None:
+        print(f"  -> {pop_share:.2f}% of people in DC-hosting states "
+              f"live in a DC-hosting tract")
+    # Per-capita DC density
+    if pop_dc:
+        print(f"  -> 1 DC per {pop_dc/total_dc:,.0f} residents in DC-hosting tracts")
+    if pop_universe:
+        print(f"     vs. 1 DC per {pop_universe/total_dc:,.0f} residents "
+              f"averaged across DC-state population")
+
+    # --- Profile of cost-bearing communities ---
+    print("\n" + "-" * 72)
+    print("2. WHO BEARS THE COSTS? (ACS profile of DC tracts vs. peer tracts)")
+    print("-" * 72)
+    fields = [
+        ("Median household income ($)",      "mhi",            "{:>10,.0f}"),
+        ("Per-capita income ($)",            "pci",            "{:>10,.0f}"),
+        ("Broadband subscription (%)",       "broadband_pct",  "{:>10,.1f}"),
+        ("Poverty rate (%)",                 "poverty",        "{:>10,.1f}"),
+        ("Non-Hispanic White (%)",           "white_pct",      "{:>10,.1f}"),
+        ("Non-Hispanic Black (%)",           "black_pct",      "{:>10,.1f}"),
+        ("Hispanic/Latino (%)",              "hisp_pct",       "{:>10,.1f}"),
+        ("Non-Hispanic Asian (%)",           "asian_pct",      "{:>10,.1f}"),
+    ]
+    label_w = max(len(lbl) for lbl, *_ in fields)
+    print(f"{'Field':<{label_w}}  {'DC tracts':>12}  {'Non-DC peers':>14}  "
+          f"{'Δ (DC − peer)':>15}")
+    for label, key, fmt in fields:
+        dc_med = median([t[key] for t in dc_tracts])
+        peer_med = median([u[key] for u in non_dc_in_dc_states])
+        if dc_med is None or peer_med is None:
+            continue
+        delta = dc_med - peer_med
+        cell_dc = fmt.format(dc_med)
+        cell_pe = fmt.format(peer_med)
+        cell_dl = fmt.format(delta)
+        print(f"{label:<{label_w}}  {cell_dc}  {cell_pe}  {cell_dl}")
+
+    print("\nPopulation-weighted means (DC tracts):")
+    pops = [t["pop"] for t in dc_tracts]
+    for label, key, _ in fields:
+        wm = wmean([t[key] for t in dc_tracts], pops)
+        if wm is not None:
+            print(f"  {label:<{label_w}}  {wm:>12,.1f}")
+
+    print("\nPrimary-industry mix of DC-hosting tracts (count of tracts):")
+    for industry, n in Counter(t["primary_industry"] for t in dc_tracts).most_common(10):
+        print(f"  {n:>4}  {industry}")
+
+    # --- Cable vs. inland subgroups ---
+    print("\n" + "-" * 72)
+    print("3. CABLE-ADJACENT vs. INLAND DC TRACTS")
+    print("-" * 72)
+    near = [t for t in dc_tracts if t.get("dist_km") is not None and t["dist_km"] <= 100]
+    far = [t for t in dc_tracts if t.get("dist_km") is not None and t["dist_km"] > 100]
+    print(f"≤100 km from a submarine cable: {len(near):>3} tracts, "
+          f"{sum(t['dc_count'] for t in near):>4} DCs")
+    print(f">100 km from a submarine cable: {len(far):>3} tracts, "
+          f"{sum(t['dc_count'] for t in far):>4} DCs")
+    if near and far:
+        # A subgroup median is None when every tract lacks that field.
+        for row_label, key, spec in (
+            ("  Median MHI", "mhi", ">10,.0f"),
+            ("  Median broadband %", "broadband_pct", ">10,.1f"),
+            ("  Median DC count", "dc_count", ">10,.0f"),
+        ):
+            near_txt = _fmt_or_na(median([t[key] for t in near]), spec)
+            far_txt = _fmt_or_na(median([t[key] for t in far]), spec)
+            print(f"{row_label:<28}  near={near_txt}  far={far_txt}")
+
+    # --- Benefit-side proxy ---
+    print("\n" + "-" * 72)
+    print("4. BENEFIT DISPERSION (broadband subscribers across all tracts)")
+    print("-" * 72)
+    # Households are not in the CSV, so subscribers are approximated as
+    # (population / 2.5) * broadband_pct.
+    subs = []
+    for u in universe_in_dc_states:
+        if u["pop"] and u["broadband_pct"] is not None:
+            est_hh = u["pop"] / 2.5
+            subs.append(est_hh * u["broadband_pct"] / 100.0)
+    total_subs = sum(subs)
+    sg = gini(subs)
+    sh = hhi([x / total_subs for x in subs]) if total_subs else None
+    hhi_ratio = h_dc / sh if sh else None
+    ratio_txt = _fmt_or_na(hhi_ratio, ".0f")
+    print(f"Estimated total broadband subscribers (DC states):  {total_subs:>14,.0f}")
+    print(f"Gini of subscribers across {len(subs):,} tracts:       {_fmt_or_na(sg, '.3f')}")
+    print(f"HHI of subscribers across tracts:                   {_fmt_or_na(sh, '.5f')}")
+    # Compare to DC HHI
+    print("\nSide-by-side concentration (lower = more dispersed):")
+    print(f"  HHI of DCs across DC-hosting tracts:            {h_dc:.4f}")
+    print(f"  HHI of broadband subs across DC-state tracts:   {_fmt_or_na(sh, '.5f')}  "
+          f"({ratio_txt}x more concentrated for DCs)")
+
+    print("\n" + "=" * 72)
+    print("BOTTOM LINE")
+    print("=" * 72)
+    income_skew = _direction(
+        median([t["mhi"] for t in dc_tracts]),
+        median([u["mhi"] for u in non_dc_in_dc_states]),
+        "wealthier",
+        "poorer",
+    )
+    broadband_skew = _direction(
+        median([t["broadband_pct"] for t in dc_tracts]),
+        median([u["broadband_pct"] for u in non_dc_in_dc_states]),
+        "higher",
+        "lower",
+    )
+    pop_share_txt = _fmt_or_na(pop_share, ".2f")
+    print(f"""
+- DCs are extremely concentrated at the tract level: top 1% of host tracts
+  hold {top1_pct:.0f}% of all DCs; top 5% hold {top5_pct:.0f}%.
+- Only {pop_share_txt}% of residents of the DC-hosting states actually
+  live in a DC-hosting tract — costs (land use, power draw, water, traffic,
+  noise) fall on a tiny minority of communities.
+- DC-hosting tracts skew {income_skew} and {broadband_skew} broadband than peer
+  tracts. See deltas above for the demographic profile.
+- Broadband subscribers (proxy for who consumes cloud services) are far more
+  evenly distributed than DCs — HHI roughly {ratio_txt}× lower.
+  That asymmetry IS the classic concentrated-cost / dispersed-benefit shape.
+""")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/build_fcc_bdc_broadband_connection_table.py
+++ b/scripts/build_fcc_bdc_broadband_connection_table.py
@@ -0,0 +1,397 @@
+#!/usr/bin/env python3
+"""Build data-center broadband connection tables.
+
+Creates a per-data-center broadband connection table and, when FCC BDC API
+credentials are available, stores the FCC BDC public download catalog.
+
+Required DB env vars:
+  PGWEB_HOST, PGWEB_PORT, PGWEB_USER, PGWEB_PASSWORD
+
+FCC API env vars:
+  FCC_USERNAME or FCC_BDC_USERNAME       - FCC User Registration username/email
+  FCC_API_KEY or FCC_HASH_VALUE          - BDC public API hash_value token
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+from datetime import date, datetime
+from pathlib import Path
+from typing import Any
+
+import psycopg2
+import requests
+from psycopg2.extras import Json, execute_values
+
+
+# Name of the target PostgreSQL database.
+DB_NAME = "data_centers"
+
+# Schema-qualified table names. MASTER_TABLE and TRACT_TABLE are read when
+# the connection table is rebuilt; the other three are created by
+# create_tables().
+MASTER_TABLE = "public.master_data_centers"
+TRACT_TABLE = "public.data_center_census_tracts_2024"
+AS_OF_TABLE = "public.fcc_bdc_api_as_of_dates"
+FILES_TABLE = "public.fcc_bdc_availability_files"
+CONNECTION_TABLE = "public.data_center_broadband_connection"
+
+# Prefix for every FCC BDC API path, and the user-agent header sent with
+# each request.
+FCC_BASE_URL = "https://broadbandmap.fcc.gov/api/public"
+USER_AGENT = "data-center-fcc-bdc-loader/1.0"
+
+
+def load_zsh_secrets() -> None:
+    """Load shell secrets into this process without printing values.
+
+    Sources ``~/.zsh_secrets`` in a zsh login shell and copies every exported
+    variable that is not already set into ``os.environ``. This is best
+    effort: a missing secrets file, a missing ``zsh`` binary, or a failing
+    shell leaves the environment untouched, and require_env() then reports
+    whatever is still missing.
+    """
+    secrets = Path.home() / ".zsh_secrets"
+    if not secrets.exists():
+        return
+
+    try:
+        result = subprocess.run(
+            ["zsh", "-lc", "source ~/.zsh_secrets >/dev/null 2>&1; env"],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+    except (FileNotFoundError, subprocess.CalledProcessError) as exc:
+        # Only the exception type is reported so no secret value can leak.
+        print(
+            f"warning: could not load ~/.zsh_secrets via zsh ({type(exc).__name__})",
+            file=sys.stderr,
+        )
+        return
+
+    for line in result.stdout.splitlines():
+        key, sep, value = line.partition("=")
+        # `env` prints multi-line values across several lines; requiring an
+        # identifier-shaped key skips continuation lines that contain "=".
+        if not sep or not key.isidentifier():
+            continue
+        os.environ.setdefault(key, value)
+
+
+def require_env(keys: list[str]) -> None:
+    """Raise RuntimeError naming every key in *keys* that is unset or empty."""
+    missing = []
+    for key in keys:
+        if not os.getenv(key):
+            missing.append(key)
+    if not missing:
+        return
+    raise RuntimeError("Missing required env vars: " + ", ".join(missing))
+
+
+def get_conn():
+    """Open a psycopg2 connection to the DB_NAME database.
+
+    Host, port, user and password are read from the PGWEB_HOST, PGWEB_PORT,
+    PGWEB_USER and PGWEB_PASSWORD environment variables; a missing variable
+    raises ``KeyError``.
+    """
+    return psycopg2.connect(
+        host=os.environ["PGWEB_HOST"],
+        port=os.environ["PGWEB_PORT"],
+        user=os.environ["PGWEB_USER"],
+        password=os.environ["PGWEB_PASSWORD"],
+        # Use the module constant so the database name is defined once.
+        dbname=DB_NAME,
+    )
+
+
+def fcc_credentials() -> tuple[str | None, str | None]:
+    """Return ``(username, hash_value)`` for the FCC BDC API from the environment.
+
+    FCC_USERNAME takes precedence over FCC_BDC_USERNAME, and FCC_API_KEY over
+    FCC_HASH_VALUE. Either element is None when neither variable is set.
+    """
+    env = os.environ
+    username = env.get("FCC_USERNAME") or env.get("FCC_BDC_USERNAME")
+    hash_value = env.get("FCC_API_KEY") or env.get("FCC_HASH_VALUE")
+    return username, hash_value
+
+
+def fcc_get(path: str, *, params: dict[str, Any] | None = None) -> dict[str, Any]:
+    """GET ``FCC_BASE_URL + path`` and return the decoded JSON payload.
+
+    Raises RuntimeError when credentials are missing, or when the payload
+    carries status_code 401/403 or status "fail". HTTP-level errors are
+    raised by ``response.raise_for_status()``.
+    """
+    username, hash_value = fcc_credentials()
+    if not (username and hash_value):
+        raise RuntimeError(
+            "FCC BDC API requires FCC_USERNAME or FCC_BDC_USERNAME plus "
+            "FCC_API_KEY or FCC_HASH_VALUE."
+        )
+
+    response = requests.get(
+        f"{FCC_BASE_URL}{path}",
+        headers={
+            "username": username,
+            "hash_value": hash_value,
+            "user-agent": USER_AGENT,
+            "accept": "application/json",
+        },
+        params=params or {},
+        timeout=60,
+    )
+    response.raise_for_status()
+    payload = response.json()
+
+    # The API can report a failure inside an otherwise successful response.
+    rejected = str(payload.get("status_code")) in {"401", "403"}
+    if rejected or payload.get("status") == "fail":
+        raise RuntimeError(f"FCC API error for {path}: {payload}")
+    return payload
+
+
+def parse_date(value: Any) -> date | None:
+    """Coerce *value* to a ``date``.
+
+    None and "" yield None. A ``datetime`` is reduced to its date part, a
+    ``date`` is returned unchanged, and anything else is converted with
+    ``str()`` and its first ten characters parsed as YYYY-MM-DD (ValueError
+    if they do not match).
+    """
+    if value in (None, ""):
+        return None
+    # datetime subclasses date, so it must be handled first; otherwise a
+    # datetime would be returned as-is and break the date-only contract.
+    if isinstance(value, datetime):
+        return value.date()
+    if isinstance(value, date):
+        return value
+    return datetime.strptime(str(value)[:10], "%Y-%m-%d").date()
+
+
+def to_int(value: Any) -> int | None:
+    """Parse *value* as an integer, ignoring thousands separators.
+
+    Returns None for None, "", and anything whose string form (with commas
+    removed) is not an integer literal.
+    """
+    parsed = None
+    if value not in (None, ""):
+        try:
+            parsed = int(str(value).replace(",", ""))
+        except (TypeError, ValueError):
+            parsed = None
+    return parsed
+
+
+def create_tables(cur) -> None:
+    """Create the PostGIS extension plus the three tables and their indexes.
+
+    Every statement uses ``if not exists``, so running this repeatedly does
+    not fail on objects that are already present. Nothing is committed here;
+    the caller owns the transaction.
+    """
+    cur.execute("create extension if not exists postgis")
+
+    # One row per (data_type, as_of_date) with the raw API row kept as JSON.
+    cur.execute(
+        f"""
+        create table if not exists {AS_OF_TABLE} (
+            data_type text not null,
+            as_of_date date not null,
+            raw jsonb not null,
+            fetched_at timestamptz not null default now(),
+            primary key (data_type, as_of_date)
+        )
+        """
+    )
+
+    # Download-file catalog: one row per (as_of_date, file_id).
+    cur.execute(
+        f"""
+        create table if not exists {FILES_TABLE} (
+            as_of_date date not null,
+            file_id bigint not null,
+            category text,
+            subcategory text,
+            technology_type text,
+            technology_code text,
+            technology_code_desc text,
+            speed_tier text,
+            state_fips text,
+            state_name text,
+            provider_id bigint,
+            provider_name text,
+            file_type text,
+            file_name text,
+            record_count bigint,
+            raw jsonb not null,
+            fetched_at timestamptz not null default now(),
+            primary key (as_of_date, file_id)
+        )
+        """
+    )
+    cur.execute(
+        f"create index if not exists fcc_bdc_availability_files_category_idx "
+        f"on {FILES_TABLE} (category, subcategory)"
+    )
+    cur.execute(
+        f"create index if not exists fcc_bdc_availability_files_state_idx "
+        f"on {FILES_TABLE} (state_fips)"
+    )
+    cur.execute(
+        f"create index if not exists fcc_bdc_availability_files_provider_idx "
+        f"on {FILES_TABLE} (provider_id)"
+    )
+
+    # Per-data-center table keyed by master_id; rows are removed when the
+    # referenced master row is deleted (on delete cascade).
+    cur.execute(
+        f"""
+        create table if not exists {CONNECTION_TABLE} (
+            master_id text primary key references public.master_data_centers(master_id) on delete cascade,
+            source text,
+            name text,
+            operator text,
+            city text,
+            state text,
+            country text,
+            longitude double precision,
+            latitude double precision,
+            geom geometry(Point, 4326),
+
+            census_tract_geoid text,
+            census_broadband_subscription_pct numeric,
+
+            fcc_bdc_status text not null,
+            fcc_bdc_as_of_date date,
+            fcc_bdc_geography_type text,
+            fcc_bdc_geoid text,
+
+            fcc_provider_count integer,
+            fcc_fiber_provider_count integer,
+            fcc_cable_provider_count integer,
+            fcc_fixed_wireless_provider_count integer,
+            fcc_max_advertised_download_mbps numeric,
+            fcc_max_advertised_upload_mbps numeric,
+            fcc_100_20_provider_count integer,
+            fcc_summary_json jsonb,
+
+            fetched_at timestamptz not null default now(),
+            updated_at timestamptz not null default now()
+        )
+        """
+    )
+    # Spatial (GiST) index on the point geometry, then plain indexes on the
+    # tract GEOID and status columns.
+    cur.execute(
+        f"create index if not exists data_center_broadband_connection_geom_gix "
+        f"on {CONNECTION_TABLE} using gist (geom)"
+    )
+    cur.execute(
+        f"create index if not exists data_center_broadband_connection_tract_idx "
+        f"on {CONNECTION_TABLE} (census_tract_geoid)"
+    )
+    cur.execute(
+        f"create index if not exists data_center_broadband_connection_status_idx "
+        f"on {CONNECTION_TABLE} (fcc_bdc_status)"
+    )
+
+
+def rebuild_connection_base(cur, status: str) -> int:
+    """Empty the connection table and refill it from the master table.
+
+    Each master row is left-joined to its census tract on GEOID and stored
+    with *status* as its ``fcc_bdc_status``. Returns the number of rows in
+    the connection table afterwards.
+    """
+    insert_sql = f"""
+        insert into {CONNECTION_TABLE} (
+            master_id, source, name, operator, city, state, country,
+            longitude, latitude, geom,
+            census_tract_geoid, census_broadband_subscription_pct,
+            fcc_bdc_status
+        )
+        select
+            dc.master_id, dc.source, dc.name, dc.operator, dc.city, dc.state, dc.country,
+            dc.longitude, dc.latitude, dc.geom,
+            dc.geoid as census_tract_geoid,
+            tr.broadband_subscription_pct as census_broadband_subscription_pct,
+            %s as fcc_bdc_status
+        from {MASTER_TABLE} dc
+        left join {TRACT_TABLE} tr on tr.geoid::text = dc.geoid::text
+        """
+
+    cur.execute(f"truncate {CONNECTION_TABLE}")
+    cur.execute(insert_sql, (status,))
+    cur.execute(f"select count(*) from {CONNECTION_TABLE}")
+    count_row = cur.fetchone()
+    return count_row[0]
+
+
+def latest_availability_date(rows: list[dict[str, Any]]) -> date | None:
+    """Return the newest ``as_of_date`` among availability rows, or None.
+
+    A row counts as an availability row when its ``data_type``, lower-cased,
+    is "availability" or "availability data".
+    """
+    accepted = {"availability", "availability data"}
+    latest = None
+    for row in rows:
+        if str(row.get("data_type", "")).lower() not in accepted:
+            continue
+        parsed = parse_date(row.get("as_of_date"))
+        if parsed is None:
+            continue
+        if latest is None or parsed > latest:
+            latest = parsed
+    return latest
+
+
+def load_as_of_dates(cur) -> date:
+    """Fetch the FCC as-of dates, upsert them, and return the latest availability date.
+
+    Rows without a parseable ``as_of_date`` are not stored. Raises
+    RuntimeError when the response contains no availability date.
+    """
+    rows = fcc_get("/map/listAsOfDates").get("data") or []
+
+    dated_rows = ((row, parse_date(row.get("as_of_date"))) for row in rows)
+    values = [
+        (row.get("data_type"), as_of_date, Json(row))
+        for row, as_of_date in dated_rows
+        if as_of_date
+    ]
+
+    if values:
+        upsert_sql = f"""
+            insert into {AS_OF_TABLE} (data_type, as_of_date, raw)
+            values %s
+            on conflict (data_type, as_of_date) do update set
+                raw = excluded.raw,
+                fetched_at = now()
+            """
+        execute_values(cur, upsert_sql, values, page_size=1000)
+
+    latest = latest_availability_date(rows)
+    if latest is not None:
+        return latest
+    raise RuntimeError(f"Could not find an availability as_of_date in FCC response: {rows}")
+
+
+def load_availability_file_catalog(cur, as_of_date: date) -> int:
+    """Replace the stored fixed-broadband file catalog for *as_of_date*.
+
+    Rows without an integer ``file_id`` are skipped. Existing rows for the
+    date are deleted only when the API returned at least one usable row.
+    Returns the number of rows inserted.
+    """
+    payload = fcc_get(
+        f"/map/downloads/listAvailabilityData/{as_of_date:%Y-%m-%d}",
+        params={"technology_type": "Fixed Broadband"},
+    )
+
+    # Plain text fields copied straight from the API row, in column order.
+    leading_fields = (
+        "category",
+        "subcategory",
+        "technology_type",
+        "technology_code",
+        "technology_code_desc",
+        "speed_tier",
+        "state_fips",
+        "state_name",
+    )
+    trailing_fields = ("file_type", "file_name")
+
+    values = []
+    for row in payload.get("data") or []:
+        file_id = to_int(row.get("file_id"))
+        if file_id is None:
+            continue
+        record = (
+            as_of_date,
+            file_id,
+            *(row.get(field) for field in leading_fields),
+            to_int(row.get("provider_id")),
+            row.get("provider_name"),
+            *(row.get(field) for field in trailing_fields),
+            to_int(row.get("record_count")),
+            Json(row),
+        )
+        values.append(record)
+
+    if not values:
+        return len(values)
+
+    insert_sql = f"""
+            insert into {FILES_TABLE} (
+                as_of_date, file_id, category, subcategory, technology_type,
+                technology_code, technology_code_desc, speed_tier, state_fips,
+                state_name, provider_id, provider_name, file_type, file_name,
+                record_count, raw
+            )
+            values %s
+            """
+    cur.execute(f"delete from {FILES_TABLE} where as_of_date = %s", (as_of_date,))
+    execute_values(cur, insert_sql, values, page_size=1000)
+    return len(values)
+
+
+def main() -> int:
+    """Rebuild the connection table and, unless skipped, load the FCC catalog.
+
+    Returns:
+        0 on success (including ``--skip-fcc``), 2 when FCC credentials are
+        missing. An unparseable ``--as-of-date`` exits through argparse
+        before any database work starts.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--skip-fcc", action="store_true", help="Only create/rebuild the base connection table.")
+    parser.add_argument("--as-of-date", help="FCC BDC availability as-of date, YYYY-MM-DD. Defaults to latest.")
+    args = parser.parse_args()
+
+    # Validate the date up front so a typo is a usage error, not a traceback
+    # raised in the middle of the database transaction.
+    requested_date = None
+    if args.as_of_date:
+        try:
+            requested_date = parse_date(args.as_of_date)
+        except ValueError:
+            parser.error(f"--as-of-date must be YYYY-MM-DD, got {args.as_of_date!r}")
+
+    load_zsh_secrets()
+    require_env(["PGWEB_HOST", "PGWEB_PORT", "PGWEB_USER", "PGWEB_PASSWORD"])
+
+    username, hash_value = fcc_credentials()
+    status = "pending_fcc_username" if hash_value and not username else "pending_fcc_catalog"
+    if args.skip_fcc:
+        status = "fcc_skipped"
+
+    conn = get_conn()
+    try:
+        # `with conn` commits or rolls back the transaction but does not
+        # close the connection, hence the surrounding try/finally.
+        with conn:
+            with conn.cursor() as cur:
+                create_tables(cur)
+                n_connection = rebuild_connection_base(cur, status)
+                print(f"{CONNECTION_TABLE}: {n_connection:,} base rows")
+
+                if args.skip_fcc:
+                    conn.commit()
+                    return 0
+
+                if not username or not hash_value:
+                    print(
+                        "FCC catalog not loaded: set FCC_USERNAME or FCC_BDC_USERNAME "
+                        "alongside FCC_API_KEY/FCC_HASH_VALUE in ~/.zsh_secrets.",
+                        file=sys.stderr,
+                    )
+                    conn.commit()
+                    return 2
+
+                as_of_date = requested_date or load_as_of_dates(cur)
+                n_files = load_availability_file_catalog(cur, as_of_date)
+
+                cur.execute(
+                    f"""
+                    update {CONNECTION_TABLE}
+                    set fcc_bdc_status = 'fcc_catalog_loaded',
+                        fcc_bdc_as_of_date = %s,
+                        updated_at = now()
+                    """,
+                    (as_of_date,),
+                )
+                conn.commit()
+    finally:
+        conn.close()
+
+    print(f"{AS_OF_TABLE}: loaded latest availability date {as_of_date}")
+    print(f"{FILES_TABLE}: {n_files:,} fixed-broadband file catalog rows")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/build_fcc_bdc_location_provider_aggregates.py
+++ b/scripts/build_fcc_bdc_location_provider_aggregates.py
@@ -0,0 +1,806 @@
+#!/usr/bin/env python3
+"""Build FCC BDC provider aggregates for data-center counties and tracts.
+
+This script uses FCC BDC State / Location Coverage files. Those files are
+provider/location-level and include block GEOIDs, so they can be aggregated to
+county and tract provider counts for only the geographies that contain data
+centers.
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import tempfile
+import zipfile
+from collections.abc import Iterable
+from datetime import date
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+import requests
+from psycopg2.extras import execute_values
+
+from build_fcc_bdc_broadband_connection_table import (
+    CONNECTION_TABLE,
+    FCC_BASE_URL,
+    FILES_TABLE,
+    USER_AGENT,
+    fcc_credentials,
+    get_conn,
+    load_zsh_secrets,
+    parse_date,
+    require_env,
+)
+
+
+# Tables owned by this script.
+# Per-file detail: one row per (as_of_date, file_id, geography_type, geoid, provider_id).
+DETAIL_TABLE = "public.fcc_bdc_location_provider_geography_provider"
+# Rollup rebuilt from DETAIL_TABLE: one row per (as_of_date, geography_type, geoid).
+AGG_TABLE = "public.fcc_bdc_location_provider_aggregate"
+# One row per fully processed FCC file; consulted to skip files when resuming.
+PROGRESS_TABLE = "public.fcc_bdc_location_provider_file_progress"
+# Maps source GEOIDs to the GEOIDs that appear in FCC BDC files.
+CROSSWALK_TABLE = "public.fcc_bdc_geoid_crosswalk"
+
+# Default value of --technology-codes. summarize_matches flags 10 as copper,
+# 40 as cable, 50 as fiber, and the FIXED_WIRELESS_CODES as fixed wireless.
+TERRESTRIAL_TECHNOLOGY_CODES = ("10", "40", "50", "70", "71", "72")
+FIXED_WIRELESS_CODES = {"70", "71", "72"}
+
+# The only columns read from each location-coverage CSV.
+CSV_USECOLS = [
+    "provider_id",
+    "block_geoid",
+    "technology",
+    "max_advertised_download_speed",
+    "max_advertised_upload_speed",
+    "business_residential_code",
+]
+
+# Connecticut tract GEOIDs: planning-region county equivalent (source side)
+# -> legacy county code with the same tract code (FCC BDC side). Seeded into
+# CROSSWALK_TABLE by seed_geoid_crosswalk.
+CT_PLANNING_TO_LEGACY_TRACT_GEOIDS = {
+    "09110520302": "09003520302",
+    "09120090500": "09001090500",
+    "09170175800": "09009175800",
+    "09190020101": "09001020101",
+    "09190020900": "09001020900",
+    "09190044300": "09001044300",
+}
+
+
+def fcc_download_headers() -> dict[str, str]:
+    """Return the HTTP headers sent with FCC BDC file-download requests.
+
+    Raises:
+        RuntimeError: if fcc_credentials() yields an empty username or hash value.
+    """
+    user, api_hash = fcc_credentials()
+    if user and api_hash:
+        return {
+            "username": user,
+            "hash_value": api_hash,
+            "user-agent": USER_AGENT,
+            "accept": "application/zip,*/*",
+        }
+    raise RuntimeError(
+        "FCC BDC API requires FCC_USERNAME or FCC_BDC_USERNAME plus "
+        "FCC_API_KEY or FCC_HASH_VALUE."
+    )
+
+
+def normalize_codes(values: Iterable[str]) -> tuple[str, ...]:
+    """Stringify and strip each value, dropping those that end up empty."""
+    stripped = (str(value).strip() for value in values)
+    return tuple(code for code in stripped if code)
+
+
+def create_tables(cur) -> None:
+    """Create this script's tables if missing and add its columns to CONNECTION_TABLE.
+
+    Every statement uses ``if not exists``, so the function can be called on
+    every run. Nothing is committed here; the caller commits.
+
+    Args:
+        cur: open database cursor used to execute the DDL.
+    """
+    # Per-file detail rows: one per provider within a county or tract.
+    cur.execute(
+        f"""
+        create table if not exists {DETAIL_TABLE} (
+            as_of_date date not null,
+            file_id bigint not null,
+            geography_type text not null check (geography_type in ('County', 'Tract')),
+            geoid text not null,
+            provider_id bigint not null,
+            has_fiber boolean not null default false,
+            has_cable boolean not null default false,
+            has_fixed_wireless boolean not null default false,
+            has_copper boolean not null default false,
+            has_100_20 boolean not null default false,
+            has_business boolean not null default false,
+            has_business_fiber boolean not null default false,
+            has_business_100_20 boolean not null default false,
+            max_advertised_download_mbps numeric,
+            max_advertised_upload_mbps numeric,
+            matched_location_rows bigint not null default 0,
+            updated_at timestamptz not null default now(),
+            primary key (as_of_date, file_id, geography_type, geoid, provider_id)
+        )
+        """
+    )
+    # Index without file_id, matching the grouping used by rebuild_aggregate.
+    cur.execute(
+        f"create index if not exists fcc_bdc_location_provider_geo_idx "
+        f"on {DETAIL_TABLE} (as_of_date, geography_type, geoid)"
+    )
+
+    # Rollup of the detail rows to one row per geography.
+    cur.execute(
+        f"""
+        create table if not exists {AGG_TABLE} (
+            as_of_date date not null,
+            geography_type text not null check (geography_type in ('County', 'Tract')),
+            geoid text not null,
+            provider_count integer not null,
+            fiber_provider_count integer not null,
+            cable_provider_count integer not null,
+            fixed_wireless_provider_count integer not null,
+            copper_provider_count integer not null,
+            provider_100_20_count integer not null,
+            business_provider_count integer not null,
+            business_fiber_provider_count integer not null,
+            business_100_20_provider_count integer not null,
+            max_advertised_download_mbps numeric,
+            max_advertised_upload_mbps numeric,
+            matched_location_rows bigint not null,
+            provider_file_rows bigint not null,
+            updated_at timestamptz not null default now(),
+            primary key (as_of_date, geography_type, geoid)
+        )
+        """
+    )
+
+    # Completion marker per FCC file, written by process_file after the last chunk.
+    cur.execute(
+        f"""
+        create table if not exists {PROGRESS_TABLE} (
+            as_of_date date not null,
+            file_id bigint not null,
+            state_fips text not null,
+            technology_code text,
+            technology_code_desc text,
+            record_count bigint,
+            matched_location_rows bigint not null,
+            provider_geo_rows bigint not null,
+            processed_at timestamptz not null default now(),
+            primary key (as_of_date, file_id)
+        )
+        """
+    )
+
+    # Source-GEOID to FCC-GEOID mapping; populated by seed_geoid_crosswalk.
+    cur.execute(
+        f"""
+        create table if not exists {CROSSWALK_TABLE} (
+            source_geography_type text not null,
+            source_geoid text not null,
+            fcc_geography_type text not null,
+            fcc_geoid text not null,
+            method text not null,
+            notes text,
+            updated_at timestamptz not null default now(),
+            primary key (source_geography_type, source_geoid, fcc_geography_type)
+        )
+        """
+    )
+
+    # Columns that update_connection_table fills in on the connection table.
+    add_columns = [
+        "fcc_provider_geography_type text",
+        "fcc_provider_geoid text",
+        "fcc_county_provider_count integer",
+        "fcc_county_fiber_provider_count integer",
+        "fcc_county_cable_provider_count integer",
+        "fcc_county_fixed_wireless_provider_count integer",
+        "fcc_county_100_20_provider_count integer",
+        "fcc_county_business_provider_count integer",
+        "fcc_county_business_fiber_provider_count integer",
+        "fcc_county_business_100_20_provider_count integer",
+        "fcc_county_max_advertised_download_mbps numeric",
+        "fcc_county_max_advertised_upload_mbps numeric",
+        "fcc_tract_provider_count integer",
+        "fcc_tract_fiber_provider_count integer",
+        "fcc_tract_cable_provider_count integer",
+        "fcc_tract_fixed_wireless_provider_count integer",
+        "fcc_tract_100_20_provider_count integer",
+        "fcc_tract_business_provider_count integer",
+        "fcc_tract_business_fiber_provider_count integer",
+        "fcc_tract_business_100_20_provider_count integer",
+        "fcc_tract_max_advertised_download_mbps numeric",
+        "fcc_tract_max_advertised_upload_mbps numeric",
+    ]
+    for definition in add_columns:
+        cur.execute(f"alter table {CONNECTION_TABLE} add column if not exists {definition}")
+
+
+def seed_geoid_crosswalk(cur) -> None:
+    """Upsert the Connecticut tract mappings into CROSSWALK_TABLE.
+
+    Existing rows with the same (source type, source GEOID, FCC type) key are
+    overwritten and their ``updated_at`` refreshed.
+    """
+    method = "ct_planning_region_to_legacy_county_same_tractce"
+    notes = "Connecticut 2024 tract GEOIDs use planning-region county equivalents; FCC BDC block GEOIDs use legacy county codes."
+    crosswalk_rows = []
+    for planning_geoid, legacy_geoid in CT_PLANNING_TO_LEGACY_TRACT_GEOIDS.items():
+        crosswalk_rows.append(("Tract", planning_geoid, "Tract", legacy_geoid, method, notes))
+    execute_values(
+        cur,
+        f"""
+        insert into {CROSSWALK_TABLE} (
+            source_geography_type, source_geoid, fcc_geography_type,
+            fcc_geoid, method, notes
+        )
+        values %s
+        on conflict (source_geography_type, source_geoid, fcc_geography_type)
+        do update set
+            fcc_geoid = excluded.fcc_geoid,
+            method = excluded.method,
+            notes = excluded.notes,
+            updated_at = now()
+        """,
+        crosswalk_rows,
+    )
+
+
+def latest_catalog_date(cur) -> date:
+    """Return the most recent ``as_of_date`` present in FILES_TABLE.
+
+    Raises:
+        RuntimeError: if the catalog table has no rows.
+    """
+    cur.execute(f"select max(as_of_date) from {FILES_TABLE}")
+    row = cur.fetchone()
+    newest = row[0]
+    if newest is not None:
+        return newest
+    raise RuntimeError(f"No FCC catalog rows found in {FILES_TABLE}. Run the FCC catalog load first.")
+
+
+def target_geographies(cur, states: tuple[str, ...] | None = None) -> tuple[set[str], set[str], set[str]]:
+    """Collect the state, county and tract GEOIDs present in CONNECTION_TABLE.
+
+    GEOIDs are the 2-, 5- and 11-character prefixes of ``census_tract_geoid``.
+    Tracts that have a crosswalk entry also contribute their FCC tract GEOID
+    and its 5-character county prefix.
+
+    Args:
+        cur: open database cursor.
+        states: optional state FIPS codes; when given, only rows in those states are read.
+
+    Returns:
+        ``(state_fips, county_geoids, tract_geoids)`` as sets of strings.
+    """
+    if states:
+        state_filter = "where left(census_tract_geoid, 2) = any(%s)"
+        params: list[Any] = [list(states)]
+    else:
+        state_filter = ""
+        params = []
+    cur.execute(
+        f"""
+        select distinct
+            left(census_tract_geoid, 2) as state_fips,
+            left(census_tract_geoid, 5) as county_geoid,
+            left(census_tract_geoid, 11) as tract_geoid
+        from {CONNECTION_TABLE}
+        {state_filter}
+        """,
+        params,
+    )
+
+    states_found: set[str] = set()
+    counties: set[str] = set()
+    tracts: set[str] = set()
+    for record in cur.fetchall():
+        # Empty or NULL prefixes are ignored.
+        if record[0]:
+            states_found.add(record[0])
+        if record[1]:
+            counties.add(record[1])
+        if record[2]:
+            tracts.add(record[2])
+
+    if tracts:
+        cur.execute(
+            f"""
+            select fcc_geoid
+            from {CROSSWALK_TABLE}
+            where source_geography_type = 'Tract'
+              and fcc_geography_type = 'Tract'
+              and source_geoid = any(%s)
+            """,
+            (list(tracts),),
+        )
+        mapped_tracts = {record[0] for record in cur.fetchall() if record[0]}
+        tracts |= mapped_tracts
+        counties |= {fcc_tract[:5] for fcc_tract in mapped_tracts}
+
+    return states_found, counties, tracts
+
+
+def catalog_files(
+    cur,
+    as_of_date: date,
+    states: set[str],
+    technology_codes: tuple[str, ...],
+    limit: int | None,
+) -> list[dict[str, Any]]:
+    """List State / Location Coverage catalog files for the given date, states and technologies.
+
+    Rows are ordered by state, technology code and file id. When ``limit`` is
+    not None the result is sliced with ``[:limit]``.
+
+    Returns:
+        One dict per file with keys ``file_id``, ``state_fips``,
+        ``technology_code``, ``technology_code_desc``, ``file_name`` and
+        ``record_count``.
+    """
+    cur.execute(
+        f"""
+        select file_id, state_fips, technology_code, technology_code_desc, file_name, record_count
+        from {FILES_TABLE}
+        where as_of_date = %s
+          and category = 'State'
+          and subcategory = 'Location Coverage'
+          and state_fips = any(%s)
+          and technology_code = any(%s)
+        order by state_fips, technology_code, file_id
+        """,
+        (as_of_date, list(states), list(technology_codes)),
+    )
+    selected: list[dict[str, Any]] = []
+    for record in cur.fetchall():
+        file_id, state_fips, technology_code, technology_code_desc, file_name, record_count = record
+        selected.append(
+            {
+                "file_id": int(file_id),
+                "state_fips": state_fips,
+                "technology_code": str(technology_code),
+                "technology_code_desc": technology_code_desc,
+                "file_name": file_name,
+                "record_count": record_count,
+            }
+        )
+    if limit is None:
+        return selected
+    return selected[:limit]
+
+
+def progress_done(cur, as_of_date: date, file_id: int) -> bool:
+    """Return True when PROGRESS_TABLE has a row for this as-of date and file."""
+    lookup_sql = f"select 1 from {PROGRESS_TABLE} where as_of_date = %s and file_id = %s"
+    cur.execute(lookup_sql, (as_of_date, file_id))
+    found = cur.fetchone()
+    return found is not None
+
+
+def download_file(file_id: int, dest_dir: Path) -> Path:
+    """Stream one FCC availability file into ``dest_dir`` and return its path.
+
+    The response is written in 1 MiB chunks. A non-success HTTP status raises
+    via ``raise_for_status`` before anything is written.
+    """
+    target = dest_dir / f"fcc_bdc_availability_{file_id}.zip"
+    endpoint = f"{FCC_BASE_URL}/map/downloads/downloadFile/availability/{file_id}"
+    request_headers = fcc_download_headers()
+    with requests.get(endpoint, headers=request_headers, stream=True, timeout=(15, 300)) as response:
+        response.raise_for_status()
+        with target.open("wb") as sink:
+            for block in response.iter_content(chunk_size=1024 * 1024):
+                if not block:
+                    continue
+                sink.write(block)
+    return target
+
+
+def normalize_block_geoid(series: pd.Series) -> pd.Series:
+    """Return block GEOIDs as strings with a trailing ``.0`` removed, left-padded with zeros to 15 characters."""
+    as_text = series.astype("string")
+    without_float_suffix = as_text.str.replace(r"\.0$", "", regex=True)
+    return without_float_suffix.str.zfill(15)
+
+
+def summarize_matches(
+    chunk: pd.DataFrame,
+    geography_type: str,
+    target_geoids: set[str],
+) -> tuple[pd.DataFrame, int]:
+    """Aggregate one CSV chunk to a row per (geoid, provider) for the target geographies.
+
+    Args:
+        chunk: CSV chunk that already carries a ``block_geoid_norm`` column.
+        geography_type: ``"County"`` uses the first 5 characters of the block
+            GEOID; any other value uses the first 11.
+        target_geoids: GEOID prefixes to keep.
+
+    Returns:
+        ``(grouped, matched_count)``. ``grouped`` has one row per
+        (``geoid``, ``provider_id_num``) with OR-ed capability flags, maximum
+        speeds and a row count. ``matched_count`` is the number of chunk rows
+        that matched and had a numeric provider id. When nothing qualifies an
+        empty, column-less DataFrame and 0 are returned.
+    """
+    geoid_len = 5 if geography_type == "County" else 11
+    geoid = chunk["block_geoid_norm"].str[:geoid_len]
+    matched = chunk[geoid.isin(target_geoids)].copy()
+    if matched.empty:
+        return pd.DataFrame(), 0
+
+    # Index-aligned lookup: pick the prefix for exactly the rows that matched.
+    matched["geoid"] = geoid[matched.index]
+    # Rows whose provider_id does not parse as a number are dropped.
+    matched["provider_id_num"] = pd.to_numeric(matched["provider_id"], errors="coerce")
+    matched = matched[matched["provider_id_num"].notna()].copy()
+    if matched.empty:
+        return pd.DataFrame(), 0
+
+    # These Series share matched's index, so the assignments below align row by row.
+    tech = matched["technology"].astype("string").str.replace(r"\.0$", "", regex=True)
+    down = pd.to_numeric(matched["max_advertised_download_speed"], errors="coerce")
+    upload = pd.to_numeric(matched["max_advertised_upload_speed"], errors="coerce")
+    business_code = matched["business_residential_code"].astype("string").str.upper().fillna("")
+    # Codes "B" and "X" are the ones treated as business service here.
+    business = business_code.isin(["B", "X"])
+
+    matched["provider_id_num"] = matched["provider_id_num"].astype("int64")
+    matched["has_fiber"] = tech.eq("50")
+    matched["has_cable"] = tech.eq("40")
+    matched["has_fixed_wireless"] = tech.isin(FIXED_WIRELESS_CODES)
+    matched["has_copper"] = tech.eq("10")
+    # Unparseable speeds are NaN and compare False, so they never set this flag.
+    matched["has_100_20"] = down.ge(100) & upload.ge(20)
+    matched["has_business"] = business
+    matched["has_business_fiber"] = business & matched["has_fiber"]
+    matched["has_business_100_20"] = business & matched["has_100_20"]
+    matched["max_down"] = down
+    matched["max_up"] = upload
+    matched["matched_location_rows"] = 1
+
+    # "max" over booleans acts as a logical OR within each (geoid, provider) group.
+    grouped = (
+        matched.groupby(["geoid", "provider_id_num"], as_index=False)
+        .agg(
+            has_fiber=("has_fiber", "max"),
+            has_cable=("has_cable", "max"),
+            has_fixed_wireless=("has_fixed_wireless", "max"),
+            has_copper=("has_copper", "max"),
+            has_100_20=("has_100_20", "max"),
+            has_business=("has_business", "max"),
+            has_business_fiber=("has_business_fiber", "max"),
+            has_business_100_20=("has_business_100_20", "max"),
+            max_down=("max_down", "max"),
+            max_up=("max_up", "max"),
+            matched_location_rows=("matched_location_rows", "sum"),
+        )
+    )
+    return grouped, len(matched)
+
+
+def upsert_detail(
+    cur,
+    as_of_date: date,
+    file_id: int,
+    geography_type: str,
+    grouped: pd.DataFrame,
+) -> int:
+    """Upsert one chunk's grouped provider rows into DETAIL_TABLE.
+
+    On a key conflict the flags are OR-ed, the speeds take the greater value
+    and ``matched_location_rows`` is added to the stored count, so rows for the
+    same provider/geography arriving in later chunks accumulate.
+
+    Returns:
+        The number of rows sent to the database (0 when ``grouped`` is empty).
+    """
+    if grouped.empty:
+        return 0
+
+    def speed_or_null(value):
+        # NaN speeds are stored as SQL NULL.
+        return None if pd.isna(value) else float(value)
+
+    flag_columns = (
+        "has_fiber",
+        "has_cable",
+        "has_fixed_wireless",
+        "has_copper",
+        "has_100_20",
+        "has_business",
+        "has_business_fiber",
+        "has_business_100_20",
+    )
+    records = []
+    for item in grouped.itertuples(index=False):
+        key_part = (as_of_date, file_id, geography_type, item.geoid, int(item.provider_id_num))
+        flag_part = tuple(bool(getattr(item, column)) for column in flag_columns)
+        measure_part = (
+            speed_or_null(item.max_down),
+            speed_or_null(item.max_up),
+            int(item.matched_location_rows),
+        )
+        records.append(key_part + flag_part + measure_part)
+
+    execute_values(
+        cur,
+        f"""
+        insert into {DETAIL_TABLE} (
+            as_of_date, file_id, geography_type, geoid, provider_id,
+            has_fiber, has_cable, has_fixed_wireless, has_copper,
+            has_100_20, has_business, has_business_fiber, has_business_100_20,
+            max_advertised_download_mbps, max_advertised_upload_mbps,
+            matched_location_rows
+        )
+        values %s
+        on conflict (as_of_date, file_id, geography_type, geoid, provider_id)
+        do update set
+            has_fiber = {DETAIL_TABLE}.has_fiber or excluded.has_fiber,
+            has_cable = {DETAIL_TABLE}.has_cable or excluded.has_cable,
+            has_fixed_wireless = {DETAIL_TABLE}.has_fixed_wireless or excluded.has_fixed_wireless,
+            has_copper = {DETAIL_TABLE}.has_copper or excluded.has_copper,
+            has_100_20 = {DETAIL_TABLE}.has_100_20 or excluded.has_100_20,
+            has_business = {DETAIL_TABLE}.has_business or excluded.has_business,
+            has_business_fiber = {DETAIL_TABLE}.has_business_fiber or excluded.has_business_fiber,
+            has_business_100_20 = {DETAIL_TABLE}.has_business_100_20 or excluded.has_business_100_20,
+            max_advertised_download_mbps = greatest(
+                {DETAIL_TABLE}.max_advertised_download_mbps,
+                excluded.max_advertised_download_mbps
+            ),
+            max_advertised_upload_mbps = greatest(
+                {DETAIL_TABLE}.max_advertised_upload_mbps,
+                excluded.max_advertised_upload_mbps
+            ),
+            matched_location_rows = {DETAIL_TABLE}.matched_location_rows + excluded.matched_location_rows,
+            updated_at = now()
+        """,
+        records,
+        page_size=1000,
+    )
+    return len(records)
+
+
+def process_file(
+    conn,
+    file_row: dict[str, Any],
+    as_of_date: date,
+    county_geoids: set[str],
+    tract_geoids: set[str],
+    chunksize: int,
+    temp_dir: Path,
+) -> tuple[int, int]:
+    """Download one FCC location-coverage zip and load its matching rows into DETAIL_TABLE.
+
+    Existing detail and progress rows for the file are deleted first, each CSV
+    chunk is committed on its own, and the progress row is written only after
+    the last chunk. A run interrupted mid-file therefore leaves no progress
+    row, and the file is reprocessed from scratch next time.
+
+    Args:
+        conn: open database connection; this function commits on it.
+        file_row: catalog entry as produced by catalog_files.
+        as_of_date: FCC availability date the file belongs to.
+        county_geoids: 5-character county GEOIDs to keep.
+        tract_geoids: 11-character tract GEOIDs to keep.
+        chunksize: number of CSV rows read per chunk.
+        temp_dir: directory the zip is downloaded into.
+
+    Returns:
+        ``(matched_rows, provider_geo_rows)``. ``matched_rows`` sums county and
+        tract matches, so a CSV row matching both is counted twice.
+        ``provider_geo_rows`` sums the rows sent by upsert_detail per chunk, so
+        a provider/geography pair spanning several chunks is counted once per
+        chunk.
+
+    Raises:
+        RuntimeError: if the archive contains no ``.csv`` member.
+    """
+    file_id = file_row["file_id"]
+    zip_path = download_file(file_id, temp_dir)
+    matched_rows = 0
+    provider_geo_rows = 0
+
+    try:
+        with zipfile.ZipFile(zip_path) as archive:
+            csv_names = [name for name in archive.namelist() if name.lower().endswith(".csv")]
+            if not csv_names:
+                raise RuntimeError(f"FCC file_id={file_id} did not contain a CSV: {archive.namelist()}")
+            # Only the first CSV member of the archive is read.
+            with archive.open(csv_names[0]) as csv_file:
+                reader = pd.read_csv(
+                    csv_file,
+                    usecols=CSV_USECOLS,
+                    dtype="string",
+                    chunksize=chunksize,
+                    low_memory=False,
+                )
+                # Clear any earlier (possibly partial) load of this file, because the
+                # upsert below adds to matched_location_rows rather than replacing it.
+                with conn.cursor() as cur:
+                    cur.execute(
+                        f"delete from {DETAIL_TABLE} where as_of_date = %s and file_id = %s",
+                        (as_of_date, file_id),
+                    )
+                    cur.execute(
+                        f"delete from {PROGRESS_TABLE} where as_of_date = %s and file_id = %s",
+                        (as_of_date, file_id),
+                    )
+                    conn.commit()
+
+                for chunk_number, chunk in enumerate(reader, start=1):
+                    chunk["block_geoid_norm"] = normalize_block_geoid(chunk["block_geoid"])
+
+                    county_grouped, county_matches = summarize_matches(chunk, "County", county_geoids)
+                    tract_grouped, tract_matches = summarize_matches(chunk, "Tract", tract_geoids)
+
+                    with conn.cursor() as cur:
+                        provider_geo_rows += upsert_detail(cur, as_of_date, file_id, "County", county_grouped)
+                        provider_geo_rows += upsert_detail(cur, as_of_date, file_id, "Tract", tract_grouped)
+                    # One commit per chunk.
+                    conn.commit()
+
+                    matched_rows += county_matches + tract_matches
+                    # Progress line every 10th chunk, once anything has matched.
+                    if matched_rows and chunk_number % 10 == 0:
+                        print(f"  file_id={file_id}: chunk {chunk_number:,}, matched row-events={matched_rows:,}")
+
+        # Mark the file complete only after every chunk has been committed.
+        with conn.cursor() as cur:
+            cur.execute(
+                f"""
+                insert into {PROGRESS_TABLE} (
+                    as_of_date, file_id, state_fips, technology_code,
+                    technology_code_desc, record_count, matched_location_rows, provider_geo_rows
+                )
+                values (%s, %s, %s, %s, %s, %s, %s, %s)
+                on conflict (as_of_date, file_id) do update set
+                    state_fips = excluded.state_fips,
+                    technology_code = excluded.technology_code,
+                    technology_code_desc = excluded.technology_code_desc,
+                    record_count = excluded.record_count,
+                    matched_location_rows = excluded.matched_location_rows,
+                    provider_geo_rows = excluded.provider_geo_rows,
+                    processed_at = now()
+                """,
+                (
+                    as_of_date,
+                    file_id,
+                    file_row["state_fips"],
+                    file_row["technology_code"],
+                    file_row["technology_code_desc"],
+                    file_row["record_count"],
+                    matched_rows,
+                    provider_geo_rows,
+                ),
+            )
+        conn.commit()
+        return matched_rows, provider_geo_rows
+    finally:
+        # Remove the downloaded zip whether processing succeeded or failed.
+        zip_path.unlink(missing_ok=True)
+
+
+def rebuild_aggregate(cur, as_of_date: date) -> int:
+    """Replace the AGG_TABLE rows for ``as_of_date`` with a fresh rollup of DETAIL_TABLE.
+
+    The query first collapses detail rows across files to one row per
+    (geography, provider), then counts providers per geography, so a provider
+    appearing in several files is counted once.
+
+    Returns:
+        The cursor's ``rowcount`` after the insert.
+    """
+    cur.execute(f"delete from {AGG_TABLE} where as_of_date = %s", (as_of_date,))
+    cur.execute(
+        f"""
+        insert into {AGG_TABLE} (
+            as_of_date, geography_type, geoid,
+            provider_count, fiber_provider_count, cable_provider_count,
+            fixed_wireless_provider_count, copper_provider_count,
+            provider_100_20_count, business_provider_count,
+            business_fiber_provider_count, business_100_20_provider_count,
+            max_advertised_download_mbps, max_advertised_upload_mbps,
+            matched_location_rows, provider_file_rows
+        )
+        with per_provider as (
+            select
+                as_of_date,
+                geography_type,
+                geoid,
+                provider_id,
+                bool_or(has_fiber) as has_fiber,
+                bool_or(has_cable) as has_cable,
+                bool_or(has_fixed_wireless) as has_fixed_wireless,
+                bool_or(has_copper) as has_copper,
+                bool_or(has_100_20) as has_100_20,
+                bool_or(has_business) as has_business,
+                bool_or(has_business_fiber) as has_business_fiber,
+                bool_or(has_business_100_20) as has_business_100_20,
+                max(max_advertised_download_mbps) as max_advertised_download_mbps,
+                max(max_advertised_upload_mbps) as max_advertised_upload_mbps,
+                sum(matched_location_rows) as matched_location_rows,
+                count(*) as provider_file_rows
+            from {DETAIL_TABLE}
+            where as_of_date = %s
+            group by 1, 2, 3, 4
+        )
+        select
+            as_of_date,
+            geography_type,
+            geoid,
+            count(*)::integer as provider_count,
+            count(*) filter (where has_fiber)::integer as fiber_provider_count,
+            count(*) filter (where has_cable)::integer as cable_provider_count,
+            count(*) filter (where has_fixed_wireless)::integer as fixed_wireless_provider_count,
+            count(*) filter (where has_copper)::integer as copper_provider_count,
+            count(*) filter (where has_100_20)::integer as provider_100_20_count,
+            count(*) filter (where has_business)::integer as business_provider_count,
+            count(*) filter (where has_business_fiber)::integer as business_fiber_provider_count,
+            count(*) filter (where has_business_100_20)::integer as business_100_20_provider_count,
+            max(max_advertised_download_mbps) as max_advertised_download_mbps,
+            max(max_advertised_upload_mbps) as max_advertised_upload_mbps,
+            sum(matched_location_rows)::bigint as matched_location_rows,
+            sum(provider_file_rows)::bigint as provider_file_rows
+        from per_provider
+        group by 1, 2, 3
+        """,
+        (as_of_date,),
+    )
+    return cur.rowcount
+
+
+def update_connection_table(cur, as_of_date: date) -> int:
+    """Copy county and tract aggregates for ``as_of_date`` onto CONNECTION_TABLE rows.
+
+    Each connection row is joined to its tract and county aggregate, going
+    through CROSSWALK_TABLE when the tract has an FCC-side GEOID. The generic
+    ``fcc_*`` columns prefer the tract value and fall back to the county one;
+    the ``fcc_county_*`` and ``fcc_tract_*`` columns hold each level as-is.
+    Rows with neither a tract nor a county aggregate are left untouched.
+
+    The crosswalk join uses the 11-character tract prefix of
+    ``census_tract_geoid``, the same truncation applied everywhere else in
+    this query and in target_geographies.
+
+    Args:
+        cur: open database cursor.
+        as_of_date: aggregate date to join against.
+
+    Returns:
+        The cursor's ``rowcount`` after the update.
+    """
+    cur.execute(
+        f"""
+        with joined as (
+            select
+                c.master_id,
+                coalesce(x.fcc_geoid, left(c.census_tract_geoid, 11)) as provider_tract_geoid,
+                coalesce(left(x.fcc_geoid, 5), left(c.census_tract_geoid, 5)) as provider_county_geoid,
+                county.geoid as county_geoid,
+                tract.geoid as tract_geoid,
+                county.provider_count as county_provider_count,
+                county.fiber_provider_count as county_fiber_provider_count,
+                county.cable_provider_count as county_cable_provider_count,
+                county.fixed_wireless_provider_count as county_fixed_wireless_provider_count,
+                county.provider_100_20_count as county_100_20_provider_count,
+                county.business_provider_count as county_business_provider_count,
+                county.business_fiber_provider_count as county_business_fiber_provider_count,
+                county.business_100_20_provider_count as county_business_100_20_provider_count,
+                county.max_advertised_download_mbps as county_max_down,
+                county.max_advertised_upload_mbps as county_max_up,
+                tract.provider_count as tract_provider_count,
+                tract.fiber_provider_count as tract_fiber_provider_count,
+                tract.cable_provider_count as tract_cable_provider_count,
+                tract.fixed_wireless_provider_count as tract_fixed_wireless_provider_count,
+                tract.provider_100_20_count as tract_100_20_provider_count,
+                tract.business_provider_count as tract_business_provider_count,
+                tract.business_fiber_provider_count as tract_business_fiber_provider_count,
+                tract.business_100_20_provider_count as tract_business_100_20_provider_count,
+                tract.max_advertised_download_mbps as tract_max_down,
+                tract.max_advertised_upload_mbps as tract_max_up
+            from {CONNECTION_TABLE} c
+            left join {CROSSWALK_TABLE} x
+                on x.source_geography_type = 'Tract'
+               and x.fcc_geography_type = 'Tract'
+               and x.source_geoid = left(c.census_tract_geoid, 11)
+            left join {AGG_TABLE} county
+                on county.as_of_date = %s
+               and county.geography_type = 'County'
+               and county.geoid = coalesce(left(x.fcc_geoid, 5), left(c.census_tract_geoid, 5))
+            left join {AGG_TABLE} tract
+                on tract.as_of_date = %s
+               and tract.geography_type = 'Tract'
+               and tract.geoid = coalesce(x.fcc_geoid, left(c.census_tract_geoid, 11))
+        )
+        update {CONNECTION_TABLE} c
+        set
+            fcc_provider_geography_type = case
+                when j.tract_geoid is not null then 'Tract'
+                when j.county_geoid is not null then 'County'
+                else c.fcc_provider_geography_type
+            end,
+            fcc_provider_geoid = coalesce(j.tract_geoid, j.county_geoid, c.fcc_provider_geoid),
+            fcc_provider_count = coalesce(j.tract_provider_count, j.county_provider_count),
+            fcc_fiber_provider_count = coalesce(j.tract_fiber_provider_count, j.county_fiber_provider_count),
+            fcc_cable_provider_count = coalesce(j.tract_cable_provider_count, j.county_cable_provider_count),
+            fcc_fixed_wireless_provider_count = coalesce(j.tract_fixed_wireless_provider_count, j.county_fixed_wireless_provider_count),
+            fcc_100_20_provider_count = coalesce(j.tract_100_20_provider_count, j.county_100_20_provider_count),
+            fcc_max_advertised_download_mbps = coalesce(j.tract_max_down, j.county_max_down, c.fcc_max_advertised_download_mbps),
+            fcc_max_advertised_upload_mbps = coalesce(j.tract_max_up, j.county_max_up, c.fcc_max_advertised_upload_mbps),
+            fcc_county_provider_count = j.county_provider_count,
+            fcc_county_fiber_provider_count = j.county_fiber_provider_count,
+            fcc_county_cable_provider_count = j.county_cable_provider_count,
+            fcc_county_fixed_wireless_provider_count = j.county_fixed_wireless_provider_count,
+            fcc_county_100_20_provider_count = j.county_100_20_provider_count,
+            fcc_county_business_provider_count = j.county_business_provider_count,
+            fcc_county_business_fiber_provider_count = j.county_business_fiber_provider_count,
+            fcc_county_business_100_20_provider_count = j.county_business_100_20_provider_count,
+            fcc_county_max_advertised_download_mbps = j.county_max_down,
+            fcc_county_max_advertised_upload_mbps = j.county_max_up,
+            fcc_tract_provider_count = j.tract_provider_count,
+            fcc_tract_fiber_provider_count = j.tract_fiber_provider_count,
+            fcc_tract_cable_provider_count = j.tract_cable_provider_count,
+            fcc_tract_fixed_wireless_provider_count = j.tract_fixed_wireless_provider_count,
+            fcc_tract_100_20_provider_count = j.tract_100_20_provider_count,
+            fcc_tract_business_provider_count = j.tract_business_provider_count,
+            fcc_tract_business_fiber_provider_count = j.tract_business_fiber_provider_count,
+            fcc_tract_business_100_20_provider_count = j.tract_business_100_20_provider_count,
+            fcc_tract_max_advertised_download_mbps = j.tract_max_down,
+            fcc_tract_max_advertised_upload_mbps = j.tract_max_up,
+            fcc_summary_json = jsonb_set(
+                coalesce(c.fcc_summary_json, '{{}}'::jsonb),
+                '{{location_provider_aggregate}}',
+                jsonb_build_object(
+                    'source', 'fcc_state_location_coverage',
+                    'as_of_date', %s::text,
+                    'preferred_geography_type', case
+                        when j.tract_geoid is not null then 'Tract'
+                        when j.county_geoid is not null then 'County'
+                        else null
+                    end,
+                    'preferred_geoid', coalesce(j.tract_geoid, j.county_geoid),
+                    'county_geoid', j.county_geoid,
+                    'tract_geoid', j.tract_geoid
+                ),
+                true
+            ),
+            fcc_bdc_status = case
+                when coalesce(j.tract_geoid, j.county_geoid) is not null then 'fcc_location_provider_joined'
+                else c.fcc_bdc_status
+            end,
+            updated_at = now()
+        from joined j
+        where c.master_id = j.master_id
+          and coalesce(j.tract_geoid, j.county_geoid) is not null
+        """,
+        # One value per %s: county join, tract join, summary JSON.
+        (as_of_date, as_of_date, as_of_date),
+    )
+    return cur.rowcount
+
+
+def main() -> int:
+    """Run the location-provider aggregation end to end.
+
+    Steps: parse arguments, ensure tables and the crosswalk exist, resolve the
+    as-of date and target geographies, process each selected FCC file (skipping
+    files already recorded in PROGRESS_TABLE unless --no-resume is given),
+    rebuild the aggregate table, and, unless --no-update-connection is given,
+    update CONNECTION_TABLE.
+
+    Returns:
+        0 on success.
+
+    Raises:
+        RuntimeError: if --as-of-date does not parse, or no target states are found.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--as-of-date", help="FCC availability as-of date; defaults to latest catalog date.")
+    parser.add_argument("--states", nargs="*", help="Optional state FIPS list, e.g. 11 34 51.")
+    parser.add_argument("--technology-codes", nargs="*", default=list(TERRESTRIAL_TECHNOLOGY_CODES))
+    parser.add_argument("--limit-files", type=int, help="Process only the first N matching files.")
+    parser.add_argument("--chunksize", type=int, default=500_000)
+    parser.add_argument("--refresh", action="store_true", help="Delete existing location-provider rows for this as-of date first.")
+    parser.add_argument("--no-resume", action="store_true", help="Reprocess files even if marked complete.")
+    parser.add_argument("--no-update-connection", action="store_true", help="Build aggregate tables but do not update data_center_broadband_connection.")
+    args = parser.parse_args()
+
+    load_zsh_secrets()
+    require_env(["PGWEB_HOST", "PGWEB_PORT", "PGWEB_USER", "PGWEB_PASSWORD"])
+
+    as_of_date = parse_date(args.as_of_date) if args.as_of_date else None
+    if as_of_date is None and args.as_of_date:
+        raise RuntimeError(f"Invalid --as-of-date: {args.as_of_date}")
+
+    technology_codes = normalize_codes(args.technology_codes)
+    # Left-pad state FIPS codes so "6" and "06" are treated the same.
+    requested_states = tuple(s.zfill(2) for s in args.states) if args.states else None
+
+    with get_conn() as conn:
+        with conn.cursor() as cur:
+            create_tables(cur)
+            seed_geoid_crosswalk(cur)
+            as_of_date = as_of_date or latest_catalog_date(cur)
+            states, counties, tracts = target_geographies(cur, requested_states)
+            if not states:
+                raise RuntimeError("No target data-center states found.")
+            if args.refresh:
+                cur.execute(f"delete from {DETAIL_TABLE} where as_of_date = %s", (as_of_date,))
+                cur.execute(f"delete from {AGG_TABLE} where as_of_date = %s", (as_of_date,))
+                cur.execute(f"delete from {PROGRESS_TABLE} where as_of_date = %s", (as_of_date,))
+            files = catalog_files(cur, as_of_date, states, technology_codes, args.limit_files)
+        conn.commit()
+
+        print(f"FCC as_of_date: {as_of_date}")
+        print(f"Target states: {len(states):,} | counties: {len(counties):,} | tracts: {len(tracts):,}")
+        print(f"Location coverage files selected: {len(files):,}")
+
+        total_matched_rows = 0
+        total_provider_geo_rows = 0
+        with tempfile.TemporaryDirectory(prefix="fcc_bdc_location_") as temp:
+            temp_dir = Path(temp)
+            for idx, file_row in enumerate(files, start=1):
+                file_id = file_row["file_id"]
+                with conn.cursor() as cur:
+                    skip = (not args.no_resume) and progress_done(cur, as_of_date, file_id)
+                if skip:
+                    print(f"[{idx:,}/{len(files):,}] skip file_id={file_id} already processed")
+                    continue
+
+                # record_count is declared nullable in PROGRESS_TABLE, so a missing
+                # catalog value is anticipated; formatting None with ":," raises TypeError.
+                record_count = file_row["record_count"]
+                records_text = "unknown" if record_count is None else f"{record_count:,}"
+                print(
+                    f"[{idx:,}/{len(files):,}] file_id={file_id} state={file_row['state_fips']} "
+                    f"tech={file_row['technology_code']} records={records_text}"
+                )
+                matched_rows, provider_geo_rows = process_file(
+                    conn,
+                    file_row,
+                    as_of_date,
+                    counties,
+                    tracts,
+                    args.chunksize,
+                    temp_dir,
+                )
+                total_matched_rows += matched_rows
+                total_provider_geo_rows += provider_geo_rows
+                print(
+                    f"  complete file_id={file_id}: matched row-events={matched_rows:,}, "
+                    f"provider-geography rows={provider_geo_rows:,}"
+                )
+
+        # Rebuild the rollup from all detail rows for this date, including
+        # those loaded by earlier runs.
+        with conn.cursor() as cur:
+            agg_rows = rebuild_aggregate(cur, as_of_date)
+            updated_rows = 0
+            if not args.no_update_connection:
+                updated_rows = update_connection_table(cur, as_of_date)
+        conn.commit()
+
+    print(f"New matched row-events this run: {total_matched_rows:,}")
+    print(f"New provider-geography detail rows this run: {total_provider_geo_rows:,}")
+    print(f"{AGG_TABLE}: rebuilt {agg_rows:,} geography rows")
+    if not args.no_update_connection:
+        print(f"{CONNECTION_TABLE}: updated {updated_rows:,} rows with location-provider aggregates")
+    return 0
+
+
+# Script entry point: the integer returned by main() becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/build_master_data_centers.py
+++ b/scripts/build_master_data_centers.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""
+Build (or refresh) public.master_data_centers by merging:
+  - public.us_dc_sample_geocoded   (curated, attribute-rich)
+  - public.osm_data_centers        (OpenStreetMap features)
+
+Deduplication rule (curated row wins):
+  Step 1: for each curated row, find a matching OSM row by
+            curated.id = osm.osm_id::text                                OR
+            curated.nominatim_osm_id = osm.osm_id                        OR
+            ST_DWithin(curated.geom, osm.geom, 150 m, geography)
+          (when several OSM rows qualify: id equality wins, then the
+          nominatim id, then the closest by sphere distance).
+  Step 2: insert every curated row into master, filling NULLs from the
+          matched OSM row when present. source = 'merged' if matched,
+          otherwise 'curated'.
+  Step 3: insert every OSM row whose osm_id was NOT matched in Step 1.
+          source = 'osm'.
+
+Result: every curated row appears once; OSM-only rows appear once; no row is
+emitted twice. The merge logic lives in a SQL function
+public.refresh_master_data_centers() so subsequent refreshes are one call.
+"""
+import argparse
+import os
+import sys
+
+import psycopg2
+
+# Database and relation names used throughout the build.
+DB_NAME = "data_centers"
+MASTER_TABLE = "public.master_data_centers"  # merge target, refilled on every refresh
+CURATED_TABLE = "public.us_dc_sample_geocoded"  # curated, attribute-rich source
+OSM_TABLE = "public.osm_data_centers"  # OpenStreetMap-derived source
+# Default spatial match radius in meters: baked into the SQL function's default
+# argument and used as the default of the --radius-m option.
+MATCH_RADIUS_M = 150
+
+
+# Idempotent DDL for the master table and its indexes (every statement uses
+# "if not exists"). `source` is constrained to 'curated', 'osm' or 'merged',
+# and `geom` is a stored generated column built from longitude/latitude, so
+# inserts supply only the two coordinates.
+CREATE_TABLE_SQL = f"""
+create table if not exists {MASTER_TABLE} (
+    master_id              text primary key,
+    source                 text not null check (source in ('curated','osm','merged')),
+    curated_id             text,
+    osm_id                 text,
+    name                   text,
+    operator               text,
+    street_address         text,
+    city                   text,
+    state                  text,
+    postal_code            text,
+    country                text,
+    website                text,
+    phone                  text,
+    power_mw               numeric,
+    area_sqft              integer,
+    nearest_airport_miles  numeric,
+    has_bare_metal         boolean,
+    has_iaas               boolean,
+    has_internet_exchange  boolean,
+    has_colocation         boolean,
+    certifications         text,
+    content_summary        text,
+    osm_tags               jsonb,
+    matched_osm_tag_passes text[],
+    match_method           text,
+    match_distance_m       numeric,
+    longitude              double precision not null,
+    latitude               double precision not null,
+    geom                   geometry(Point, 4326)
+        generated always as (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
+);
+create index if not exists master_data_centers_geom_gix on {MASTER_TABLE} using gist (geom);
+create index if not exists master_data_centers_source_idx on {MASTER_TABLE} (source);
+create index if not exists master_data_centers_state_idx on {MASTER_TABLE} (state);
+create index if not exists master_data_centers_curated_idx on {MASTER_TABLE} (curated_id);
+create index if not exists master_data_centers_osm_idx on {MASTER_TABLE} (osm_id);
+"""
+
+
+# Definition of public.refresh_master_data_centers(match_radius_m). The function
+# empties the master table, picks at most one OSM match per curated row, inserts
+# every curated row (filling NULL attributes from its match), then inserts the
+# OSM rows no curated row claimed, and returns the per-source row counts.
+# Ranking is partitioned by curated row only, so one OSM row may be claimed by
+# several curated rows; it is still excluded from the OSM-only insert.
+# The scratch table is dropped explicitly up front so the function can be called
+# more than once inside a single transaction ("on commit drop" alone would make
+# the second call fail because the table still exists).
+REFRESH_FUNCTION_SQL = f"""
+create or replace function public.refresh_master_data_centers(match_radius_m double precision default {MATCH_RADIUS_M})
+returns table (curated_rows bigint, merged_rows bigint, osm_only_rows bigint, total_rows bigint)
+language plpgsql
+as $$
+begin
+    truncate table {MASTER_TABLE};
+
+    -- the temp table below is only dropped at commit, so clear any copy left
+    -- by an earlier call in this same transaction before recreating it
+    drop table if exists pg_temp._curated_to_osm;
+
+    -- pick a single best OSM match for each curated row, prioritizing ID
+    -- equality, then nominatim id, then closest within radius
+    create temporary table _curated_to_osm on commit drop as
+    with ranked as (
+        select
+            c.id                                  as curated_id,
+            o.id                                  as osm_id,
+            case
+                when c.id = o.osm_id::text       then 'id'
+                when c.nominatim_osm_id = o.osm_id then 'nominatim_id'
+                else 'spatial'
+            end                                  as method,
+            ST_DistanceSphere(c.geom, o.geom)    as dist_m,
+            row_number() over (
+                partition by c.id
+                order by
+                    case
+                        when c.id = o.osm_id::text       then 0
+                        when c.nominatim_osm_id = o.osm_id then 1
+                        else 2
+                    end,
+                    ST_DistanceSphere(c.geom, o.geom) asc
+            )                                    as rn
+        from {CURATED_TABLE} c
+        join {OSM_TABLE} o
+          on c.id = o.osm_id::text
+          or c.nominatim_osm_id = o.osm_id
+          or ST_DWithin(c.geom::geography, o.geom::geography, match_radius_m)
+    )
+    select curated_id, osm_id, method, dist_m
+    from ranked
+    where rn = 1;
+
+    -- Step 1+2: insert curated rows (with OSM nulls filled where matched)
+    insert into {MASTER_TABLE} (
+        master_id, source, curated_id, osm_id,
+        name, operator, street_address, city, state, postal_code, country,
+        website, phone, power_mw, area_sqft, nearest_airport_miles,
+        has_bare_metal, has_iaas, has_internet_exchange, has_colocation,
+        certifications, content_summary,
+        osm_tags, matched_osm_tag_passes,
+        match_method, match_distance_m,
+        longitude, latitude
+    )
+    select
+        'curated/' || c.id,
+        case when m.osm_id is not null then 'merged' else 'curated' end,
+        c.id,
+        m.osm_id,
+        coalesce(c.facility_name, o.name),
+        coalesce(c.provider, o.operator),
+        coalesce(c.street_address, o.street_address),
+        coalesce(c.city, o.city),
+        coalesce(c.state_code, o.state),
+        coalesce(c.postal_code, o.postal_code),
+        coalesce(c.country, o.country),
+        coalesce(c.url, o.website),
+        coalesce(c.phone, o.phone),
+        c.power_mw,
+        c.area_sqft,
+        c.nearest_airport_miles,
+        c.has_bare_metal,
+        c.has_iaas,
+        c.has_internet_exchange,
+        c.has_colocation,
+        c.certifications,
+        c.content_summary,
+        o.tags,
+        o.matched_tags,
+        m.method,
+        round(m.dist_m::numeric, 2),
+        c.longitude,
+        c.latitude
+    from {CURATED_TABLE} c
+    left join _curated_to_osm m on m.curated_id = c.id
+    left join {OSM_TABLE} o on o.id = m.osm_id;
+
+    -- Step 3: insert OSM rows that no curated row claimed
+    insert into {MASTER_TABLE} (
+        master_id, source, curated_id, osm_id,
+        name, operator, street_address, city, state, postal_code, country,
+        website, phone,
+        osm_tags, matched_osm_tag_passes,
+        longitude, latitude
+    )
+    select
+        'osm/' || o.id,
+        'osm',
+        null,
+        o.id,
+        o.name,
+        o.operator,
+        o.street_address,
+        o.city,
+        o.state,
+        o.postal_code,
+        o.country,
+        o.website,
+        o.phone,
+        o.tags,
+        o.matched_tags,
+        o.longitude,
+        o.latitude
+    from {OSM_TABLE} o
+    where not exists (
+        select 1 from _curated_to_osm m where m.osm_id = o.id
+    );
+
+    analyze {MASTER_TABLE};
+
+    return query
+    select
+        count(*) filter (where source = 'curated'),
+        count(*) filter (where source = 'merged'),
+        count(*) filter (where source = 'osm'),
+        count(*)
+    from {MASTER_TABLE};
+end;
+$$;
+"""
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line for the master-table build.
+
+    Returns:
+        Namespace with ``radius_m`` (spatial match radius in meters, default
+        MATCH_RADIUS_M) and ``recreate`` (bool, drop the table first).
+
+    Invalid radii are rejected here with a normal argparse usage error instead
+    of reaching the database: a negative, infinite or NaN radius cannot yield a
+    meaningful ST_DWithin match.
+    """
+
+    def radius_meters(text: str) -> float:
+        # float() raising ValueError makes argparse report "invalid ... value".
+        value = float(text)
+        # The chained comparison is False for negatives, +inf and NaN alike.
+        if not 0 <= value < float("inf"):
+            raise argparse.ArgumentTypeError(
+                f"radius must be a finite, non-negative number of meters, got {text!r}"
+            )
+        return value
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--radius-m",
+        type=radius_meters,
+        default=MATCH_RADIUS_M,
+        help=f"Spatial match radius in meters (default: {MATCH_RADIUS_M}).",
+    )
+    parser.add_argument(
+        "--recreate",
+        action="store_true",
+        help=f"Drop and recreate {MASTER_TABLE} before building.",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    """Create the master table and refresh function, run a refresh, report counts.
+
+    Connection settings come from the PGWEB_HOST / PGWEB_PORT / PGWEB_USER /
+    PGWEB_PASSWORD environment variables (KeyError if one is unset).
+
+    Returns:
+        0 on success; failures propagate as exceptions.
+    """
+    options = parse_args()
+
+    connection = psycopg2.connect(
+        host=os.environ["PGWEB_HOST"],
+        port=os.environ["PGWEB_PORT"],
+        user=os.environ["PGWEB_USER"],
+        password=os.environ["PGWEB_PASSWORD"],
+        dbname=DB_NAME,
+    )
+    try:
+        # Leaving the connection context commits on success and rolls back on
+        # error; the connection itself is closed in the finally clause.
+        with connection, connection.cursor() as cursor:
+            cursor.execute("create extension if not exists postgis")
+            if options.recreate:
+                cursor.execute(f"drop table if exists {MASTER_TABLE} cascade")
+            for ddl in (CREATE_TABLE_SQL, REFRESH_FUNCTION_SQL):
+                cursor.execute(ddl)
+            cursor.execute(
+                "select * from public.refresh_master_data_centers(%s)",
+                (options.radius_m,),
+            )
+            curated, merged, osm_only, total = cursor.fetchone()
+    finally:
+        connection.close()
+
+    report = (
+        f"master_data_centers refreshed (radius={options.radius_m} m):",
+        f"  curated-only rows: {curated}",
+        f"  merged rows (curated + OSM):  {merged}",
+        f"  osm-only rows:     {osm_only}",
+        f"  total:             {total}",
+    )
+    for line in report:
+        print(line)
+    return 0
+
+
+# Script entry point: exit with main()'s return value (0 on success).
+if __name__ == "__main__":
+    sys.exit(main())
--- a/scripts/build_watershed_huc8_tables.py
+++ b/scripts/build_watershed_huc8_tables.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+import argparse
+import os
+import subprocess
+from pathlib import Path
+
+import psycopg2
+
+
+# Database and relation names used by this script.
+DB_NAME = "data_centers"
+TRACT_TABLE = "public.data_center_census_tracts_2024"  # read-only input (tract polygons)
+STAGE_TABLE = "public._watershed_huc8_stage"  # raw ogr2ogr import of the shapefile
+HUC8_TABLE = "public.watershed_huc8"  # de-duplicated HUC8 polygons (rebuilt)
+LINK_TABLE = "public.census_tract_huc8_link"  # tract <-> HUC8 overlaps (rebuilt)
+
+
+def connect():
+    """Open a psycopg2 connection to DB_NAME using the PGWEB_* environment.
+
+    Raises:
+        KeyError: if PGWEB_HOST, PGWEB_PORT, PGWEB_USER or PGWEB_PASSWORD is
+            not set.
+    """
+    environment = os.environ
+    settings = {
+        "host": environment["PGWEB_HOST"],
+        "port": environment["PGWEB_PORT"],
+        "user": environment["PGWEB_USER"],
+        "password": environment["PGWEB_PASSWORD"],
+        "dbname": DB_NAME,
+    }
+    return psycopg2.connect(**settings)
+
+
+def import_huc8_shapefile(shapefile_path):
+    """Load a HUC8 shapefile into STAGE_TABLE with ogr2ogr.
+
+    The layer is reprojected to EPSG:4326, written with MULTIPOLYGON geometry
+    in a column named ``geom`` (FID column ``gid``), and any existing stage
+    table is overwritten.
+
+    Connection settings are handed to ogr2ogr through the libpq PG*
+    environment variables rather than embedded in the "PG:" connection
+    string: command-line arguments are visible to other local users in the
+    process list, and a space-separated key=value string breaks when a value
+    (for example the password) contains whitespace.
+
+    Args:
+        shapefile_path: pathlib.Path of the .shp file to import.
+
+    Raises:
+        KeyError: if a PGWEB_* environment variable is not set.
+        subprocess.CalledProcessError: if ogr2ogr exits with a non-zero status.
+    """
+    env = os.environ.copy()
+    env.update(
+        {
+            "PGHOST": os.environ["PGWEB_HOST"],
+            "PGPORT": os.environ["PGWEB_PORT"],
+            "PGUSER": os.environ["PGWEB_USER"],
+            "PGPASSWORD": os.environ["PGWEB_PASSWORD"],
+            "PGDATABASE": DB_NAME,
+        }
+    )
+    source = str(shapefile_path.resolve())
+
+    cmd = [
+        "ogr2ogr",
+        "-f",
+        "PostgreSQL",
+        f"PG:dbname={DB_NAME}",
+        source,
+        "-nln",
+        STAGE_TABLE,
+        "-nlt",
+        "MULTIPOLYGON",
+        "-t_srs",
+        "EPSG:4326",
+        "-lco",
+        "GEOMETRY_NAME=geom",
+        "-lco",
+        "FID=gid",
+        "-lco",
+        "PRECISION=NO",
+        "-unsetFieldWidth",
+        # NOTE(review): -skipfailures lets ogr2ogr continue past features it
+        # cannot load, so the stage table may be silently incomplete - confirm
+        # this best-effort behaviour is intended.
+        "-skipfailures",
+        "-overwrite",
+    ]
+
+    subprocess.run(cmd, check=True, env=env)
+
+
+def build_final_tables(conn):
+    """Rebuild HUC8_TABLE and LINK_TABLE from the staged shapefile rows.
+
+    Both tables are dropped and recreated inside a single ``with conn`` block,
+    so the rebuild is committed as a whole or rolled back on error.
+
+    Args:
+        conn: open psycopg2 connection to the database holding STAGE_TABLE
+            and TRACT_TABLE.
+    """
+    with conn:
+        with conn.cursor() as cur:
+            # One row per huc8: "distinct on" plus the ordering keeps the row
+            # with the latest loaddate (rows with NULL loaddate sort last).
+            # The short stage column names (sourceorig, sourcedata, ...) are
+            # exposed under longer names - presumably undoing shapefile
+            # field-name truncation; confirm against the source schema.
+            cur.execute(f"drop table if exists {HUC8_TABLE}")
+            cur.execute(
+                f"""
+                create table {HUC8_TABLE} as
+                select distinct on (huc8)
+                    huc8,
+                    name,
+                    states,
+                    areaacres,
+                    areasqkm,
+                    loaddate,
+                    sourceorig as sourceoriginator,
+                    sourcedata as sourcedatadesc,
+                    sourcefeat as sourcefeatureid,
+                    metasource as metasourceid,
+                    tnmid,
+                    geom::geometry(MultiPolygon, 4326) as geom
+                from {STAGE_TABLE}
+                where huc8 is not null
+                order by huc8, loaddate desc nulls last
+                """
+            )
+            cur.execute(f"alter table {HUC8_TABLE} add primary key (huc8)")
+            cur.execute(
+                f"create index watershed_huc8_geom_gix on {HUC8_TABLE} using gist (geom)"
+            )
+            cur.execute(
+                f"create index watershed_huc8_states_idx on {HUC8_TABLE} (states)"
+            )
+
+            # One row per intersecting (tract, HUC8) pair. Candidate pairs come
+            # from a geometry st_intersects join; areas are then measured on
+            # geography, i.e. in square meters. Pairs whose overlap area is 0
+            # are dropped. Note that tract_overlap_pct is a 0-1 fraction of the
+            # tract area (no x100), and nullif() avoids dividing by a zero area.
+            cur.execute(f"drop table if exists {LINK_TABLE}")
+            cur.execute(
+                f"""
+                create table {LINK_TABLE} as
+                select
+                    geoid,
+                    huc8,
+                    overlap_sqm,
+                    overlap_sqm / 1000000.0 as overlap_sqkm,
+                    overlap_sqm / nullif(tract_sqm, 0.0) as tract_overlap_pct
+                from (
+                    select
+                        tr.geoid,
+                        wh.huc8,
+                        st_area(
+                            st_intersection(
+                                tr.geom::geography,
+                                wh.geom::geography
+                            )
+                        ) as overlap_sqm,
+                        st_area(tr.geom::geography) as tract_sqm
+                    from {TRACT_TABLE} tr
+                    join {HUC8_TABLE} wh
+                      on st_intersects(tr.geom, wh.geom)
+                ) as overlap_rows
+                where overlap_sqm > 0
+                """
+            )
+            cur.execute(
+                f"create index census_tract_huc8_link_geoid_idx on {LINK_TABLE} (geoid)"
+            )
+            cur.execute(
+                f"create index census_tract_huc8_link_huc8_idx on {LINK_TABLE} (huc8)"
+            )
+
+            # Refresh planner statistics for the stage and both rebuilt tables.
+            cur.execute(f"analyze {STAGE_TABLE}")
+            cur.execute(f"analyze {HUC8_TABLE}")
+            cur.execute(f"analyze {LINK_TABLE}")
+
+
+def parse_args():
+    """Parse the command line for the HUC8 build.
+
+    Returns:
+        argparse.Namespace with ``shapefile`` (str, default
+        "HUC8_CONUS/HUC8_US.shp") and ``build_only`` (bool, default False).
+    """
+    summary = (
+        "Build watershed HUC8 boundaries and GEOID linkage tables from "
+        "a local HUC8 shapefile."
+    )
+    cli = argparse.ArgumentParser(description=summary)
+    # Options are registered in the same order so --help output is unchanged.
+    option_specs = (
+        (
+            "--shapefile",
+            {
+                "default": "HUC8_CONUS/HUC8_US.shp",
+                "help": "Path to the HUC8 shapefile to import.",
+            },
+        ),
+        (
+            "--build-only",
+            {
+                "action": "store_true",
+                "help": "Skip imports and rebuild final/link tables from existing stage data.",
+            },
+        ),
+    )
+    for flag, settings in option_specs:
+        cli.add_argument(flag, **settings)
+    return cli.parse_args()
+
+
+def main():
+    """Import the HUC8 shapefile (unless --build-only) and rebuild the tables.
+
+    Prints a one-line summary with the resulting row counts.
+
+    Raises:
+        FileNotFoundError: if an import is requested and the shapefile path
+            does not exist.
+    """
+    options = parse_args()
+    shapefile_path = Path(options.shapefile)
+
+    if not options.build_only:
+        # Fail before touching the database when the input is missing.
+        if not shapefile_path.exists():
+            raise FileNotFoundError(f"shapefile not found: {shapefile_path}")
+        print(f"importing HUC8 shapefile from {shapefile_path}")
+        import_huc8_shapefile(shapefile_path)
+
+    conn = connect()
+    try:
+        build_final_tables(conn)
+        row_counts = []
+        with conn.cursor() as cur:
+            for table in (HUC8_TABLE, LINK_TABLE):
+                cur.execute(f"select count(*) from {table}")
+                row_counts.append(cur.fetchone()[0])
+        huc8_rows, link_rows = row_counts
+    finally:
+        conn.close()
+
+    print(
+        f"done: source={shapefile_path}, huc8_rows={huc8_rows}, "
+        f"geoid_huc8_links={link_rows}"
+    )
+
+
+# Script entry point.
+if __name__ == "__main__":
+    main()
--- a/scripts/create_data_center_census_tract_table.py
+++ b/scripts/create_data_center_census_tract_table.py
@@ -0,0 +1,731 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import json
+import os
+import subprocess
+import urllib.parse
+import urllib.request
+from decimal import Decimal
+from pathlib import Path
+
+import psycopg2
+from psycopg2.extras import execute_values
+
+
+# Database objects: the point table is read; the stage and final tables are
+# written by this script.
+DB_NAME = "data_centers"
+POINT_TABLE = "public.master_data_centers"
+POINT_ID_COL = "master_id"
+BOUNDARY_STAGE_TABLE = "public._dc_census_tract_boundaries_2024"
+ACS_STAGE_TABLE = "public._dc_census_tract_acs_2024"
+FINAL_TABLE = "public.data_center_census_tracts_2024"
+
+# ACS vintage and the tract boundary archive that goes with it.
+ACS_YEAR = 2024
+ACS_SOURCE = "ACS 2024 5-year profile"
+# Relative path: the downloaded archive lands in the current working directory.
+TRACT_ZIP = Path("cb_2024_us_tract_500k.zip")
+TRACT_ZIP_URL = (
+    "https://www2.census.gov/geo/tiger/GENZ2024/shp/cb_2024_us_tract_500k.zip"
+)
+# Relative path as well: CSV copy of the fetched ACS rows, written by fetch_acs().
+ACS_AUDIT_CSV = Path("census_tract_acs_2024_selected_states.csv")
+
+# Full state / territory name -> two-letter code, used by normalize_state() for
+# values that are not already a code. The Virgin Islands appear under three
+# spellings that all map to "VI".
+STATE_NAME_TO_CODE = {
+    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
+    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
+    "District of Columbia": "DC", "Florida": "FL", "Georgia": "GA", "Hawaii": "HI",
+    "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
+    "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME",
+    "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN",
+    "Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE",
+    "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM",
+    "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH",
+    "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI",
+    "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX",
+    "Utah": "UT", "Vermont": "VT", "Virginia": "VA", "Washington": "WA",
+    "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
+    "American Samoa": "AS", "Guam": "GU", "Northern Mariana Islands": "MP",
+    "Puerto Rico": "PR", "United States Virgin Islands": "VI",
+    "U.S. Virgin Islands": "VI", "Virgin Islands": "VI",
+}
+
+# Two-letter state / territory code -> two-digit FIPS code (as a string, so the
+# leading zero survives). 50 states and DC first, then the five territories.
+STATE_FIPS = {
+    "AL": "01", "AK": "02", "AZ": "04", "AR": "05",
+    "CA": "06", "CO": "08", "CT": "09", "DE": "10",
+    "DC": "11", "FL": "12", "GA": "13", "HI": "15",
+    "ID": "16", "IL": "17", "IN": "18", "IA": "19",
+    "KS": "20", "KY": "21", "LA": "22", "ME": "23",
+    "MD": "24", "MA": "25", "MI": "26", "MN": "27",
+    "MS": "28", "MO": "29", "MT": "30", "NE": "31",
+    "NV": "32", "NH": "33", "NJ": "34", "NM": "35",
+    "NY": "36", "NC": "37", "ND": "38", "OH": "39",
+    "OK": "40", "OR": "41", "PA": "42", "RI": "44",
+    "SC": "45", "SD": "46", "TN": "47", "TX": "48",
+    "UT": "49", "VT": "50", "VA": "51", "WA": "53",
+    "WV": "54", "WI": "55", "WY": "56",
+    "AS": "60", "GU": "66", "MP": "69", "PR": "72", "VI": "78",
+}
+
+# ACS data-profile variable code -> column name used in the stage table and the
+# audit CSV. In this mapping every code ending in "PE" feeds a *_pct / *_rate
+# column, and the remaining "E" codes feed counts, medians and dollar amounts.
+# NOTE(review): profile variable numbering can shift between ACS vintages -
+# confirm these codes against the variable list for ACS_YEAR.
+ACS_VARIABLES = {
+    "DP05_0001E": "population",
+    "DP05_0018E": "median_age",
+    "DP02_0001E": "households",
+    "DP02_0016E": "avg_household_size",
+    "DP02_0067PE": "high_school_or_higher_pct",
+    "DP02_0068PE": "bachelor_or_higher_pct",
+    "DP02_0154PE": "broadband_subscription_pct",
+    "DP03_0001E": "population_16_over",
+    "DP03_0002E": "labor_force",
+    "DP03_0005E": "unemployed",
+    "DP03_0009PE": "unemployment_rate",
+    "DP03_0032E": "industry_total_workers",
+    "DP03_0033E": "industry_agriculture_mining_workers",
+    "DP03_0034E": "industry_construction_workers",
+    "DP03_0035E": "industry_manufacturing_workers",
+    "DP03_0036E": "industry_wholesale_trade_workers",
+    "DP03_0037E": "industry_retail_trade_workers",
+    "DP03_0038E": "industry_transportation_warehousing_utilities_workers",
+    "DP03_0039E": "industry_information_workers",
+    "DP03_0040E": "industry_finance_real_estate_workers",
+    "DP03_0041E": "industry_professional_management_admin_workers",
+    "DP03_0042E": "industry_education_health_social_workers",
+    "DP03_0043E": "industry_arts_entertainment_food_workers",
+    "DP03_0044E": "industry_other_services_workers",
+    "DP03_0045E": "industry_public_administration_workers",
+    "DP03_0062E": "median_household_income",
+    "DP03_0088E": "per_capita_income",
+    "DP03_0119PE": "family_poverty_rate",
+    "DP03_0128PE": "poverty_rate",
+    "DP05_0090E": "hispanic_latino_population",
+    "DP05_0090PE": "hispanic_latino_pct",
+    "DP05_0096E": "non_hispanic_white_population",
+    "DP05_0096PE": "non_hispanic_white_pct",
+    "DP05_0097E": "non_hispanic_black_population",
+    "DP05_0097PE": "non_hispanic_black_pct",
+    "DP05_0099E": "non_hispanic_asian_population",
+    "DP05_0099PE": "non_hispanic_asian_pct",
+}
+
+# Columns that clean_acs_value() converts to int. Despite the name this also
+# covers the two dollar-amount columns (median_household_income,
+# per_capita_income), which are stored as integers too.
+COUNT_COLUMNS = {
+    "population",
+    "households",
+    "population_16_over",
+    "labor_force",
+    "unemployed",
+    "industry_total_workers",
+    "industry_agriculture_mining_workers",
+    "industry_construction_workers",
+    "industry_manufacturing_workers",
+    "industry_wholesale_trade_workers",
+    "industry_retail_trade_workers",
+    "industry_transportation_warehousing_utilities_workers",
+    "industry_information_workers",
+    "industry_finance_real_estate_workers",
+    "industry_professional_management_admin_workers",
+    "industry_education_health_social_workers",
+    "industry_arts_entertainment_food_workers",
+    "industry_other_services_workers",
+    "industry_public_administration_workers",
+    "median_household_income",
+    "per_capita_income",
+    "hispanic_latino_population",
+    "non_hispanic_white_population",
+    "non_hispanic_black_population",
+    "non_hispanic_asian_population",
+}
+
+# Every other ACS column (ages, averages, percentages, rates) is kept as Decimal.
+NUMERIC_COLUMNS = set(ACS_VARIABLES.values()) - COUNT_COLUMNS
+
+# Industry worker-count column -> human-readable industry label. The dict order
+# is the iteration order in add_primary_industry(), so on a tie the industry
+# listed first here is reported as the primary one.
+INDUSTRY_COLUMNS = {
+    "industry_agriculture_mining_workers": "Agriculture, forestry, fishing and hunting, and mining",
+    "industry_construction_workers": "Construction",
+    "industry_manufacturing_workers": "Manufacturing",
+    "industry_wholesale_trade_workers": "Wholesale trade",
+    "industry_retail_trade_workers": "Retail trade",
+    "industry_transportation_warehousing_utilities_workers": "Transportation and warehousing, and utilities",
+    "industry_information_workers": "Information",
+    "industry_finance_real_estate_workers": "Finance and insurance, and real estate and rental and leasing",
+    "industry_professional_management_admin_workers": "Professional, scientific, management, administrative, and waste management services",
+    "industry_education_health_social_workers": "Educational services, and health care and social assistance",
+    "industry_arts_entertainment_food_workers": "Arts, entertainment, recreation, accommodation, and food services",
+    "industry_other_services_workers": "Other services, except public administration",
+    "industry_public_administration_workers": "Public administration",
+}
+
+# Placeholder strings that clean_acs_value() treats as "no data": each
+# sentinel number is listed both as an integer and with a trailing ".0".
+SPECIAL_VALUES = {
+    sentinel + suffix
+    for sentinel in (
+        "-222222222",
+        "-333333333",
+        "-555555555",
+        "-666666666",
+        "-888888888",
+        "-999999999",
+    )
+    for suffix in ("", ".0")
+}
+
+
+def connect():
+    """Open a psycopg2 connection to DB_NAME using the PGWEB_* environment.
+
+    Raises:
+        KeyError: if PGWEB_HOST, PGWEB_PORT, PGWEB_USER or PGWEB_PASSWORD is
+            not set.
+    """
+    environment = os.environ
+    settings = {
+        "host": environment["PGWEB_HOST"],
+        "port": environment["PGWEB_PORT"],
+        "user": environment["PGWEB_USER"],
+        "password": environment["PGWEB_PASSWORD"],
+        "dbname": DB_NAME,
+    }
+    return psycopg2.connect(**settings)
+
+
+def normalize_state(value):
+    """Map a raw state value to its two-letter code, or None if unknown.
+
+    Accepts either a two-letter code or a full state/territory name.
+    Surrounding whitespace and letter case are ignored for both forms, so
+    " ca ", "CA" and "california" all resolve to "CA".
+
+    Args:
+        value: raw text from the state column, or None.
+
+    Returns:
+        The two-letter code (a key of STATE_FIPS), or None when the value is
+        None, blank, or not a recognised code or name.
+    """
+    if value is None:
+        return None
+    text = value.strip()
+    if not text:
+        return None
+
+    code = text.upper()
+    if code in STATE_FIPS:
+        return code
+
+    # Exact name first (the common case), then a case-insensitive scan.
+    exact = STATE_NAME_TO_CODE.get(text)
+    if exact is not None:
+        return exact
+    folded = text.casefold()
+    for name, abbreviation in STATE_NAME_TO_CODE.items():
+        if name.casefold() == folded:
+            return abbreviation
+    return None
+
+
+def get_state_fips(conn):
+    """Return the sorted state FIPS codes whose tracts need to be imported.
+
+    Reads the distinct ``state`` values of POINT_TABLE and normalises each one
+    with normalize_state(). If every row has a state, only the FIPS codes of
+    those states are returned. If any row has a NULL state, a warning is
+    printed and the FIPS codes of every STATE_FIPS entry except AS, GU, MP and
+    VI are returned instead.
+
+    Args:
+        conn: open psycopg2 connection.
+
+    Raises:
+        RuntimeError: if a non-NULL state value cannot be normalised.
+    """
+    with conn.cursor() as cur:
+        cur.execute(
+            f"select state, count(*) from {POINT_TABLE} group by state order by state nulls last"
+        )
+        state_counts = cur.fetchall()
+
+    rows_without_state = 0
+    counts_by_code = {}
+    unrecognized = []
+    for raw_state, row_count in state_counts:
+        if raw_state is None:
+            rows_without_state += row_count
+        else:
+            code = normalize_state(raw_state)
+            if code is None:
+                unrecognized.append((raw_state, row_count))
+            else:
+                counts_by_code[code] = counts_by_code.get(code, 0) + row_count
+
+    if unrecognized:
+        details = ", ".join(f"{name!r}({n})" for name, n in unrecognized)
+        raise RuntimeError(f"Unrecognized state values in {POINT_TABLE}: {details}")
+
+    if not rows_without_state:
+        return sorted({STATE_FIPS[code] for code in counts_by_code})
+
+    print(
+        f"warning: {rows_without_state} master_data_centers rows have NULL state; "
+        f"importing tract boundaries for all 50 states + DC + PR so spatial join can resolve them."
+    )
+    # Census ACS 5-year DP profile lacks coverage for the small island territories;
+    # restrict to the 50 states + DC + PR which the ACS profile reliably serves.
+    excluded_territories = {"AS", "GU", "MP", "VI"}
+    return sorted(
+        {
+            fips
+            for code, fips in STATE_FIPS.items()
+            if code not in excluded_territories
+        }
+    )
+
+
+def ensure_final_table_absent(conn):
+    """Refuse to continue when FINAL_TABLE already exists.
+
+    Raises:
+        RuntimeError: if to_regclass() resolves FINAL_TABLE to a relation.
+    """
+    with conn.cursor() as cur:
+        cur.execute("select to_regclass(%s)", (FINAL_TABLE,))
+        (existing,) = cur.fetchone()
+    if existing is None:
+        return
+    raise RuntimeError(
+        f"Target table {FINAL_TABLE} already exists; refusing to overwrite it."
+    )
+
+
+def drop_final_table_if_exists(conn):
+    """Drop FINAL_TABLE when present and commit the change."""
+    statement = f"drop table if exists {FINAL_TABLE}"
+    # Leaving the connection context commits (or rolls back on error).
+    with conn, conn.cursor() as cur:
+        cur.execute(statement)
+
+
+def download_tract_boundaries():
+    """Download the national tract boundary archive to TRACT_ZIP if needed.
+
+    An existing file larger than 50,000,000 bytes is treated as a complete
+    download and left alone. Otherwise the archive is streamed to a ".part"
+    file in 1 MiB chunks and then moved into place.
+
+    The final move uses Path.replace() rather than Path.rename(): the download
+    path is also taken when a too-small TRACT_ZIP already exists, and on
+    Windows rename() raises FileExistsError when the target is present,
+    whereas replace() overwrites it on every platform.
+    """
+    if TRACT_ZIP.exists() and TRACT_ZIP.stat().st_size > 50_000_000:
+        return
+    tmp_path = TRACT_ZIP.with_suffix(".zip.part")
+    with urllib.request.urlopen(TRACT_ZIP_URL, timeout=120) as response:
+        with tmp_path.open("wb") as out:
+            while True:
+                chunk = response.read(1024 * 1024)
+                if not chunk:
+                    break
+                out.write(chunk)
+    tmp_path.replace(TRACT_ZIP)
+
+
+def import_tract_boundaries(state_fips):
+    """Load the tract polygons of the given states into BOUNDARY_STAGE_TABLE.
+
+    Runs ogr2ogr against the shapefile inside TRACT_ZIP (via /vsizip/),
+    filtered by STATEFP, reprojected to EPSG:4326 and written with
+    MULTIPOLYGON geometry; an existing stage table is overwritten.
+    Connection settings are passed through the libpq PG* environment
+    variables, so no credentials appear on the command line.
+
+    Args:
+        state_fips: iterable of two-digit state FIPS code strings.
+
+    Raises:
+        ValueError: if ``state_fips`` is empty (the filter would otherwise be
+            the invalid expression "STATEFP IN ()").
+        KeyError: if a PGWEB_* environment variable is not set.
+        subprocess.CalledProcessError: if ogr2ogr exits with a non-zero status.
+    """
+    states = sorted(state_fips)
+    if not states:
+        raise ValueError("state_fips is empty; no tract boundaries to import")
+    where = "STATEFP IN ({})".format(",".join(f"'{state}'" for state in states))
+    env = os.environ.copy()
+    env.update(
+        {
+            "PGHOST": os.environ["PGWEB_HOST"],
+            "PGPORT": os.environ["PGWEB_PORT"],
+            "PGUSER": os.environ["PGWEB_USER"],
+            "PGPASSWORD": os.environ["PGWEB_PASSWORD"],
+            "PGDATABASE": DB_NAME,
+        }
+    )
+    cmd = [
+        "ogr2ogr",
+        "-f",
+        "PostgreSQL",
+        # Use DB_NAME here too so the target cannot drift from PGDATABASE.
+        f"PG:dbname={DB_NAME}",
+        f"/vsizip/{TRACT_ZIP.resolve()}/cb_2024_us_tract_500k.shp",
+        "-nln",
+        BOUNDARY_STAGE_TABLE,
+        "-overwrite",
+        "-nlt",
+        "MULTIPOLYGON",
+        "-t_srs",
+        "EPSG:4326",
+        "-lco",
+        "GEOMETRY_NAME=geom",
+        "-lco",
+        "FID=gid",
+        "-where",
+        where,
+    ]
+    subprocess.run(cmd, check=True, env=env)
+
+
+def fetch_acs_for_state(state_fips):
+    """Fetch the ACS profile rows for every tract of one state.
+
+    Requests NAME plus every variable in ACS_VARIABLES from the ACS_YEAR
+    5-year profile endpoint. The CENSUS_API_KEY environment variable is sent
+    as the API key when it is set.
+
+    Args:
+        state_fips: two-digit state FIPS code string.
+
+    Returns:
+        A list with one dict per tract: geoid (state + county + tract),
+        acs_name, statefp, countyfp, tractce, one cleaned value per
+        ACS_VARIABLES column, and the primary_industry* fields added by
+        add_primary_industry().
+
+    Raises:
+        RuntimeError: on an HTTP error status, a network-level failure, a
+            non-JSON body, or a JSON payload that is not a non-empty list.
+    """
+    # Imported explicitly: the file-level imports only name urllib.parse and
+    # urllib.request, so urllib.error was reachable only as a side effect.
+    import urllib.error
+
+    variables = ["NAME", *ACS_VARIABLES.keys()]
+    params = {
+        "get": ",".join(variables),
+        "for": "tract:*",
+        "in": f"state:{state_fips} county:*",
+    }
+    api_key = os.environ.get("CENSUS_API_KEY")
+    if api_key:
+        params["key"] = api_key
+    url = (
+        f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5/profile?"
+        + urllib.parse.urlencode(params)
+    )
+    try:
+        with urllib.request.urlopen(url, timeout=120) as response:
+            body = response.read().decode("utf-8")
+    except urllib.error.HTTPError as exc:
+        body = exc.read().decode("utf-8", errors="replace")
+        raise RuntimeError(
+            f"Census ACS request failed for state {state_fips}: HTTP {exc.code} — {body[:300]}"
+        ) from exc
+    except urllib.error.URLError as exc:
+        # DNS/connection failures carry no HTTP status; report them with the
+        # same state context. The URL is left out because it may hold the key.
+        raise RuntimeError(
+            f"Census ACS request failed for state {state_fips}: {exc.reason}"
+        ) from exc
+    try:
+        data = json.loads(body)
+    except json.JSONDecodeError as exc:
+        raise RuntimeError(
+            f"Census ACS returned non-JSON for state {state_fips}: {body[:300]}"
+        ) from exc
+    # The first element must be the header row that names each value.
+    if not isinstance(data, list) or not data:
+        raise RuntimeError(
+            f"Census ACS returned no header row for state {state_fips}: {body[:300]}"
+        )
+
+    header = data[0]
+    rows = []
+    for values in data[1:]:
+        raw = dict(zip(header, values))
+        row = {
+            "geoid": raw["state"] + raw["county"] + raw["tract"],
+            "acs_name": raw["NAME"],
+            "statefp": raw["state"],
+            "countyfp": raw["county"],
+            "tractce": raw["tract"],
+        }
+        for acs_var, column in ACS_VARIABLES.items():
+            row[column] = clean_acs_value(raw.get(acs_var), column)
+        add_primary_industry(row)
+        rows.append(row)
+    return rows
+
+
+def clean_acs_value(value, column):
+    """Convert one raw ACS value to the Python type used for ``column``.
+
+    Returns None for None, "", "null" and any SPECIAL_VALUES placeholder;
+    an int for COUNT_COLUMNS; a Decimal for NUMERIC_COLUMNS; and the value
+    unchanged for any other column.
+    """
+    is_missing = value in (None, "", "null") or value in SPECIAL_VALUES
+    if is_missing:
+        return None
+    if column in COUNT_COLUMNS:
+        # Go through Decimal so strings such as "12.0" still parse as integers.
+        return int(Decimal(value))
+    return Decimal(value) if column in NUMERIC_COLUMNS else value
+
+
+def add_primary_industry(row):
+    """Add the primary_industry* fields to ``row`` in place.
+
+    The primary industry is the INDUSTRY_COLUMNS entry with the largest
+    non-None worker count; on a tie the entry listed first wins. Sets
+    ``primary_industry`` (label), ``primary_industry_workers`` (count) and
+    ``primary_industry_pct`` (share of industry_total_workers, times 100).
+    The label and count are None when no industry has a value, and the
+    percentage is None when there is no count or the total is missing or zero.
+    """
+    reported = [
+        (column, row.get(column))
+        for column in INDUSTRY_COLUMNS
+        if row.get(column) is not None
+    ]
+    # max() keeps the first of several equal maxima, matching a strict ">" scan.
+    top_column, top_workers = max(
+        reported, key=lambda pair: pair[1], default=(None, None)
+    )
+
+    row["primary_industry"] = INDUSTRY_COLUMNS.get(top_column)
+    row["primary_industry_workers"] = top_workers
+
+    total_workers = row.get("industry_total_workers")
+    if top_workers is None or not total_workers:
+        row["primary_industry_pct"] = None
+    else:
+        row["primary_industry_pct"] = Decimal(top_workers * 100) / Decimal(total_workers)
+
+
+def fetch_acs(state_fips):
+    """Fetch ACS rows for several states and write them to ACS_AUDIT_CSV.
+
+    Args:
+        state_fips: iterable of two-digit state FIPS code strings.
+
+    Returns:
+        ``(rows, fieldnames)``: the combined row dicts in request order, and
+        the column order used for the CSV header.
+    """
+    fieldnames = ["geoid", "acs_name", "statefp", "countyfp", "tractce"]
+    fieldnames += ACS_VARIABLES.values()
+    fieldnames += [
+        "primary_industry",
+        "primary_industry_workers",
+        "primary_industry_pct",
+    ]
+
+    rows = []
+    for state in state_fips:
+        rows += fetch_acs_for_state(state)
+
+    with ACS_AUDIT_CSV.open("w", newline="", encoding="utf-8") as csv_file:
+        audit = csv.DictWriter(csv_file, fieldnames=fieldnames)
+        audit.writeheader()
+        audit.writerows(rows)
+    return rows, fieldnames
+
+
+def load_acs_stage(conn, rows, fieldnames):
+    """Recreate ACS_STAGE_TABLE and bulk-load the fetched ACS rows into it.
+
+    The drop, create, insert and analyze all run inside one ``with conn``
+    block, so they are committed together or rolled back on error.
+
+    Args:
+        conn: open psycopg2 connection.
+        rows: list of row dicts, as returned by fetch_acs().
+        fieldnames: column names, in the order used for the insert; they must
+            be columns of the table created below.
+    """
+    with conn:
+        with conn.cursor() as cur:
+            cur.execute(f"drop table if exists {ACS_STAGE_TABLE}")
+            # Count-like columns are integer, everything else numeric; geoid is
+            # the primary key, so duplicate tracts in `rows` make the insert fail.
+            cur.execute(
+                f"""
+                create table {ACS_STAGE_TABLE} (
+                    geoid text primary key,
+                    acs_name text,
+                    statefp text,
+                    countyfp text,
+                    tractce text,
+                    population integer,
+                    median_age numeric,
+                    households integer,
+                    avg_household_size numeric,
+                    high_school_or_higher_pct numeric,
+                    bachelor_or_higher_pct numeric,
+                    broadband_subscription_pct numeric,
+                    population_16_over integer,
+                    labor_force integer,
+                    unemployed integer,
+                    unemployment_rate numeric,
+                    industry_total_workers integer,
+                    industry_agriculture_mining_workers integer,
+                    industry_construction_workers integer,
+                    industry_manufacturing_workers integer,
+                    industry_wholesale_trade_workers integer,
+                    industry_retail_trade_workers integer,
+                    industry_transportation_warehousing_utilities_workers integer,
+                    industry_information_workers integer,
+                    industry_finance_real_estate_workers integer,
+                    industry_professional_management_admin_workers integer,
+                    industry_education_health_social_workers integer,
+                    industry_arts_entertainment_food_workers integer,
+                    industry_other_services_workers integer,
+                    industry_public_administration_workers integer,
+                    median_household_income integer,
+                    per_capita_income integer,
+                    family_poverty_rate numeric,
+                    poverty_rate numeric,
+                    hispanic_latino_population integer,
+                    hispanic_latino_pct numeric,
+                    non_hispanic_white_population integer,
+                    non_hispanic_white_pct numeric,
+                    non_hispanic_black_population integer,
+                    non_hispanic_black_pct numeric,
+                    non_hispanic_asian_population integer,
+                    non_hispanic_asian_pct numeric,
+                    primary_industry text,
+                    primary_industry_workers integer,
+                    primary_industry_pct numeric
+                )
+                """
+            )
+            # row.get() yields None (SQL NULL) for any column a row lacks.
+            values = [tuple(row.get(column) for column in fieldnames) for row in rows]
+            # execute_values expands the single %s into batches of 1000 rows.
+            execute_values(
+                cur,
+                f"insert into {ACS_STAGE_TABLE} ({', '.join(fieldnames)}) values %s",
+                values,
+                page_size=1000,
+            )
+            cur.execute(f"analyze {ACS_STAGE_TABLE}")
+
+
+def create_final_table(conn):
+    """Build FINAL_TABLE: one row per census tract that contains a data center.
+
+    Steps, all executed inside a single ``with conn:`` block (a single
+    transaction, assuming ``conn`` is a psycopg2 connection):
+
+    1. Recreate the GiST index on the boundary staging table and analyze it.
+    2. ``create table ... as`` joining staged tract boundaries to POINT_TABLE
+       and left-joining the staged ACS attributes.
+    3. Add the primary key, secondary indexes and a table comment, then
+       analyze the new table.
+
+    The create statement has no ``if not exists``, so FINAL_TABLE must not
+    already exist when this runs.
+
+    Args:
+        conn: Open database connection.
+    """
+    with conn:
+        with conn.cursor() as cur:
+            # Rebuild the staging table's spatial index from scratch so the
+            # point-in-tract join below can use it.
+            cur.execute("drop index if exists _dc_census_tract_boundaries_2024_geom_gix")
+            cur.execute(
+                f"create index _dc_census_tract_boundaries_2024_geom_gix on {BOUNDARY_STAGE_TABLE} using gist (geom)"
+            )
+            cur.execute(f"analyze {BOUNDARY_STAGE_TABLE}")
+            # dc_tracts aggregates points per tract: a total count, one count
+            # per `source` value, the sorted point ids and the distinct
+            # non-null operators. The outer select inner-joins dc_tracts, so
+            # tracts without any point are dropped; the ACS stage is
+            # left-joined, so a tract with no ACS row keeps NULL ACS columns.
+            # NOTE(review): ST_Covers also matches points lying on a tract
+            # boundary, so a point exactly on a shared edge would presumably
+            # be counted in both tracts here - confirm whether that matters.
+            cur.execute(
+                f"""
+                create table {FINAL_TABLE} as
+                with dc_tracts as (
+                    select
+                        t.geoid,
+                        count(*)::integer as data_center_count,
+                        count(*) filter (where dc.source = 'curated')::integer
+                            as curated_only_data_center_count,
+                        count(*) filter (where dc.source = 'merged')::integer
+                            as merged_data_center_count,
+                        count(*) filter (where dc.source = 'osm')::integer
+                            as osm_only_data_center_count,
+                        array_agg(dc.{POINT_ID_COL} order by dc.{POINT_ID_COL}) as data_center_ids,
+                        array_agg(distinct dc.operator) filter (where dc.operator is not null)
+                            as operators
+                    from {BOUNDARY_STAGE_TABLE} t
+                    join {POINT_TABLE} dc
+                        on t.geom && dc.geom
+                       and ST_Covers(t.geom, dc.geom)
+                    group by t.geoid
+                )
+                select
+                    t.geoid,
+                    t.statefp,
+                    t.countyfp,
+                    t.tractce,
+                    t.name as tract_name,
+                    t.namelsad,
+                    t.aland::bigint as land_area_sqm,
+                    t.awater::bigint as water_area_sqm,
+                    {ACS_YEAR}::integer as acs_year,
+                    '{ACS_SOURCE}'::text as acs_source,
+                    a.acs_name,
+                    d.data_center_count,
+                    d.curated_only_data_center_count,
+                    d.merged_data_center_count,
+                    d.osm_only_data_center_count,
+                    d.data_center_ids,
+                    d.operators,
+                    a.population,
+                    a.median_age,
+                    a.households,
+                    a.avg_household_size,
+                    a.high_school_or_higher_pct,
+                    a.bachelor_or_higher_pct,
+                    a.broadband_subscription_pct,
+                    a.population_16_over,
+                    a.labor_force,
+                    a.unemployed,
+                    a.unemployment_rate,
+                    a.median_household_income,
+                    a.per_capita_income,
+                    a.family_poverty_rate,
+                    a.poverty_rate,
+                    a.hispanic_latino_population,
+                    a.hispanic_latino_pct,
+                    a.non_hispanic_white_population,
+                    a.non_hispanic_white_pct,
+                    a.non_hispanic_black_population,
+                    a.non_hispanic_black_pct,
+                    a.non_hispanic_asian_population,
+                    a.non_hispanic_asian_pct,
+                    a.industry_total_workers,
+                    a.industry_agriculture_mining_workers,
+                    a.industry_construction_workers,
+                    a.industry_manufacturing_workers,
+                    a.industry_wholesale_trade_workers,
+                    a.industry_retail_trade_workers,
+                    a.industry_transportation_warehousing_utilities_workers,
+                    a.industry_information_workers,
+                    a.industry_finance_real_estate_workers,
+                    a.industry_professional_management_admin_workers,
+                    a.industry_education_health_social_workers,
+                    a.industry_arts_entertainment_food_workers,
+                    a.industry_other_services_workers,
+                    a.industry_public_administration_workers,
+                    a.primary_industry,
+                    a.primary_industry_workers,
+                    a.primary_industry_pct,
+                    t.geom::geometry(MultiPolygon, 4326) as geom
+                from {BOUNDARY_STAGE_TABLE} t
+                join dc_tracts d on d.geoid = t.geoid
+                left join {ACS_STAGE_TABLE} a on a.geoid = t.geoid
+                """
+            )
+            # geoid is unique per output row because dc_tracts groups by it.
+            cur.execute(f"alter table {FINAL_TABLE} add primary key (geoid)")
+            # Secondary indexes: spatial lookups, state/county filtering and
+            # ranking tracts by number of data centers.
+            cur.execute(
+                f"create index data_center_census_tracts_2024_geom_gix on {FINAL_TABLE} using gist (geom)"
+            )
+            cur.execute(
+                f"create index data_center_census_tracts_2024_state_county_idx on {FINAL_TABLE} (statefp, countyfp)"
+            )
+            cur.execute(
+                f"create index data_center_census_tracts_2024_dc_count_idx on {FINAL_TABLE} (data_center_count desc)"
+            )
+            cur.execute(
+                f"""
+                comment on table {FINAL_TABLE} is
+                'Census tracts containing records from public.master_data_centers (curated + OSM merged), enriched with ACS 2024 5-year profile demographics and derived primary industry fields.'
+                """
+            )
+            cur.execute(f"analyze {FINAL_TABLE}")
+
+
+def assign_point_geoids(conn):
+    """Write the covering tract's geoid onto every row of POINT_TABLE.
+
+    Adds a ``geoid`` text column if it is missing, then updates every point
+    with the geoid of a staged tract whose geometry covers it. When several
+    tracts cover the same point the lowest geoid wins; when none does the
+    scalar subquery yields NULL and the point's geoid is set to NULL. An
+    index on ``geoid`` is then ensured and the table analyzed. All four
+    statements run inside one ``with conn`` block.
+
+    Args:
+        conn: Open database connection.
+    """
+    add_column_sql = f"alter table {POINT_TABLE} add column if not exists geoid text"
+    assign_sql = f"""
+                update {POINT_TABLE} dc
+                set geoid = matched.geoid
+                from (
+                    select
+                        dc_inner.{POINT_ID_COL} as point_id,
+                        (
+                            select t.geoid
+                            from {BOUNDARY_STAGE_TABLE} t
+                            where t.geom && dc_inner.geom
+                              and st_covers(t.geom, dc_inner.geom)
+                            order by t.geoid
+                            limit 1
+                        ) as geoid
+                    from {POINT_TABLE} dc_inner
+                ) matched
+                where dc.{POINT_ID_COL} = matched.point_id
+                """
+    index_sql = f"create index if not exists master_data_centers_geoid_idx on {POINT_TABLE} (geoid)"
+    analyze_sql = f"analyze {POINT_TABLE}"
+
+    # Order matters: the column must exist before the update, and the
+    # analyze should see the populated, indexed column.
+    with conn, conn.cursor() as cur:
+        for statement in (add_column_sql, assign_sql, index_sql, analyze_sql):
+            cur.execute(statement)
+
+
+def validate(conn):
+    """Gather sanity-check figures about the freshly built tables.
+
+    Args:
+        conn: Open database connection.
+
+    Returns:
+        A 5-tuple ``(summary, total_points, point_source_breakdown,
+        unassigned_points, missing_acs)`` where ``summary`` is the row
+        ``(tract_rows, assigned_data_centers, geom_rows)`` from FINAL_TABLE,
+        ``total_points`` is the row count of POINT_TABLE,
+        ``point_source_breakdown`` is a list of ``(source, count)`` rows,
+        ``unassigned_points`` counts points whose geoid is NULL and
+        ``missing_acs`` counts final-table tracts whose population is NULL.
+    """
+    with conn.cursor() as cur:
+
+        def query(sql):
+            # Run one statement and hand the cursor back for fetching.
+            cur.execute(sql)
+            return cur
+
+        summary = query(
+            f"""
+            select
+                count(*)::integer as tract_rows,
+                coalesce(sum(data_center_count), 0)::integer as assigned_data_centers,
+                count(*) filter (where geom is not null)::integer as geom_rows
+            from {FINAL_TABLE}
+            """
+        ).fetchone()
+        total_points = query(f"select count(*)::integer from {POINT_TABLE}").fetchone()[0]
+        point_source_breakdown = query(
+            f"""
+            select source, count(*)::integer
+            from {POINT_TABLE}
+            group by source
+            order by source
+            """
+        ).fetchall()
+        unassigned_points = query(
+            f"""
+            select count(*)::integer
+            from {POINT_TABLE}
+            where geoid is null
+            """
+        ).fetchone()[0]
+        missing_acs = query(
+            f"""
+            select count(*)::integer
+            from {FINAL_TABLE}
+            where population is null
+            """
+        ).fetchone()[0]
+    return summary, total_points, point_source_breakdown, unassigned_points, missing_acs
+
+
+def main():
+    """Command-line entry point: build the tract enrichment table end to end.
+
+    Sequence: check (or, with ``--replace-final``, drop) the final table,
+    download and import tract boundaries, fetch ACS rows, then on a second
+    connection repeat the final-table check, load the ACS stage, create the
+    final table, assign geoids to points and print validation figures.
+    """
+    parser = argparse.ArgumentParser(
+        description="Build census-tract enrichment table for data-center points."
+    )
+    parser.add_argument(
+        "--replace-final",
+        action="store_true",
+        help="Drop and rebuild the final tract table if it already exists.",
+    )
+    args = parser.parse_args()
+
+    def prepare_final_table(db):
+        # Either clear the way for a rebuild or refuse to clobber existing data.
+        if args.replace_final:
+            drop_final_table_if_exists(db)
+        else:
+            ensure_final_table_absent(db)
+
+    # First connection: fail fast before the slow download/import steps.
+    conn = connect()
+    try:
+        state_fips = get_state_fips(conn)
+        prepare_final_table(conn)
+    finally:
+        conn.close()
+
+    download_tract_boundaries()
+    import_tract_boundaries(state_fips)
+    acs_rows, acs_fieldnames = fetch_acs(state_fips)
+
+    # Second connection: re-check the final table, then build everything.
+    conn = connect()
+    try:
+        prepare_final_table(conn)
+        load_acs_stage(conn, acs_rows, acs_fieldnames)
+        create_final_table(conn)
+        assign_point_geoids(conn)
+        report = validate(conn)
+    finally:
+        conn.close()
+    summary, total_points, point_source_breakdown, unassigned_points, missing_acs = report
+
+    print(f"loaded {len(acs_rows)} ACS tract rows into {ACS_STAGE_TABLE}")
+    print(f"created {FINAL_TABLE}")
+    print(
+        "tract_rows={0} assigned_data_centers={1} geom_rows={2} source_points={3}".format(
+            summary[0], summary[1], summary[2], total_points
+        )
+    )
+    source_pairs = [f"{k}:{v}" for k, v in point_source_breakdown]
+    print("point_source=" + ", ".join(source_pairs))
+    print(f"points_unassigned_to_tract={unassigned_points}")
+    print(f"tracts_missing_acs_population={missing_acs}")
+
+# Run the pipeline only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/scripts/ingest_eia_energy_layers.py
+++ b/scripts/ingest_eia_energy_layers.py
--- a/scripts/ingest_legiscan.py
+++ b/scripts/ingest_legiscan.py
@@ -0,0 +1,686 @@
+#!/usr/bin/env python3
+"""
+Ingest LegiScan legislative datasets for all US states (2016-2026) into PostgreSQL.
+
+Fetches all state session datasets from the LegiScan API, parses bill JSONs from
+each ZIP archive, and loads them into the data_centers PostgreSQL database. Bills are
+tagged with relevance categories (data_center, large_load, ratepayer_protection, etc.).
+
+Usage:
+    python ingest_legiscan.py [--all | --setup-db | --fetch | --load | --tag]
+                              [--state XX] [--year-start YYYY] [--dry-run] [--verbose]
+
+Options:
+    --all           Run all phases in sequence
+    --setup-db      Create/update database tables and indexes
+    --fetch         Download dataset ZIPs for all states (uses hash caching)
+    --load          Parse cached ZIPs and insert/update bills in DB
+    --tag           (Re)apply relevance tagging to all loaded bills
+    --state XX      Restrict to one state (e.g., CA)
+    --year-start N  Earliest session year to include (default: 2016)
+    --dry-run       Print what would be done; no API calls or DB writes
+    --verbose       Extra progress output
+
+Environment:
+    LEGISCAN_API_KEY                  Required
+    PGWEB_HOST, PGWEB_PORT,
+    PGWEB_USER, PGWEB_PASSWORD        PostgreSQL connection (DB: data_centers)
+"""
+
+import argparse
+import base64
+import io
+import json
+import logging
+import os
+import sys
+import time
+import zipfile
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import psycopg2
+import psycopg2.extras
+import requests
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+
+DB_NAME = "data_centers"
+# Read once at import time; main() aborts when this is unset.
+API_KEY = os.environ.get("LEGISCAN_API_KEY")
+API_BASE = "https://api.legiscan.com/"
+# Anchor the ZIP cache at <project root>/data/ (this script lives in scripts/)
+# instead of the current working directory, so the same cache is reused no
+# matter where the script is launched from.
+CACHE_DIR = Path(__file__).resolve().parent.parent / "data" / "legiscan_cache"
+MIN_YEAR_DEFAULT = 2016
+RATE_LIMIT_DELAY = 0.5  # seconds between API calls
+
+# Keyword categories for relevance tagging.
+# Keys become the tag values stored in legiscan_bills.relevance_tags[].
+#
+# score_relevance() lowercases a bill's title, description and subject names
+# and tests each keyword with a plain substring check, so every keyword here
+# must be lowercase, and short keywords can match inside longer words.
+# A bill receives a tag as soon as any one keyword of that category matches.
+RELEVANCE_KEYWORDS: dict[str, list[str]] = {
+    "data_center": [
+        "data center", "data centre", "hyperscale", "colocation", "colo facility",
+        "server farm", "cloud computing facility", "internet exchange",
+        "carrier hotel", "artificial intelligence facility", "ai campus",
+        "ai data center", "gpu cluster", "compute facility",
+        "high performance computing", "hpc facility", "data hall",
+        "network access point", "data warehousing facility",
+    ],
+    "large_load": [
+        "large load", "large power consumer", "large electricity consumer",
+        "high electricity consumption", "high power consumption",
+        "megawatt load", "gigawatt load", "cryptocurrency mining",
+        "bitcoin mining", "blockchain mining", "crypto mining",
+        "digital asset mining", "proof of work", "electric arc furnace",
+        "large industrial customer", "high-density load", "new large load",
+        "load growth", "extraordinary load",
+    ],
+    "ratepayer_protection": [
+        "ratepayer", "rate payer", "cost shift", "cost shifting",
+        "cost allocation", "cross-subsidy", "cross subsidy",
+        "rate design", "rate structure", "electricity rate",
+        "electric rate", "utility rate", "rate increase", "rate burden",
+        "rate base", "stranded cost", "rate class", "customer protection",
+        "consumer protection", "electric customer", "residential customer",
+        "demand charge", "transmission cost", "grid upgrade cost",
+        "interconnection cost", "cost recovery", "rate relief",
+        "affordability", "energy burden",
+    ],
+    "grid_impact": [
+        "grid reliability", "grid stability", "grid congestion",
+        "grid modernization", "grid infrastructure", "electric grid",
+        "power grid", "electricity grid", "transmission upgrade",
+        "transmission expansion", "interconnection queue",
+        "interconnection study", "demand response", "curtailment",
+        "grid capacity", "system reliability", "capacity expansion",
+        "electric system", "power system reliability", "grid resilience",
+        "grid planning", "integrated resource plan",
+    ],
+    "water_use": [
+        "water consumption", "cooling water", "water efficiency",
+        "water use effectiveness", "evaporative cooling",
+        "water withdrawal", "water discharge", "water impact",
+        "water footprint", "cooling tower", "water-cooled",
+        "once-through cooling", "recycled water", "water stress",
+        "water scarcity",
+    ],
+    "tax_incentive": [
+        "tax credit", "tax exemption", "tax abatement", "tax incentive",
+        "sales tax exemption", "property tax exemption", "tax break",
+        "tax relief", "enterprise zone", "economic incentive",
+        "business incentive", "investment credit", "job creation credit",
+        "economic development incentive", "opportunity zone",
+        "tax subsidy",
+    ],
+    "energy_policy": [
+        # " ppa " is padded with spaces so it only matches the stand-alone
+        # token; as a consequence it is missed next to punctuation or at the
+        # very start/end of the searched text.
+        "renewable energy", "clean energy", "energy efficiency",
+        "power purchase agreement", " ppa ", "green tariff",
+        "clean power", "carbon neutral", "net zero", "decarbonization",
+        "energy procurement", "24/7 clean energy", "carbon-free",
+        "clean electricity", "energy storage", "virtual power plant",
+        "net metering", "green power",
+    ],
+    "siting_permitting": [
+        "conditional use permit", "special use permit", "land use permit",
+        "zoning", "facility siting", "environmental review",
+        "environmental impact", "noise ordinance", "setback requirement",
+        "building permit", "construction permit", "site approval",
+        "local approval", "permit requirement", "permitting process",
+        "local control", "preemption",
+    ],
+}
+
+# Human-readable names for LegiScan's numeric bill status codes.
+STATUS_LABELS = {
+    0: "N/A",
+    1: "Introduced",
+    2: "Engrossed",
+    3: "Enrolled",
+    4: "Passed",
+    5: "Vetoed",
+    6: "Failed",
+    7: "Override",
+    8: "Chaptered",
+    9: "Referred",
+    10: "Report Pass",
+    11: "Report DNP",
+    12: "Draft",
+}
+
+# ---------------------------------------------------------------------------
+# Logging
+# ---------------------------------------------------------------------------
+
+# Configure the root logger at import time: INFO level, short HH:MM:SS
+# timestamps. main() raises this module's logger to DEBUG under --verbose.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s  %(message)s",
+    datefmt="%H:%M:%S",
+)
+# Module-level logger used by every function in this script.
+log = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Database
+# ---------------------------------------------------------------------------
+
+def get_db_connection():
+    """Open a new psycopg2 connection to the DB_NAME database.
+
+    Host, port, user and password come from the PGWEB_HOST, PGWEB_PORT,
+    PGWEB_USER and PGWEB_PASSWORD environment variables.
+
+    Raises:
+        KeyError: If any of the four environment variables is unset.
+    """
+    env = os.environ
+    settings = {
+        "host": env["PGWEB_HOST"],
+        "port": env["PGWEB_PORT"],
+        "user": env["PGWEB_USER"],
+        "password": env["PGWEB_PASSWORD"],
+        "dbname": DB_NAME,
+    }
+    return psycopg2.connect(**settings)
+
+
+# Schema for the LegiScan tables, executed as one script by setup_db().
+#   legiscan_sessions - one row per session dataset (hash, date, size, counts)
+#   legiscan_bills    - one row per bill, with relevance flag and tags
+# Every statement uses IF NOT EXISTS, so running it again is harmless.
+# NOTE(review): IF NOT EXISTS skips a table that already exists, so columns
+# added to this DDL later will not appear in an existing database.
+DDL = """
+CREATE TABLE IF NOT EXISTS legiscan_sessions (
+    session_id      INTEGER PRIMARY KEY,
+    state_id        INTEGER NOT NULL,
+    state_abbr      VARCHAR(2) NOT NULL,
+    year_start      INTEGER NOT NULL,
+    year_end        INTEGER NOT NULL,
+    session_title   TEXT,
+    session_tag     TEXT,
+    is_special      BOOLEAN DEFAULT FALSE,
+    is_prior        BOOLEAN DEFAULT FALSE,
+    dataset_hash    VARCHAR(32),
+    dataset_date    DATE,
+    dataset_size_mb FLOAT,
+    bill_count      INTEGER DEFAULT 0,
+    imported_at     TIMESTAMPTZ
+);
+
+CREATE TABLE IF NOT EXISTS legiscan_bills (
+    bill_id         INTEGER PRIMARY KEY,
+    session_id      INTEGER REFERENCES legiscan_sessions(session_id),
+    state           VARCHAR(2) NOT NULL,
+    bill_number     VARCHAR(50),
+    bill_type       VARCHAR(10),
+    title           TEXT,
+    description     TEXT,
+    status          INTEGER,
+    status_date     DATE,
+    completed       INTEGER DEFAULT 0,
+    body            VARCHAR(10),
+    url             TEXT,
+    state_link      TEXT,
+    change_hash     VARCHAR(32),
+    subjects        TEXT[],
+    sponsor_count   INTEGER DEFAULT 0,
+    vote_count      INTEGER DEFAULT 0,
+    text_count      INTEGER DEFAULT 0,
+    is_relevant     BOOLEAN DEFAULT FALSE,
+    relevance_tags  TEXT[],
+    imported_at     TIMESTAMPTZ DEFAULT NOW()
+);
+
+CREATE INDEX IF NOT EXISTS idx_ls_bills_state        ON legiscan_bills(state);
+CREATE INDEX IF NOT EXISTS idx_ls_bills_session      ON legiscan_bills(session_id);
+CREATE INDEX IF NOT EXISTS idx_ls_bills_status       ON legiscan_bills(status);
+CREATE INDEX IF NOT EXISTS idx_ls_bills_relevant     ON legiscan_bills(is_relevant) WHERE is_relevant;
+CREATE INDEX IF NOT EXISTS idx_ls_bills_subjects     ON legiscan_bills USING gin(subjects);
+CREATE INDEX IF NOT EXISTS idx_ls_bills_rtags        ON legiscan_bills USING gin(relevance_tags);
+CREATE INDEX IF NOT EXISTS idx_ls_bills_fts ON legiscan_bills
+    USING gin(to_tsvector('english',
+        COALESCE(title, '') || ' ' || COALESCE(description, '')));
+"""
+
+
+def setup_db(conn):
+    """Execute the DDL script on ``conn`` and commit it.
+
+    Args:
+        conn: Open database connection.
+    """
+    with conn.cursor() as cursor:
+        cursor.execute(DDL)
+    conn.commit()
+    log.info("Database tables and indexes ready.")
+
+
+# ---------------------------------------------------------------------------
+# LegiScan API helpers
+# ---------------------------------------------------------------------------
+
+def _api_get(params: dict, timeout: int = 120) -> dict:
+    """Make one LegiScan API call and return the parsed JSON.
+
+    Args:
+        params: Query parameters for the operation (for example ``op``,
+            ``id``). The API key is added automatically; the caller's dict
+            is left untouched.
+        timeout: Request timeout in seconds.
+
+    Returns:
+        The decoded JSON response body.
+
+    Raises:
+        requests.HTTPError: On a non-2xx HTTP status.
+        RuntimeError: When the response's ``status`` field is not ``"OK"``.
+    """
+    # Copy instead of mutating: callers may reuse or log their params dict,
+    # and it should never end up carrying the secret key.
+    query = {**params, "key": API_KEY}
+    resp = requests.get(API_BASE, params=query, timeout=timeout)
+    resp.raise_for_status()
+    data = resp.json()
+    if data.get("status") != "OK":
+        raise RuntimeError(f"LegiScan API error: {data}")
+    return data
+
+
+def get_all_dataset_metadata(year_start: int, state_filter: Optional[str] = None) -> list[dict]:
+    """Fetch the dataset list with a single API call and filter it by year.
+
+    Args:
+        year_start: Keep only sessions whose ``year_start`` is at least this.
+        state_filter: Optional state abbreviation. When given it is passed
+            to the API as the ``state`` parameter, so the server returns
+            only that state's sessions.
+
+    Returns:
+        The matching entries of the response's ``datasetlist``.
+    """
+    log.info("Fetching dataset list from LegiScan…")
+    params = {"op": "getDatasetList"}
+    if state_filter:
+        # Let the API do the state filtering. This avoids downloading the
+        # full list and then spending a second request just to intersect it
+        # with the per-state list.
+        log.info(f"  Filtering to state {state_filter}…")
+        params["state"] = state_filter
+    sessions = _api_get(params)["datasetlist"]
+    log.info(f"  Total sessions returned: {len(sessions)}")
+    sessions = [s for s in sessions if s["year_start"] >= year_start]
+    log.info(f"  Sessions matching filters: {len(sessions)}")
+    return sessions
+
+
+def download_dataset_zip(session: dict, dry_run: bool = False) -> tuple[Optional[bytes], bool]:
+    """Return a session's dataset ZIP, downloading and caching it if needed.
+
+    The cache file name combines the session id and the dataset hash, so a
+    changed hash results in a fresh download.
+
+    Args:
+        session: Dataset metadata; reads ``session_id``, ``dataset_hash``,
+            ``access_key`` and (on a cache miss) ``dataset_size``.
+        dry_run: On a cache miss, log what would be downloaded and return
+            without calling the API or touching the file system.
+
+    Returns:
+        ``(zip_bytes, api_call_made)``. ``zip_bytes`` is ``None`` only for a
+        dry-run cache miss. ``api_call_made`` is True only when the network
+        was actually hit, so the caller can rate-limit appropriately.
+    """
+    session_id = session["session_id"]
+    dataset_hash = session["dataset_hash"]
+    access_key = session["access_key"]
+
+    cache_path = CACHE_DIR / f"{session_id}_{dataset_hash}.zip"
+
+    # exists() is simply False when the cache directory is missing, so the
+    # directory is only created right before something is written to it.
+    if cache_path.exists():
+        log.debug(f"  Cache hit: {cache_path.name}")
+        return cache_path.read_bytes(), False
+
+    size_mb = session["dataset_size"] / 1e6
+    if dry_run:
+        log.info(f"  [dry-run] Would download session {session_id} ({size_mb:.1f} MB)")
+        return None, False
+
+    log.info(f"  Downloading session {session_id} ({size_mb:.1f} MB)…")
+    data = _api_get({"op": "getDataset", "access_key": access_key, "id": session_id})
+    zip_bytes = base64.b64decode(data["dataset"]["zip"])
+
+    # Write to a sibling ".part" file and rename it into place, so an
+    # interrupted write can never leave a truncated ZIP that later runs
+    # would mistake for a valid cache hit.
+    CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    partial_path = cache_path.with_name(cache_path.name + ".part")
+    partial_path.write_bytes(zip_bytes)
+    partial_path.replace(cache_path)
+    log.info(f"  Cached → {cache_path.name}")
+    return zip_bytes, True
+
+
+# ---------------------------------------------------------------------------
+# Relevance tagging
+# ---------------------------------------------------------------------------
+
+def score_relevance(title: str, description: str, subjects: list[str]) -> tuple[bool, list[str]]:
+    """Match a bill's text against RELEVANCE_KEYWORDS.
+
+    The title, description and subject names are lowercased and joined with
+    single spaces; a category matches when any of its keywords occurs as a
+    substring of that text. ``None`` or empty title/description count as "".
+
+    Returns:
+        ``(is_relevant, tags)`` where ``tags`` lists the matched categories
+        in RELEVANCE_KEYWORDS order and ``is_relevant`` is True when the
+        list is non-empty.
+    """
+    title_text = (title or "").lower()
+    description_text = (description or "").lower()
+    subject_text = " ".join(subject.lower() for subject in subjects)
+    haystack = " ".join((title_text, description_text, subject_text))
+
+    matched = [
+        tag
+        for tag, keywords in RELEVANCE_KEYWORDS.items()
+        if any(keyword in haystack for keyword in keywords)
+    ]
+    return bool(matched), matched
+
+
+# ---------------------------------------------------------------------------
+# ZIP processing and DB loading
+# ---------------------------------------------------------------------------
+
+def _state_abbr_from_zip(zf: zipfile.ZipFile) -> str:
+    """Guess the state abbreviation from the archive's member paths.
+
+    Returns the first path component of the first member whose leading
+    component is exactly two characters long, or "??" when there is none.
+    """
+    leading_components = (member.split("/", 1)[0] for member in zf.namelist())
+    return next(
+        (component for component in leading_components if len(component) == 2),
+        "??",
+    )
+
+
+def process_dataset(
+    session: dict,
+    zip_bytes: bytes,
+    conn,
+    state_abbr: Optional[str] = None,
+    dry_run: bool = False,
+    verbose: bool = False,
+) -> int:
+    """Parse all bill JSONs from a ZIP and upsert them into legiscan_bills.
+
+    Args:
+        session: Dataset metadata; only ``session_id`` is read here.
+        zip_bytes: Raw bytes of the dataset ZIP archive.
+        conn: Open database connection; committed once after all writes.
+        state_abbr: Fallback state code for bills without a ``state`` field.
+            Derived from the archive's top-level folder when omitted.
+        dry_run: Parse and score only; perform no database writes.
+        verbose: Log a per-session summary line after loading.
+
+    Returns:
+        The number of bills parsed from the archive. This is not the number
+        of rows written: bills whose ``change_hash`` is unchanged are parsed
+        but left untouched in the database.
+    """
+    session_id = session["session_id"]
+
+    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
+        if not state_abbr:
+            state_abbr = _state_abbr_from_zip(zf)
+        bill_files = [n for n in zf.namelist() if "/bill/" in n and n.endswith(".json")]
+
+        if not bill_files:
+            log.warning(f"  Session {session_id}: no bill files found in ZIP.")
+            return 0
+
+        rows = []
+        for fname in bill_files:
+            # Best effort: one unreadable file must not abort the session.
+            try:
+                raw = json.loads(zf.read(fname))
+                b = raw.get("bill", raw)
+            except Exception as e:
+                log.warning(f"  Could not parse {fname}: {e}")
+                continue
+
+            subjects = [s["subject_name"] for s in (b.get("subjects") or []) if s.get("subject_name")]
+            is_rel, tags = score_relevance(
+                b.get("title", ""),
+                b.get("description", ""),
+                subjects,
+            )
+
+            # Empty string dates become NULL rather than an invalid DATE.
+            status_date = b.get("status_date") or None
+            rows.append((
+                b["bill_id"],
+                session_id,
+                b.get("state", state_abbr),
+                b.get("bill_number"),
+                b.get("bill_type"),
+                b.get("title"),
+                b.get("description"),
+                b.get("status"),
+                status_date,
+                b.get("completed", 0),
+                b.get("body"),
+                b.get("url"),
+                b.get("state_link"),
+                b.get("change_hash"),
+                subjects or None,
+                len(b.get("sponsors") or []),
+                len(b.get("votes") or []),
+                len(b.get("texts") or []),
+                is_rel,
+                tags or None,
+            ))
+
+    if dry_run:
+        log.info(f"  [dry-run] Session {session_id} ({state_abbr}): would insert/update {len(rows)} bills")
+        return len(rows)
+
+    # On conflict the descriptive columns are refreshed together with the
+    # relevance columns: the tags are computed from title/description, so
+    # updating one without the other would leave stored text and tags out of
+    # sync (and a later retag would score the stale text).
+    # RETURNING lets us count the rows that were really inserted or updated;
+    # rows skipped by the change_hash guard are not returned.
+    UPSERT = """
+        INSERT INTO legiscan_bills (
+            bill_id, session_id, state, bill_number, bill_type,
+            title, description, status, status_date, completed,
+            body, url, state_link, change_hash, subjects,
+            sponsor_count, vote_count, text_count,
+            is_relevant, relevance_tags, imported_at
+        ) VALUES %s
+        ON CONFLICT (bill_id) DO UPDATE SET
+            bill_number    = EXCLUDED.bill_number,
+            bill_type      = EXCLUDED.bill_type,
+            title          = EXCLUDED.title,
+            description    = EXCLUDED.description,
+            body           = EXCLUDED.body,
+            url            = EXCLUDED.url,
+            state_link     = EXCLUDED.state_link,
+            change_hash    = EXCLUDED.change_hash,
+            status         = EXCLUDED.status,
+            status_date    = EXCLUDED.status_date,
+            completed      = EXCLUDED.completed,
+            subjects       = EXCLUDED.subjects,
+            sponsor_count  = EXCLUDED.sponsor_count,
+            vote_count     = EXCLUDED.vote_count,
+            text_count     = EXCLUDED.text_count,
+            is_relevant    = EXCLUDED.is_relevant,
+            relevance_tags = EXCLUDED.relevance_tags,
+            imported_at    = NOW()
+        WHERE legiscan_bills.change_hash IS DISTINCT FROM EXCLUDED.change_hash
+        RETURNING bill_id
+    """
+
+    # 20 placeholders for the row tuple plus NOW() for imported_at.
+    template = "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,NOW())"
+
+    with conn.cursor() as cur:
+        # execute_values runs one statement per page, so cur.rowcount would
+        # only describe the last page; collect the RETURNING rows instead.
+        written = psycopg2.extras.execute_values(
+            cur, UPSERT, rows, template=template, page_size=500, fetch=True
+        )
+        count = len(written)
+
+    # Update session bill_count
+    with conn.cursor() as cur:
+        cur.execute(
+            "UPDATE legiscan_sessions SET bill_count = %s, imported_at = NOW() WHERE session_id = %s",
+            (len(rows), session_id),
+        )
+    conn.commit()
+
+    if verbose:
+        relevant = sum(1 for r in rows if r[18])  # index 18 is is_relevant
+        log.info(f"  Session {session_id} ({state_abbr}): {len(rows)} bills, {relevant} relevant, {count} upserted")
+    return len(rows)
+
+
+def upsert_session(session: dict, state_abbr: str, conn, dry_run: bool = False):
+    """Insert a session row, or refresh its dataset fields if it exists.
+
+    On conflict only ``dataset_hash``, ``dataset_date``, ``dataset_size_mb``
+    and ``session_title`` are overwritten. The change is committed
+    immediately. Does nothing when ``dry_run`` is true.
+
+    Args:
+        session: Dataset metadata dict from the LegiScan dataset list.
+        state_abbr: Two-letter state code stored with the session.
+        conn: Open database connection.
+        dry_run: Skip the write entirely.
+    """
+    if dry_run:
+        return
+
+    size_mb = session.get("dataset_size", 0) / 1e6  # bytes -> MB
+    values = (
+        session["session_id"],
+        session["state_id"],
+        state_abbr,
+        session["year_start"],
+        session["year_end"],
+        session.get("session_title"),
+        session.get("session_tag"),
+        bool(session.get("special")),
+        bool(session.get("prior")),
+        session.get("dataset_hash"),
+        session.get("dataset_date"),
+        size_mb,
+    )
+    with conn.cursor() as cur:
+        cur.execute("""
+            INSERT INTO legiscan_sessions
+                (session_id, state_id, state_abbr, year_start, year_end,
+                 session_title, session_tag, is_special, is_prior,
+                 dataset_hash, dataset_date, dataset_size_mb)
+            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
+            ON CONFLICT (session_id) DO UPDATE SET
+                dataset_hash   = EXCLUDED.dataset_hash,
+                dataset_date   = EXCLUDED.dataset_date,
+                dataset_size_mb = EXCLUDED.dataset_size_mb,
+                session_title  = EXCLUDED.session_title
+        """, values)
+    conn.commit()
+
+
+def needs_import(session: dict, conn) -> bool:
+    """Tell whether a session's dataset still has to be (re)imported.
+
+    Returns True when the session has no row in legiscan_sessions or when
+    the stored ``dataset_hash`` (including a NULL one) differs from
+    ``session["dataset_hash"]``; False when the hashes are equal.
+    """
+    lookup_sql = "SELECT dataset_hash FROM legiscan_sessions WHERE session_id = %s"
+    with conn.cursor() as cur:
+        cur.execute(lookup_sql, (session["session_id"],))
+        stored = cur.fetchone()
+    return stored is None or stored[0] != session["dataset_hash"]
+
+
+# ---------------------------------------------------------------------------
+# Retag phase
+# ---------------------------------------------------------------------------
+
+def retag_all_bills(conn, dry_run: bool = False, verbose: bool = False):
+    """Recompute is_relevant and relevance_tags for every stored bill.
+
+    All bills are read into memory, scored with score_relevance() and
+    written back with one batched UPDATE that is then committed. With
+    ``dry_run`` only the would-be result is logged.
+
+    Args:
+        conn: Open database connection.
+        dry_run: Score and log, but do not write.
+        verbose: Accepted for symmetry with the other phases; not used here.
+    """
+    log.info("Re-tagging all bills…")
+    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+        cur.execute("SELECT bill_id, title, description, subjects FROM legiscan_bills")
+        bills = cur.fetchall()
+
+    log.info(f"  Scoring {len(bills)} bills…")
+
+    def as_update(bill):
+        # One (is_relevant, tags-or-NULL, bill_id) tuple per bill.
+        is_rel, tags = score_relevance(
+            bill["title"] or "",
+            bill["description"] or "",
+            bill["subjects"] or [],
+        )
+        return (is_rel, tags or None, bill["bill_id"])
+
+    updates = [as_update(bill) for bill in bills]
+    relevant = sum(1 for flagged, _tags, _bill_id in updates if flagged)
+
+    if dry_run:
+        log.info(f"  [dry-run] Would tag {relevant}/{len(updates)} bills as relevant")
+        return
+
+    with conn.cursor() as cur:
+        psycopg2.extras.execute_values(
+            cur,
+            "UPDATE legiscan_bills SET is_relevant = data.is_rel, relevance_tags = data.tags "
+            "FROM (VALUES %s) AS data(is_rel, tags, bill_id) "
+            "WHERE legiscan_bills.bill_id = data.bill_id::integer",
+            updates,
+            template="(%s, %s::text[], %s)",
+        )
+    conn.commit()
+
+    log.info(f"  Tagged {relevant}/{len(updates)} bills as relevant.")
+
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+
+def print_summary(conn):
+    """Print ingestion totals, per-tag counts and the top 15 states to stdout.
+
+    Args:
+        conn: Open database connection; only SELECTs are issued.
+    """
+    headline_counts = (
+        ("Total sessions", "SELECT COUNT(*) FROM legiscan_sessions"),
+        ("Total bills", "SELECT COUNT(*) FROM legiscan_bills"),
+        ("Relevant bills", "SELECT COUNT(*) FROM legiscan_bills WHERE is_relevant"),
+        ("States covered", "SELECT COUNT(DISTINCT state) FROM legiscan_bills"),
+    )
+    tag_sql = """
+            SELECT tag, COUNT(*) AS n
+            FROM legiscan_bills, unnest(relevance_tags) AS tag
+            GROUP BY tag ORDER BY n DESC
+        """
+    state_sql = """
+            SELECT state, COUNT(*) AS n
+            FROM legiscan_bills WHERE is_relevant
+            GROUP BY state ORDER BY n DESC LIMIT 15
+        """
+
+    def fetch_rows(sql):
+        # Each listing uses its own short-lived cursor.
+        with conn.cursor() as cur:
+            cur.execute(sql)
+            return cur.fetchall()
+
+    print("\n--- LegiScan ingestion summary ---")
+    with conn.cursor() as cur:
+        for label, sql in headline_counts:
+            cur.execute(sql)
+            total = cur.fetchone()[0]
+            print(f"  {label}: {total:,}")
+
+    # Top relevance tags
+    tag_rows = fetch_rows(tag_sql)
+    if tag_rows:
+        print("\n  Relevant bills by tag:")
+        for tag, n in tag_rows:
+            print(f"    {tag:<30} {n:>6,}")
+
+    # Top states for relevant bills
+    state_rows = fetch_rows(state_sql)
+    if state_rows:
+        print("\n  Top states by relevant bill count:")
+        for state, n in state_rows:
+            print(f"    {state}  {n:>5,}")
+    print()
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def parse_args():
+    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--all",       action="store_true", help="Run setup-db + fetch + load + tag")
+    p.add_argument("--setup-db",  action="store_true", help="Create/update DB tables")
+    p.add_argument("--fetch",     action="store_true", help="Download dataset ZIPs")
+    p.add_argument("--load",      action="store_true", help="Load cached ZIPs into DB")
+    p.add_argument("--tag",       action="store_true", help="Retag all bills for relevance")
+    p.add_argument("--state",     default=None, metavar="XX", help="Limit to one state")
+    p.add_argument("--year-start", type=int, default=MIN_YEAR_DEFAULT, dest="year_start")
+    p.add_argument("--dry-run",   action="store_true")
+    p.add_argument("--verbose",   action="store_true")
+    return p.parse_args()
+
+
+def main():
+    """Run the requested ingestion phases (setup, fetch, load, tag) and print a summary.
+
+    Exits with status 1 when ``API_KEY`` is falsy or when no phase flag was
+    given. With ``--dry-run`` no database connection is opened (``conn`` stays
+    ``None``) and the closing summary is skipped.
+    """
+    args = parse_args()
+    if args.verbose:
+        log.setLevel(logging.DEBUG)
+
+    if not API_KEY:
+        log.error("LEGISCAN_API_KEY is not set.")
+        sys.exit(1)
+
+    # --all switches on every phase; otherwise each flag enables only its own.
+    do_setup = args.all or args.setup_db
+    do_fetch = args.all or args.fetch
+    do_load  = args.all or args.load
+    do_tag   = args.all or args.tag
+
+    if not any([do_setup, do_fetch, do_load, do_tag]):
+        log.error("Specify at least one phase: --all, --setup-db, --fetch, --load, --tag")
+        sys.exit(1)
+
+    # Dry runs never open a database connection.
+    conn = None if args.dry_run else get_db_connection()
+
+    # ── Setup ──────────────────────────────────────────────────────────────
+    if do_setup:
+        if args.dry_run:
+            log.info("[dry-run] Would create legiscan_sessions and legiscan_bills tables.")
+        else:
+            setup_db(conn)
+
+    # ── Fetch + Load (interleaved per session for memory efficiency) ────────
+    if do_fetch or do_load:
+        sessions = get_all_dataset_metadata(args.year_start, state_filter=args.state)
+        total = len(sessions)
+        log.info(f"Processing {total} sessions (year_start ≥ {args.year_start})…")
+
+        total_bills = 0
+        skipped = 0
+
+        for i, session in enumerate(sessions, 1):
+            session_id = session["session_id"]
+            state_id   = session["state_id"]
+            year_start = session["year_start"]
+            title      = session.get("session_title", "")
+
+            # Check if import needed
+            if do_load and not args.dry_run and conn and not needs_import(session, conn):
+                log.debug(f"  [{i}/{total}] Session {session_id} ({title}) — hash unchanged, skipping.")
+                skipped += 1
+                continue
+
+            log.info(f"[{i}/{total}] Session {session_id}: {title}")
+
+            # Download
+            zip_bytes = None
+            if do_fetch:
+                try:
+                    zip_bytes, api_called = download_dataset_zip(session, dry_run=args.dry_run)
+                    # Pause only when download_dataset_zip reports that it hit the API.
+                    if api_called:
+                        time.sleep(RATE_LIMIT_DELAY)
+                except Exception as e:
+                    # A failed download skips this session; the run continues.
+                    log.error(f"  Download failed for session {session_id}: {e}")
+                    continue
+            elif do_load:
+                # Load from cache only
+                cache_path = CACHE_DIR / f"{session_id}_{session['dataset_hash']}.zip"
+                if not cache_path.exists():
+                    log.warning(f"  Cache miss for session {session_id} — run --fetch first.")
+                    continue
+                zip_bytes = cache_path.read_bytes()
+
+            # Derive state abbreviation from ZIP structure
+            # (only when --state was not given; an unreadable ZIP yields "??").
+            state_abbr = args.state
+            if zip_bytes and not state_abbr:
+                try:
+                    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
+                        state_abbr = _state_abbr_from_zip(zf)
+                except Exception:
+                    state_abbr = "??"
+
+            # Upsert session record
+            if do_load and not args.dry_run and conn and state_abbr:
+                upsert_session(session, state_abbr, conn, dry_run=args.dry_run)
+
+            # Load bills
+            if do_load and zip_bytes:
+                try:
+                    n = process_dataset(
+                        session, zip_bytes, conn,
+                        state_abbr=state_abbr,
+                        dry_run=args.dry_run,
+                        verbose=args.verbose,
+                    )
+                    total_bills += n
+                except Exception as e:
+                    # Log the failure, roll back the open transaction and move
+                    # on to the next session.
+                    log.error(f"  Load failed for session {session_id}: {e}")
+                    if conn:
+                        conn.rollback()
+
+        log.info(f"Fetch/load complete. Bills processed: {total_bills:,}. Skipped (up-to-date): {skipped}.")
+
+    # ── Tag ────────────────────────────────────────────────────────────────
+    # Standalone retag only: skipped whenever fetch or load ran.
+    # NOTE(review): presumably process_dataset tags bills as it loads them — confirm.
+    if do_tag and not (do_fetch or do_load):
+        if args.dry_run or conn:
+            retag_all_bills(conn, dry_run=args.dry_run, verbose=args.verbose)
+
+    # ── Summary ────────────────────────────────────────────────────────────
+    if conn and not args.dry_run:
+        print_summary(conn)
+        conn.close()
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/scripts/load_postgis_data_centers.py
+++ b/scripts/load_postgis_data_centers.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import os
+from decimal import Decimal
+
+import psycopg2
+from psycopg2.extras import execute_values
+
+
+# Default input files for the "geocoded" and "im3" sources.
+# NOTE(review): these are bare relative paths, so they resolve against the
+# current working directory — confirm they still point at the CSVs now that
+# this script lives under scripts/.
+CSV_PATH = "US_DC_Sample_geocoded.csv"
+IM3_CSV_PATH = "new/IM3_Existing_DataCenters.csv"
+# Schema-qualified target table and the database it lives in.
+TABLE = "public.us_dc_sample_geocoded"
+DB_NAME = "data_centers"
+
+# Target-table columns in insert order; row tuples are built in this order.
+ALL_COLS = [
+    "id",
+    "provider",
+    "facility_name",
+    "url",
+    "provider_url",
+    "country",
+    "state",
+    "state_code",
+    "city",
+    "postal_code",
+    "street_address",
+    "address",
+    "source_address",
+    "phone",
+    "area_sqft",
+    "power_mw",
+    "nearest_airport_miles",
+    "has_bare_metal",
+    "has_iaas",
+    "has_internet_exchange",
+    "has_colocation",
+    "certifications",
+    "content_summary",
+    "path",
+    "longitude",
+    "latitude",
+    "geocode_source",
+    "geocode_precision",
+    "geocode_status",
+    "geocode_match_address",
+    "census_status",
+    "census_match_type",
+    "census_input_address",
+    "census_tiger_line_id",
+    "census_side",
+    "nominatim_display_name",
+    "nominatim_osm_type",
+    "nominatim_osm_id",
+]
+
+# Columns that convert() casts with to_int, to_decimal and to_bool
+# respectively; every other column is kept as text.
+INT_COLS = {"area_sqft", "census_tiger_line_id", "nominatim_osm_id"}
+NUM_COLS = {"power_mw", "nearest_airport_miles", "longitude", "latitude"}
+BOOL_COLS = {
+    "has_bare_metal",
+    "has_iaas",
+    "has_internet_exchange",
+    "has_colocation",
+}
+
+
+def to_int(value):
+    if value in (None, ""):
+        return None
+    return int(Decimal(value))
+
+
+def to_decimal(value):
+    return Decimal(value) if value not in (None, "") else None
+
+
+def to_bool(value):
+    return bool(int(value)) if value not in (None, "") else None
+
+
+def convert(row, column):
+    value = row.get(column)
+    if column in INT_COLS:
+        return to_int(value)
+    if column in NUM_COLS:
+        return to_decimal(value)
+    if column in BOOL_COLS:
+        return to_bool(value)
+    return None if value == "" else value
+
+
+def normalize_geocoded_row(row):
+    return {column: row.get(column, "") for column in ALL_COLS}
+
+
+def normalize_im3_row(row):
+    return {
+        "id": row.get("id", ""),
+        "provider": row.get("operator", ""),
+        "facility_name": row.get("name", ""),
+        "url": "",
+        "provider_url": "",
+        "country": "United States",
+        "state": row.get("state", ""),
+        "state_code": row.get("state_abb", ""),
+        "city": "",
+        "postal_code": "",
+        "street_address": "",
+        "address": "",
+        "source_address": "",
+        "phone": "",
+        "area_sqft": row.get("sqft", ""),
+        "power_mw": "",
+        "nearest_airport_miles": "",
+        "has_bare_metal": "",
+        "has_iaas": "",
+        "has_internet_exchange": "",
+        "has_colocation": "",
+        "certifications": "",
+        "content_summary": "",
+        "path": "IM3_Existing_DataCenters.csv",
+        "longitude": row.get("lon", ""),
+        "latitude": row.get("lat", ""),
+        "geocode_source": "IM3_Existing_DataCenters",
+        "geocode_precision": row.get("type", "") or "im3",
+        "geocode_status": "im3_imported",
+        "geocode_match_address": "",
+        "census_status": "",
+        "census_match_type": "",
+        "census_input_address": "",
+        "census_tiger_line_id": "",
+        "census_side": "",
+        "nominatim_display_name": "",
+        "nominatim_osm_type": "",
+        "nominatim_osm_id": "",
+    }
+
+
+def read_and_normalize_rows(csv_path, source):
+    with open(csv_path, newline="", encoding="utf-8") as csv_file:
+        rows = list(csv.DictReader(csv_file))
+
+    if source == "im3":
+        normalized = [normalize_im3_row(row) for row in rows]
+    else:
+        normalized = [normalize_geocoded_row(row) for row in rows]
+
+    deduped = {}
+    for row in normalized:
+        row_id = (row.get("id") or "").strip()
+        if not row_id:
+            continue
+        deduped[row_id] = row
+
+    values = [tuple(convert(row, column) for column in ALL_COLS) for row in deduped.values()]
+    return rows, values
+
+
+def create_table(cur):
+    cur.execute(
+        f"""
+        create table {TABLE} (
+            id text primary key,
+            provider text,
+            facility_name text,
+            url text,
+            provider_url text,
+            country text,
+            state text,
+            state_code text,
+            city text,
+            postal_code text,
+            street_address text,
+            address text,
+            source_address text,
+            phone text,
+            area_sqft integer,
+            power_mw numeric,
+            nearest_airport_miles numeric,
+            has_bare_metal boolean,
+            has_iaas boolean,
+            has_internet_exchange boolean,
+            has_colocation boolean,
+            certifications text,
+            content_summary text,
+            path text,
+            longitude double precision not null,
+            latitude double precision not null,
+            geocode_source text,
+            geocode_precision text,
+            geocode_status text,
+            geocode_match_address text,
+            census_status text,
+            census_match_type text,
+            census_input_address text,
+            census_tiger_line_id bigint,
+            census_side text,
+            nominatim_display_name text,
+            nominatim_osm_type text,
+            nominatim_osm_id bigint,
+            geom geometry(Point, 4326) generated always as
+                (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
+        )
+        """
+    )
+
+
+def insert_values(cur, values, upsert):
+    insert_sql = f"insert into {TABLE} ({', '.join(ALL_COLS)}) values %s"
+    if upsert:
+        update_cols = [col for col in ALL_COLS if col != "id"]
+        assignments = ", ".join(f"{col} = excluded.{col}" for col in update_cols)
+        insert_sql += f" on conflict (id) do update set {assignments}"
+    execute_values(cur, insert_sql, values, page_size=100)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Load data-center CSV data into public.us_dc_sample_geocoded."
+    )
+    parser.add_argument(
+        "--source",
+        choices=["geocoded", "im3"],
+        default="geocoded",
+        help="Input schema type. Use 'im3' for new/IM3_Existing_DataCenters.csv.",
+    )
+    parser.add_argument(
+        "--csv-path",
+        help="Override input CSV path. If omitted, uses a source-specific default.",
+    )
+    parser.add_argument(
+        "--append",
+        action="store_true",
+        help="Append/upsert into an existing target table instead of creating a new one.",
+    )
+    parser.add_argument(
+        "--upsert",
+        action="store_true",
+        help="On id conflicts, update the existing row. Recommended with --append.",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    default_csv = IM3_CSV_PATH if args.source == "im3" else CSV_PATH
+    csv_path = args.csv_path or default_csv
+    rows, values = read_and_normalize_rows(csv_path, args.source)
+
+    conn = psycopg2.connect(
+        host=os.environ["PGWEB_HOST"],
+        port=os.environ["PGWEB_PORT"],
+        user=os.environ["PGWEB_USER"],
+        password=os.environ["PGWEB_PASSWORD"],
+        dbname=DB_NAME,
+    )
+
+    try:
+        with conn:
+            with conn.cursor() as cur:
+                cur.execute("create extension if not exists postgis")
+                cur.execute("select to_regclass(%s)", (TABLE,))
+                table_exists = cur.fetchone()[0] is not None
+
+                if not table_exists:
+                    create_table(cur)
+                    cur.execute(
+                        f"create index us_dc_sample_geocoded_geom_gix on {TABLE} using gist (geom)"
+                    )
+                    cur.execute(
+                        f"create index us_dc_sample_geocoded_state_city_idx on {TABLE} (state_code, city)"
+                    )
+                elif not args.append:
+                    raise RuntimeError(
+                        f"Target table {TABLE} already exists; use --append to add data."
+                    )
+
+                insert_values(cur, values, upsert=args.upsert)
+                cur.execute(f"analyze {TABLE}")
+    finally:
+        conn.close()
+
+    source_label = "IM3-adapted" if args.source == "im3" else "geocoded"
+    mode = "append" if args.append else "create"
+    conflict_mode = "upsert" if args.upsert else "insert"
+    print(
+        f"loaded {len(values)} {source_label} rows into {TABLE} "
+        f"(mode={mode}, conflict={conflict_mode}, csv={csv_path})"
+    )
+
+
+# Run the loader only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/scripts/load_postgis_internet_cables.py
+++ b/scripts/load_postgis_internet_cables.py
@@ -0,0 +1,428 @@
+#!/usr/bin/env python3
+"""Load internet_cables/*.json into PostGIS.
+
+Reads:
+  - internet_cables/all_cables.json          -> public.internet_cables (+ landing points)
+  - internet_cables/city_dominance_2026.json -> public.internet_city_dominance
+  - internet_cables/year-summaries.json      -> public.internet_cable_year_summaries
+  - internet_cables/meta.json                -> public.internet_cable_meta
+
+Requires env vars: PGWEB_HOST, PGWEB_PORT, PGWEB_USER, PGWEB_PASSWORD.
+"""
+import argparse
+import json
+import os
+import re
+from decimal import Decimal
+
+import psycopg2
+from psycopg2.extras import Json, execute_values
+
+
+# Default directory holding the JSON inputs (overridable with --data-dir).
+DATA_DIR = "internet_cables"
+# Database that receives all of the tables below.
+DB_NAME = "data_centers"
+
+# Schema-qualified target tables.
+CABLES_TABLE = "public.internet_cables"
+LANDINGS_TABLE = "public.internet_cable_landing_points"
+CITY_TABLE = "public.internet_city_dominance"
+YEAR_TABLE = "public.internet_cable_year_summaries"
+META_TABLE = "public.internet_cable_meta"
+
+# Captures the run of digits, commas and dots that precedes "km"
+# (case-insensitive, optional whitespace in between).
+LENGTH_KM_RE = re.compile(r"([\d,\.]+)\s*km", re.IGNORECASE)
+
+
+def parse_length_km(raw):
+    if not raw:
+        return None
+    match = LENGTH_KM_RE.search(raw)
+    if not match:
+        return None
+    try:
+        return Decimal(match.group(1).replace(",", ""))
+    except Exception:
+        return None
+
+
+def to_int(value):
+    if value in (None, ""):
+        return None
+    try:
+        return int(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def to_bool(value):
+    if value is None:
+        return None
+    return bool(value)
+
+
+def linestring_to_wkt(coords):
+    return "(" + ", ".join(f"{lon} {lat}" for lon, lat in coords) + ")"
+
+
+def feature_to_multilinestring_wkt(geometry):
+    gtype = geometry.get("type")
+    coords = geometry.get("coordinates") or []
+    if gtype == "MultiLineString":
+        parts = [linestring_to_wkt(line) for line in coords if len(line) >= 2]
+    elif gtype == "LineString":
+        parts = [linestring_to_wkt(coords)] if len(coords) >= 2 else []
+    else:
+        return None
+    if not parts:
+        return None
+    return "MULTILINESTRING(" + ", ".join(parts) + ")"
+
+
+def create_cable_tables(cur):
+    cur.execute(
+        f"""
+        create table {CABLES_TABLE} (
+            feature_id text primary key,
+            cable_id text,
+            name text,
+            color text,
+            owners text,
+            rfs_year integer,
+            decommission_year integer,
+            length_raw text,
+            length_km numeric,
+            cable_type text,
+            url text,
+            extra_urls jsonb,
+            properties jsonb,
+            geom geometry(MultiLineString, 4326)
+        )
+        """
+    )
+    cur.execute(
+        f"create index internet_cables_geom_gix on {CABLES_TABLE} using gist (geom)"
+    )
+    cur.execute(
+        f"create index internet_cables_cable_id_idx on {CABLES_TABLE} (cable_id)"
+    )
+    cur.execute(
+        f"create index internet_cables_rfs_year_idx on {CABLES_TABLE} (rfs_year)"
+    )
+
+    cur.execute(
+        f"""
+        create table {LANDINGS_TABLE} (
+            feature_id text references {CABLES_TABLE}(feature_id) on delete cascade,
+            ordinal integer,
+            landing_id text,
+            name text,
+            country text,
+            is_tbd boolean,
+            primary key (feature_id, ordinal)
+        )
+        """
+    )
+    cur.execute(
+        f"create index internet_cable_landings_landing_id_idx on {LANDINGS_TABLE} (landing_id)"
+    )
+    cur.execute(
+        f"create index internet_cable_landings_country_idx on {LANDINGS_TABLE} (country)"
+    )
+
+
+def create_city_table(cur):
+    cur.execute(
+        f"""
+        create table {CITY_TABLE} (
+            id text primary key,
+            city text,
+            country text,
+            country_name text,
+            region text,
+            status text,
+            physical_capacity_tbps numeric,
+            added_physical_capacity_tbps numeric,
+            logical_dominance_ips bigint,
+            top_asns jsonb,
+            longitude double precision,
+            latitude double precision,
+            geom geometry(Point, 4326) generated always as
+                (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
+        )
+        """
+    )
+    cur.execute(
+        f"create index internet_city_dominance_geom_gix on {CITY_TABLE} using gist (geom)"
+    )
+    cur.execute(
+        f"create index internet_city_dominance_country_idx on {CITY_TABLE} (country)"
+    )
+
+
+def create_year_table(cur):
+    cur.execute(
+        f"""
+        create table {YEAR_TABLE} (
+            year integer primary key,
+            description text
+        )
+        """
+    )
+
+
+def create_meta_table(cur):
+    cur.execute(
+        f"""
+        create table {META_TABLE} (
+            key text primary key,
+            value text
+        )
+        """
+    )
+
+
+def load_cables(cur, path):
+    with open(path, encoding="utf-8") as fh:
+        features = json.load(fh)
+
+    cable_rows = []
+    landing_rows = []
+    used_feature_ids = set()
+
+    for idx, feature in enumerate(features):
+        props = feature.get("properties") or {}
+        feature_id = props.get("feature_id") or props.get("id")
+        if not feature_id:
+            feature_id = f"legacy-{idx}"
+        # Disambiguate any residual collisions
+        base = feature_id
+        suffix = 1
+        while feature_id in used_feature_ids:
+            feature_id = f"{base}-{suffix}"
+            suffix += 1
+        used_feature_ids.add(feature_id)
+
+        # length may also live in a top-level lengthKm field on legacy entries
+        length_raw = props.get("length")
+        length_km = parse_length_km(length_raw)
+        if length_km is None and feature.get("lengthKm") is not None:
+            try:
+                length_km = Decimal(str(feature["lengthKm"]))
+            except Exception:
+                pass
+
+        wkt = feature_to_multilinestring_wkt(feature.get("geometry") or {})
+        cable_rows.append(
+            (
+                feature_id,
+                props.get("id"),
+                props.get("name"),
+                props.get("color"),
+                props.get("owners"),
+                to_int(props.get("rfs_year")),
+                to_int(props.get("decommission_year")),
+                length_raw,
+                length_km,
+                props.get("type"),
+                props.get("url"),
+                Json(props.get("extraUrls") or []),
+                Json(props),
+                wkt,
+            )
+        )
+
+        for ordinal, lp in enumerate(props.get("landing_points") or []):
+            landing_rows.append(
+                (
+                    feature_id,
+                    ordinal,
+                    lp.get("id") or None,
+                    lp.get("name"),
+                    lp.get("country"),
+                    to_bool(lp.get("is_tbd")),
+                )
+            )
+
+    execute_values(
+        cur,
+        f"""
+        insert into {CABLES_TABLE} (
+            feature_id, cable_id, name, color, owners, rfs_year, decommission_year,
+            length_raw, length_km, cable_type, url, extra_urls, properties, geom
+        ) values %s
+        """,
+        cable_rows,
+        template=(
+            "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
+            "ST_GeomFromText(%s, 4326))"
+        ),
+        page_size=200,
+    )
+
+    execute_values(
+        cur,
+        f"""
+        insert into {LANDINGS_TABLE} (feature_id, ordinal, landing_id, name, country, is_tbd)
+        values %s
+        """,
+        landing_rows,
+        page_size=500,
+    )
+
+    return len(cable_rows), len(landing_rows)
+
+
+def load_city_dominance(cur, path):
+    with open(path, encoding="utf-8") as fh:
+        items = json.load(fh)
+
+    rows = []
+    seen = set()
+    for item in items:
+        item_id = item.get("id")
+        if not item_id or item_id in seen:
+            continue
+        seen.add(item_id)
+        coords = item.get("coordinates") or [None, None]
+        lon, lat = (coords + [None, None])[:2]
+        rows.append(
+            (
+                item_id,
+                item.get("city"),
+                item.get("country"),
+                item.get("country_name"),
+                item.get("region"),
+                item.get("status"),
+                item.get("physical_capacity_tbps"),
+                item.get("added_physical_capacity_tbps"),
+                item.get("logical_dominance_ips"),
+                Json(item.get("top_asns") or []),
+                lon,
+                lat,
+            )
+        )
+
+    execute_values(
+        cur,
+        f"""
+        insert into {CITY_TABLE} (
+            id, city, country, country_name, region, status,
+            physical_capacity_tbps, added_physical_capacity_tbps,
+            logical_dominance_ips, top_asns, longitude, latitude
+        ) values %s
+        """,
+        rows,
+        page_size=500,
+    )
+    return len(rows)
+
+
+def load_year_summaries(cur, path):
+    with open(path, encoding="utf-8") as fh:
+        data = json.load(fh)
+    rows = []
+    for year_key, value in data.items():
+        year = to_int(year_key)
+        if year is None:
+            continue
+        description = value.get("description") if isinstance(value, dict) else str(value)
+        rows.append((year, description))
+    execute_values(
+        cur,
+        f"insert into {YEAR_TABLE} (year, description) values %s",
+        rows,
+        page_size=200,
+    )
+    return len(rows)
+
+
+def load_meta(cur, path):
+    with open(path, encoding="utf-8") as fh:
+        data = json.load(fh)
+    rows = [(str(k), str(v)) for k, v in data.items()]
+    execute_values(
+        cur,
+        f"insert into {META_TABLE} (key, value) values %s",
+        rows,
+    )
+    return len(rows)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Load internet_cables/*.json into PostGIS."
+    )
+    parser.add_argument(
+        "--data-dir",
+        default=DATA_DIR,
+        help=f"Directory containing the JSON files (default: {DATA_DIR})",
+    )
+    parser.add_argument(
+        "--replace",
+        action="store_true",
+        help="Drop existing target tables before loading.",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    cables_path = os.path.join(args.data_dir, "all_cables.json")
+    city_path = os.path.join(args.data_dir, "city_dominance_2026.json")
+    year_path = os.path.join(args.data_dir, "year-summaries.json")
+    meta_path = os.path.join(args.data_dir, "meta.json")
+
+    for path in [cables_path, city_path, year_path, meta_path]:
+        if not os.path.exists(path):
+            raise FileNotFoundError(path)
+
+    conn = psycopg2.connect(
+        host=os.environ["PGWEB_HOST"],
+        port=os.environ["PGWEB_PORT"],
+        user=os.environ["PGWEB_USER"],
+        password=os.environ["PGWEB_PASSWORD"],
+        dbname=DB_NAME,
+    )
+
+    try:
+        with conn:
+            with conn.cursor() as cur:
+                cur.execute("create extension if not exists postgis")
+
+                if args.replace:
+                    cur.execute(
+                        f"drop table if exists {LANDINGS_TABLE}, {CABLES_TABLE}, "
+                        f"{CITY_TABLE}, {YEAR_TABLE}, {META_TABLE} cascade"
+                    )
+
+                for table, creator in [
+                    (CABLES_TABLE, lambda c: create_cable_tables(c)),
+                    (CITY_TABLE, create_city_table),
+                    (YEAR_TABLE, create_year_table),
+                    (META_TABLE, create_meta_table),
+                ]:
+                    cur.execute("select to_regclass(%s)", (table,))
+                    if cur.fetchone()[0] is not None:
+                        raise RuntimeError(
+                            f"Target table {table} already exists; rerun with --replace to overwrite."
+                        )
+                    creator(cur)
+
+                cable_count, landing_count = load_cables(cur, cables_path)
+                city_count = load_city_dominance(cur, city_path)
+                year_count = load_year_summaries(cur, year_path)
+                meta_count = load_meta(cur, meta_path)
+
+                for table in [CABLES_TABLE, LANDINGS_TABLE, CITY_TABLE, YEAR_TABLE, META_TABLE]:
+                    cur.execute(f"analyze {table}")
+    finally:
+        conn.close()
+
+    print(
+        f"loaded {cable_count} cables, {landing_count} landing points, "
+        f"{city_count} city-dominance points, {year_count} year summaries, "
+        f"{meta_count} meta rows."
+    )
+
+
+# Run the loader only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/scripts/load_postgis_osm_data_centers.py
+++ b/scripts/load_postgis_osm_data_centers.py
@@ -0,0 +1,376 @@
+#!/usr/bin/env python3
+"""
+Fetch US data centers from OpenStreetMap (Overpass API) and load them into
+public.osm_data_centers in the data_centers database. Also (re)creates a
+unioned view public.data_centers_union combining OSM + curated rows from
+public.us_dc_sample_geocoded.
+
+Two Overpass passes are made because tagging is inconsistent:
+  1) telecom=data_center
+  2) building=data_center
+
+Results are deduplicated by (osm_type, osm_id); the matched tag-pass is recorded
+in match_tags so we can see which query found each feature.
+"""
+import argparse
+import json
+import os
+import sys
+import time
+from typing import Dict, List, Optional, Tuple
+
+import psycopg2
+import requests
+from psycopg2.extras import Json, execute_values
+
+# Overpass endpoint that fetch_pass() POSTs its queries to.
+OVERPASS_URL = "https://overpass-api.de/api/interpreter"
+# Table the OSM features are inserted into.
+TABLE = "public.osm_data_centers"
+# View that unions the curated rows with the OSM rows.
+VIEW = "public.data_centers_union"
+# Curated table read by the view; main() skips the view when it does not exist.
+CURATED_TABLE = "public.us_dc_sample_geocoded"
+# Database name handed to psycopg2.connect().
+DB_NAME = "data_centers"
+
+# Tag passes: (key, value). main() runs one Overpass query per pair, in this
+# order, and merges the results.
+TAG_PASSES = [
+    ("telecom", "data_center"),
+    ("building", "data_center"),
+]
+
+
+def overpass_query(tag_key: str, tag_value: str, timeout: int = 180) -> str:
+    """Build the Overpass QL text for one tag pass.
+
+    The query selects nodes, ways and relations carrying
+    ``tag_key=tag_value`` inside the area tagged ``ISO3166-1=US`` with
+    ``admin_level=2`` and requests ``out center tags`` output. ``timeout``
+    is written into the query's ``[timeout:...]`` setting.
+    """
+    selector = f'["{tag_key}"="{tag_value}"](area.us);'
+    statements = [
+        f"[out:json][timeout:{timeout}];",
+        'area["ISO3166-1"="US"][admin_level=2]->.us;',
+        "(",
+        *(f"  {kind}{selector}" for kind in ("node", "way", "relation")),
+        ");",
+        "out center tags;",
+    ]
+    return "\n".join(statements)
+
+
+def fetch_pass(tag_key: str, tag_value: str, cache_path: Optional[str]) -> List[dict]:
+    """Return the Overpass elements for one tag pass, using a JSON file cache.
+
+    When ``cache_path`` is given and the file exists, the payload is read
+    from it and no request is made. Otherwise the query is POSTed to
+    ``OVERPASS_URL`` and, when ``cache_path`` is given, the decoded payload
+    is written there for later runs.
+
+    Overpass reports query-level failures (for example a server-side
+    timeout) in a ``remark`` field of an otherwise successful response. Such
+    a payload is reported on stderr and is never cached, so a later run
+    retries the query instead of silently reusing an incomplete result.
+
+    Raises:
+        requests.HTTPError: from ``raise_for_status`` on a 4xx/5xx response.
+    """
+    if cache_path and os.path.exists(cache_path):
+        print(f"  using cached response: {cache_path}")
+        with open(cache_path, "r", encoding="utf-8") as fh:
+            payload = json.load(fh)
+    else:
+        query = overpass_query(tag_key, tag_value)
+        print(f"  querying Overpass for {tag_key}={tag_value} ...")
+        headers = {
+            "User-Agent": "us-data-centers-inventory/1.0 (research; contact david@dadams.io)",
+            "Accept": "application/json",
+        }
+        resp = requests.post(
+            OVERPASS_URL,
+            data={"data": query},
+            headers=headers,
+            timeout=240,
+        )
+        if resp.status_code != 200:
+            print(f"  Overpass returned {resp.status_code}: {resp.text[:500]}")
+        resp.raise_for_status()
+        payload = resp.json()
+        remark = payload.get("remark")
+        if remark:
+            # Do not persist a truncated result: it would be reused on every
+            # subsequent run until the cache file is deleted by hand.
+            print(
+                f"  Overpass remark: {remark} (result may be incomplete; not caching)",
+                file=sys.stderr,
+            )
+        elif cache_path:
+            with open(cache_path, "w", encoding="utf-8") as fh:
+                json.dump(payload, fh)
+            print(f"  cached to {cache_path}")
+    elements = payload.get("elements", [])
+    print(f"  pass returned {len(elements)} elements")
+    return elements
+
+
+def element_coords(elem: dict) -> Tuple[Optional[float], Optional[float]]:
+    """Return ``(lon, lat)`` for an Overpass element.
+
+    Nodes carry ``lon``/``lat`` directly; every other element type is read
+    from its ``center`` object. Missing values come back as ``None``.
+    """
+    holder = elem if elem.get("type") == "node" else (elem.get("center") or {})
+    return holder.get("lon"), holder.get("lat")
+
+
+def normalize_element(elem: dict, matched_tag: str) -> Optional[dict]:
+    """Flatten one Overpass element into a row dict, or return ``None``.
+
+    ``None`` is returned when the element has no coordinates (see
+    ``element_coords``) or lacks ``type``/``id``. ``matched_tag`` is stored
+    as the single entry of ``matched_tags``; the raw tag dict is kept under
+    ``tags``. ``country`` falls back to ``"US"`` when ``addr:country`` is
+    missing or empty.
+    """
+    lon, lat = element_coords(elem)
+    osm_type, osm_id = elem.get("type"), elem.get("id")
+    if lon is None or lat is None or osm_type is None or osm_id is None:
+        return None
+
+    tags = elem.get("tags") or {}
+    tag = tags.get
+
+    record = {
+        "id": f"{osm_type}/{osm_id}",
+        "osm_type": osm_type,
+        "osm_id": int(osm_id),
+    }
+    # Columns copied straight from a single OSM tag: (column, tag key).
+    for column, tag_key in (
+        ("name", "name"),
+        ("operator", "operator"),
+        ("operator_type", "operator:type"),
+        ("telecom", "telecom"),
+        ("building", "building"),
+        ("power", "power"),
+    ):
+        record[column] = tag(tag_key)
+
+    # Prefer the plain tag, then the contact:* variant.
+    record["website"] = tag("website") or tag("contact:website")
+    record["phone"] = tag("phone") or tag("contact:phone")
+
+    address_parts = [tag("addr:housenumber"), tag("addr:street")]
+    record["street_address"] = " ".join(filter(None, address_parts)) or None
+
+    record["city"] = tag("addr:city")
+    record["state"] = tag("addr:state")
+    record["postal_code"] = tag("addr:postcode")
+    record["country"] = tag("addr:country") or "US"
+    record["matched_tags"] = [matched_tag]
+    record["tags"] = tags
+    record["longitude"] = float(lon)
+    record["latitude"] = float(lat)
+    return record
+
+
+def merge_records(existing: Dict[str, dict], new_rows: List[dict]) -> None:
+    """Fold ``new_rows`` into ``existing`` in place, keyed by each row's ``id``.
+
+    A row with an unseen id is stored as-is. For an id that is already
+    present, ``matched_tags`` become the order-preserving union of both
+    lists, and every other column is filled in only where the stored value
+    is ``None`` or ``""`` and the incoming one is not.
+    """
+    blank = (None, "")
+    for incoming in new_rows:
+        record_id = incoming["id"]
+        current = existing.get(record_id)
+        if current is None:
+            existing[record_id] = incoming
+        else:
+            combined = current["matched_tags"] + incoming["matched_tags"]
+            # dict.fromkeys de-duplicates while keeping first-seen order.
+            current["matched_tags"] = list(dict.fromkeys(combined))
+            for column, value in incoming.items():
+                if (
+                    column != "matched_tags"
+                    and current.get(column) in blank
+                    and value not in blank
+                ):
+                    current[column] = value
+
+
+# Column order used by insert_values() for the INSERT column list.
+# row_to_tuple() emits its values in this same order, so keep the two in sync.
+COLUMNS = [
+    "id",
+    "osm_type",
+    "osm_id",
+    "name",
+    "operator",
+    "operator_type",
+    "telecom",
+    "building",
+    "power",
+    "website",
+    "phone",
+    "street_address",
+    "city",
+    "state",
+    "postal_code",
+    "country",
+    "matched_tags",
+    "tags",
+    "longitude",
+    "latitude",
+]
+
+
+def row_to_tuple(row: dict) -> tuple:
+    """Convert a normalized row dict into a value tuple ordered like ``COLUMNS``.
+
+    ``id``, ``osm_type``, ``osm_id``, ``longitude`` and ``latitude`` are
+    required (``KeyError`` if absent); the remaining columns default to
+    ``None``, ``matched_tags`` to ``[]`` and ``tags`` to ``{}``. ``tags`` is
+    wrapped in psycopg2's ``Json`` adapter.
+    """
+    identity = (row["id"], row["osm_type"], row["osm_id"])
+    optional_text = tuple(
+        row.get(column)
+        for column in (
+            "name",
+            "operator",
+            "operator_type",
+            "telecom",
+            "building",
+            "power",
+            "website",
+            "phone",
+            "street_address",
+            "city",
+            "state",
+            "postal_code",
+            "country",
+        )
+    )
+    trailing = (
+        row.get("matched_tags", []),
+        Json(row.get("tags", {})),
+        row["longitude"],
+        row["latitude"],
+    )
+    return identity + optional_text + trailing
+
+
+def create_table(cur) -> None:
+    """Create ``TABLE`` and its three indexes using cursor ``cur``.
+
+    ``geom`` is a stored generated column built from ``longitude`` and
+    ``latitude`` with SRID 4326. Indexes: GiST on ``geom``, b-tree on
+    ``state`` and GIN on ``tags``.
+    """
+    ddl = f"""
+        create table {TABLE} (
+            id text primary key,
+            osm_type text not null,
+            osm_id bigint not null,
+            name text,
+            operator text,
+            operator_type text,
+            telecom text,
+            building text,
+            power text,
+            website text,
+            phone text,
+            street_address text,
+            city text,
+            state text,
+            postal_code text,
+            country text,
+            matched_tags text[] not null default '{{}}',
+            tags jsonb not null default '{{}}'::jsonb,
+            longitude double precision not null,
+            latitude double precision not null,
+            ingested_at timestamptz not null default now(),
+            geom geometry(Point, 4326) generated always as
+                (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
+        )
+        """
+    index_statements = (
+        f"create index osm_data_centers_geom_gix on {TABLE} using gist (geom)",
+        f"create index osm_data_centers_state_idx on {TABLE} (state)",
+        f"create index osm_data_centers_tags_gin on {TABLE} using gin (tags)",
+    )
+    cur.execute(ddl)
+    for statement in index_statements:
+        cur.execute(statement)
+
+
+def insert_values(cur, rows: List[dict], upsert: bool) -> None:
+    """Bulk-insert ``rows`` into ``TABLE`` in pages of 200 via ``execute_values``.
+
+    With ``upsert`` true, a conflict on ``id`` overwrites every other column
+    in ``COLUMNS`` with the incoming value and sets ``ingested_at`` to
+    ``now()``. Without it the statement is a plain insert.
+    """
+    column_list = ", ".join(COLUMNS)
+    statement = f"insert into {TABLE} ({column_list}) values %s"
+    if upsert:
+        set_clause = ", ".join(
+            f"{name} = excluded.{name}" for name in COLUMNS if name != "id"
+        )
+        statement = (
+            f"{statement} on conflict (id) do update set {set_clause}, "
+            "ingested_at = now()"
+        )
+    values = list(map(row_to_tuple, rows))
+    execute_values(cur, statement, values, page_size=200)
+
+
+def create_or_replace_view(cur) -> None:
+    """Create or replace ``VIEW`` as ``CURATED_TABLE`` UNION ALL ``TABLE``.
+
+    Both branches expose the same column list. Curated ids are prefixed with
+    ``curated/`` and a literal ``source`` column (``'curated'`` or ``'osm'``)
+    records which table each row came from.
+    """
+    view_sql = f"""
+        create or replace view {VIEW} as
+        select
+            'curated/' || id as id,
+            'curated'::text as source,
+            facility_name as name,
+            provider as operator,
+            street_address,
+            city,
+            state_code as state,
+            postal_code,
+            country,
+            url as website,
+            phone,
+            longitude,
+            latitude,
+            geom
+        from {CURATED_TABLE}
+        union all
+        select
+            id,
+            'osm'::text as source,
+            name,
+            operator,
+            street_address,
+            city,
+            state,
+            postal_code,
+            country,
+            website,
+            phone,
+            longitude,
+            latitude,
+            geom
+        from {TABLE}
+        """
+    cur.execute(view_sql)
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the loader's command-line options.
+
+    Options: ``--cache-dir``, ``--no-cache``, ``--recreate``,
+    ``--upsert`` / ``--no-upsert`` (both stored in ``args.upsert``, which
+    defaults to ``True``) and ``--skip-view``.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--cache-dir",
+        default="output",
+        help="Directory to cache raw Overpass responses (default: output/).",
+    )
+    parser.add_argument(
+        "--no-cache",
+        action="store_true",
+        help="Do not read or write Overpass cache files; always hit the API.",
+    )
+    parser.add_argument(
+        "--recreate",
+        action="store_true",
+        help=f"Drop and recreate {TABLE} before loading.",
+    )
+    parser.add_argument(
+        "--upsert",
+        dest="upsert",
+        action="store_true",
+        default=True,
+        help="On id conflicts, update the existing row (default: on).",
+    )
+    # A store_true flag that defaults to True cannot be switched off by
+    # itself; this companion flag makes the non-upsert path reachable.
+    parser.add_argument(
+        "--no-upsert",
+        dest="upsert",
+        action="store_false",
+        help="Use a plain insert; an id conflict then raises an error.",
+    )
+    parser.add_argument(
+        "--skip-view",
+        action="store_true",
+        help=f"Do not create/replace the unioned view {VIEW}.",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    """Fetch every tag pass, merge the results and load them into PostGIS.
+
+    Returns 0 on success, or 1 when no features were fetched (the database
+    is not touched in that case). All database work runs in one transaction.
+    """
+    args = parse_args()
+
+    # The cache directory is only needed when caching is enabled.
+    if not args.no_cache:
+        os.makedirs(args.cache_dir, exist_ok=True)
+    merged: Dict[str, dict] = {}
+    final_pass = len(TAG_PASSES) - 1
+    for index, (tag_key, tag_value) in enumerate(TAG_PASSES):
+        cache_path = (
+            None
+            if args.no_cache
+            else os.path.join(args.cache_dir, f"overpass_{tag_key}_{tag_value}.json")
+        )
+        # fetch_pass() contacts Overpass only when there is no cache file to
+        # read; remember that before the call creates the file.
+        live_request = not (cache_path and os.path.exists(cache_path))
+        print(f"Pass: {tag_key}={tag_value}")
+        elements = fetch_pass(tag_key, tag_value, cache_path)
+        normalized = [
+            row for row in (normalize_element(e, f"{tag_key}={tag_value}") for e in elements)
+            if row is not None
+        ]
+        print(f"  normalized {len(normalized)} rows with coords")
+        merge_records(merged, normalized)
+        # be polite to Overpass: pause only after a live request that is
+        # followed by another pass
+        if live_request and index < final_pass:
+            time.sleep(2)
+
+    rows = list(merged.values())
+    print(f"Total deduped OSM data-center features: {len(rows)}")
+    if not rows:
+        print("No rows fetched; aborting DB load.", file=sys.stderr)
+        return 1
+
+    conn = psycopg2.connect(
+        host=os.environ["PGWEB_HOST"],
+        port=os.environ["PGWEB_PORT"],
+        user=os.environ["PGWEB_USER"],
+        password=os.environ["PGWEB_PASSWORD"],
+        dbname=DB_NAME,
+    )
+    try:
+        # "with conn" commits on success and rolls back on error.
+        with conn:
+            with conn.cursor() as cur:
+                cur.execute("create extension if not exists postgis")
+                if args.recreate:
+                    cur.execute(f"drop table if exists {TABLE} cascade")
+                cur.execute("select to_regclass(%s)", (TABLE,))
+                if cur.fetchone()[0] is None:
+                    create_table(cur)
+                insert_values(cur, rows, upsert=args.upsert)
+                cur.execute(f"analyze {TABLE}")
+                if not args.skip_view:
+                    # The view selects from the curated table, so it can only
+                    # be built when that table exists.
+                    cur.execute("select to_regclass(%s)", (CURATED_TABLE,))
+                    if cur.fetchone()[0] is not None:
+                        create_or_replace_view(cur)
+                        print(f"View {VIEW} (re)created.")
+                    else:
+                        print(
+                            f"Skipping view: {CURATED_TABLE} does not exist.",
+                            file=sys.stderr,
+                        )
+                cur.execute(f"select count(*) from {TABLE}")
+                total = cur.fetchone()[0]
+    finally:
+        conn.close()
+
+    print(f"Loaded {len(rows)} rows into {TABLE}; table now has {total} rows total.")
+    return 0
+
+
+# When run as a script, use main()'s return value as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())
--- a/scripts/make_data_center_map.py
+++ b/scripts/make_data_center_map.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import os
+from collections import Counter
+
+import psycopg2
+
+
+# Database name passed to psycopg2.connect() in connect().
+DB_NAME = "data_centers"
+# Table that load_points() reads the map points from.
+POINT_TABLE = "public.master_data_centers"
+
+
+def connect():
+    """Open a psycopg2 connection to ``DB_NAME``.
+
+    Host, port, user and password come from the ``PGWEB_HOST``,
+    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
+    variables; a missing variable raises ``KeyError``.
+    """
+    env = os.environ
+    settings = {
+        "host": env["PGWEB_HOST"],
+        "port": env["PGWEB_PORT"],
+        "user": env["PGWEB_USER"],
+        "password": env["PGWEB_PASSWORD"],
+        "dbname": DB_NAME,
+    }
+    return psycopg2.connect(**settings)
+
+
+def load_points(conn):
+    """Read every row of ``POINT_TABLE`` that has coordinates.
+
+    Returns a list of dicts with keys ``id``, ``source``, ``operator``,
+    ``name``, ``city``, ``state``, ``lon``, ``lat``, ``curated_id``,
+    ``osm_id``, ``match_method`` and ``geoid``. Text columns other than
+    ``source`` are coalesced to ``''`` in SQL; ``lon``/``lat`` are floats.
+    """
+    with conn.cursor() as cur:
+        sql = f"""
+            select
+                master_id,
+                source,
+                coalesce(operator, '') as operator,
+                coalesce(name, '') as name,
+                coalesce(city, '') as city,
+                coalesce(state, '') as state,
+                longitude,
+                latitude,
+                coalesce(curated_id, '') as curated_id,
+                coalesce(osm_id, '') as osm_id,
+                coalesce(match_method, '') as match_method,
+                coalesce(geoid, '') as geoid
+            from {POINT_TABLE}
+            where longitude is not null and latitude is not null
+            """
+        cur.execute(sql)
+        records = cur.fetchall()
+
+    def to_point(record):
+        # Column order follows the select list above.
+        (
+            master_id,
+            source,
+            operator,
+            name,
+            city,
+            state,
+            longitude,
+            latitude,
+            curated_id,
+            osm_id,
+            match_method,
+            geoid,
+        ) = record
+        return {
+            "id": master_id,
+            "source": source,
+            "operator": operator,
+            "name": name,
+            "city": city,
+            "state": state,
+            "lon": float(longitude),
+            "lat": float(latitude),
+            "curated_id": curated_id,
+            "osm_id": osm_id,
+            "match_method": match_method,
+            "geoid": geoid,
+        }
+
+    return [to_point(record) for record in records]
+
+
+def compute_center(points):
+    """Return ``(lat, lon)`` as the arithmetic mean of the points' coordinates.
+
+    With no points, the fixed fallback ``(39.5, -98.35)`` is returned.
+    """
+    if points:
+        count = len(points)
+        mean_lat = sum(p["lat"] for p in points) / count
+        mean_lon = sum(p["lon"] for p in points) / count
+        return mean_lat, mean_lon
+    return 39.5, -98.35
+
+
+def build_stats(points):
+    """Summarize points for the side panel.
+
+    Returns ``total`` plus per-``source`` and per-``match_method`` counts,
+    each as a dict ordered by key. Falsy sources are counted under
+    ``"(blank)"`` and falsy match methods under ``"(none)"``.
+    """
+
+    def tally(field, placeholder):
+        counts = Counter(p[field] or placeholder for p in points)
+        return {key: counts[key] for key in sorted(counts)}
+
+    return {
+        "total": len(points),
+        "by_source": tally("source", "(blank)"),
+        "by_match_method": tally("match_method", "(none)"),
+    }
+
+
+def _script_safe_json(value):
+    """Serialize ``value`` as JSON that is safe to inline in a ``<script>`` element."""
+    # "<", ">" and "&" can only occur inside JSON string values, so rewriting
+    # them as \uXXXX escapes leaves the decoded data unchanged while making it
+    # impossible for text such as "</script>" to end the enclosing script.
+    return (
+        json.dumps(value)
+        .replace("<", "\\u003c")
+        .replace(">", "\\u003e")
+        .replace("&", "\\u0026")
+    )
+
+
+def render_html(points, center_lat, center_lon, output_path):
+    """Write a self-contained Leaflet page for ``points`` to ``output_path``.
+
+    ``points`` and the summary from ``build_stats`` are inlined into the
+    page's script as JavaScript literals. They are serialized with
+    ``_script_safe_json`` because names and operators come from the database
+    and must not be able to break out of the ``<script>`` element.
+    ``center_lat``/``center_lon`` set the initial view; the page then fits
+    the map to the points' bounds when there are any.
+    """
+    stats = build_stats(points)
+    points_json = _script_safe_json(points)
+    stats_json = _script_safe_json(stats)
+
+    html = f"""<!doctype html>
+<html lang=\"en\">
+<head>
+  <meta charset=\"utf-8\" />
+  <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
+  <title>US Data Centers Master Map</title>
+  <link rel=\"stylesheet\" href=\"https://unpkg.com/leaflet@1.9.4/dist/leaflet.css\" />
+  <style>
+    html, body {{ height: 100%; margin: 0; font-family: system-ui, -apple-system, Segoe UI, sans-serif; }}
+    #layout {{ display: grid; grid-template-columns: 320px 1fr; height: 100%; }}
+    #panel {{ padding: 14px; border-right: 1px solid #ddd; overflow: auto; background: #f8fafb; }}
+    #map {{ height: 100%; width: 100%; }}
+    h1 {{ margin: 0 0 8px; font-size: 18px; }}
+    h2 {{ margin: 16px 0 8px; font-size: 14px; }}
+    .stat-row {{ display: flex; justify-content: space-between; padding: 2px 0; font-size: 13px; }}
+    .dot {{ width: 10px; height: 10px; border-radius: 50%; display: inline-block; margin-right: 8px; }}
+    @media (max-width: 900px) {{
+      #layout {{ grid-template-columns: 1fr; grid-template-rows: 220px 1fr; }}
+      #panel {{ border-right: 0; border-bottom: 1px solid #ddd; }}
+    }}
+  </style>
+</head>
+<body>
+  <div id=\"layout\">
+    <div id=\"panel\">
+      <h1>US Data Centers (Master)</h1>
+      <div class=\"stat-row\"><span>Total points</span><strong id=\"total\"></strong></div>
+      <h2>Source</h2>
+      <div id=\"sourceStats\"></div>
+      <h2>Match Method (merged rows)</h2>
+      <div id=\"matchStats\"></div>
+      <h2>Source Colors</h2>
+      <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#2ca02c\"></span>merged (curated + OSM)</span></div>
+      <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#1f77b4\"></span>curated only</span></div>
+      <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#ff7f0e\"></span>osm only</span></div>
+      <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#7f7f7f\"></span>other</span></div>
+    </div>
+    <div id=\"map\"></div>
+  </div>
+
+  <script src=\"https://unpkg.com/leaflet@1.9.4/dist/leaflet.js\"></script>
+  <script>
+    const points = {points_json};
+    const stats = {stats_json};
+
+    function colorForSource(source) {{
+      if (source === 'merged') return '#2ca02c';
+      if (source === 'curated') return '#1f77b4';
+      if (source === 'osm') return '#ff7f0e';
+      return '#7f7f7f';
+    }}
+
+    function escapeHtml(value) {{
+      return String(value || '')
+        .replaceAll('&', '&amp;')
+        .replaceAll('<', '&lt;')
+        .replaceAll('>', '&gt;')
+        .replaceAll('"', '&quot;')
+        .replaceAll("'", '&#39;');
+    }}
+
+    const map = L.map('map', {{ preferCanvas: true }}).setView([{center_lat}, {center_lon}], 5);
+
+    L.tileLayer('https://tile.openstreetmap.org/{{z}}/{{x}}/{{y}}.png', {{
+      maxZoom: 19,
+      attribution: '&copy; OpenStreetMap contributors'
+    }}).addTo(map);
+
+    const bounds = [];
+    for (const p of points) {{
+      const marker = L.circleMarker([p.lat, p.lon], {{
+        radius: 4,
+        color: colorForSource(p.source),
+        fillColor: colorForSource(p.source),
+        fillOpacity: 0.7,
+        weight: 1
+      }});
+
+      const title = p.name || p.id;
+      const operator = p.operator || '(unknown operator)';
+      const cityState = [p.city, p.state].filter(Boolean).join(', ');
+      const provenance = [
+        p.curated_id ? 'curated_id=' + escapeHtml(p.curated_id) : null,
+        p.osm_id ? 'osm_id=' + escapeHtml(p.osm_id) : null,
+        p.match_method ? 'match=' + escapeHtml(p.match_method) : null,
+      ].filter(Boolean).join('<br>');
+      marker.bindPopup(`
+        <strong>${{escapeHtml(title)}}</strong><br>
+        Operator: ${{escapeHtml(operator)}}<br>
+        Location: ${{escapeHtml(cityState)}}<br>
+        Source: ${{escapeHtml(p.source)}}<br>
+        ${{provenance ? provenance + '<br>' : ''}}
+        GEOID: ${{escapeHtml(p.geoid)}}
+      `);
+
+      marker.addTo(map);
+      bounds.push([p.lat, p.lon]);
+    }}
+
+    if (bounds.length > 0) {{
+      map.fitBounds(bounds, {{ padding: [20, 20] }});
+    }}
+
+    document.getElementById('total').textContent = stats.total;
+
+    const sourceStats = document.getElementById('sourceStats');
+    for (const [k, v] of Object.entries(stats.by_source)) {{
+      const div = document.createElement('div');
+      div.className = 'stat-row';
+      div.innerHTML = `<span>${{escapeHtml(k)}}</span><strong>${{v}}</strong>`;
+      sourceStats.appendChild(div);
+    }}
+
+    const matchStats = document.getElementById('matchStats');
+    for (const [k, v] of Object.entries(stats.by_match_method)) {{
+      const div = document.createElement('div');
+      div.className = 'stat-row';
+      div.innerHTML = `<span>${{escapeHtml(k)}}</span><strong>${{v}}</strong>`;
+      matchStats.appendChild(div);
+    }}
+  </script>
+</body>
+</html>
+"""
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(html)
+
+
+def parse_args():
+    """Parse the command line; the only option is ``--output``."""
+    cli = argparse.ArgumentParser(
+        description="Generate an interactive HTML map from the PostGIS point table."
+    )
+    output_options = {
+        "default": "data_center_map.html",
+        "help": "Output HTML path (default: data_center_map.html)",
+    }
+    cli.add_argument("--output", **output_options)
+    return cli.parse_args()
+
+
+def main():
+    """Load the points, render the map page and print a one-line summary.
+
+    The database connection is closed as soon as the points are loaded,
+    before any rendering happens.
+    """
+    options = parse_args()
+
+    connection = connect()
+    try:
+        points = load_points(connection)
+    finally:
+        connection.close()
+
+    center = compute_center(points)
+    render_html(points, center[0], center[1], options.output)
+    print(f"wrote {len(points)} points to {options.output}")
+
+
+# Call main() only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/scripts/make_internet_cables_map.py
+++ b/scripts/make_internet_cables_map.py
@@ -0,0 +1,338 @@
+#!/usr/bin/env python3
+"""Render a Leaflet HTML map combining US data centers, submarine cables,
+and city-level network-dominance points from PostGIS.
+"""
+import argparse
+import json
+import os
+
+import psycopg2
+
+
+# Database name passed to psycopg2.connect() in connect().
+DB_NAME = "data_centers"
+# Table read by load_data_centers().
+DC_TABLE = "public.master_data_centers"
+# Table read by load_cables().
+CABLES_TABLE = "public.internet_cables"
+# Table read by load_cities().
+CITY_TABLE = "public.internet_city_dominance"
+
+
+def connect():
+    """Open a psycopg2 connection to ``DB_NAME``.
+
+    Host, port, user and password are read from the ``PGWEB_HOST``,
+    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
+    variables; a missing variable raises ``KeyError``.
+    """
+    env = os.environ
+    settings = {
+        "host": env["PGWEB_HOST"],
+        "port": env["PGWEB_PORT"],
+        "user": env["PGWEB_USER"],
+        "password": env["PGWEB_PASSWORD"],
+        "dbname": DB_NAME,
+    }
+    return psycopg2.connect(**settings)
+
+
+def load_data_centers(conn):
+    """Read every row of ``DC_TABLE`` that has coordinates.
+
+    Returns a list of dicts with keys ``id``, ``source``, ``operator``,
+    ``name``, ``city``, ``state``, ``lon`` and ``lat``. The four text
+    columns after ``source`` are coalesced to ``''`` in SQL; ``lon``/``lat``
+    are floats.
+    """
+    with conn.cursor() as cur:
+        sql = f"""
+            select
+                master_id,
+                source,
+                coalesce(operator, ''),
+                coalesce(name, ''),
+                coalesce(city, ''),
+                coalesce(state, ''),
+                longitude,
+                latitude
+            from {DC_TABLE}
+            where longitude is not null and latitude is not null
+            """
+        cur.execute(sql)
+        records = cur.fetchall()
+
+    centers = []
+    for master_id, source, operator, name, city, state, longitude, latitude in records:
+        centers.append(
+            {
+                "id": master_id,
+                "source": source,
+                "operator": operator,
+                "name": name,
+                "city": city,
+                "state": state,
+                "lon": float(longitude),
+                "lat": float(latitude),
+            }
+        )
+    return centers
+
+
+def load_cables(conn):
+    """Read ``CABLES_TABLE`` rows with a geometry as a GeoJSON FeatureCollection.
+
+    Each feature's geometry is parsed from ``ST_AsGeoJSON(geom)``; its
+    properties are ``feature_id``, ``cable_id``, ``name``, ``color``
+    (``'#888888'`` when null), ``owners``, ``rfs_year``,
+    ``decommission_year``, ``length_km`` (float or ``None``) and ``url``.
+    """
+    with conn.cursor() as cur:
+        sql = f"""
+            select
+                feature_id,
+                coalesce(cable_id, ''),
+                coalesce(name, ''),
+                coalesce(color, '#888888'),
+                coalesce(owners, ''),
+                rfs_year,
+                decommission_year,
+                length_km,
+                coalesce(url, ''),
+                ST_AsGeoJSON(geom)
+            from {CABLES_TABLE}
+            where geom is not null
+            """
+        cur.execute(sql)
+        records = cur.fetchall()
+
+    def to_feature(record):
+        # Column order follows the select list above.
+        (
+            feature_id,
+            cable_id,
+            name,
+            color,
+            owners,
+            rfs_year,
+            decommission_year,
+            length_km,
+            url,
+            geometry_json,
+        ) = record
+        return {
+            "type": "Feature",
+            "geometry": json.loads(geometry_json),
+            "properties": {
+                "feature_id": feature_id,
+                "cable_id": cable_id,
+                "name": name,
+                "color": color,
+                "owners": owners,
+                "rfs_year": rfs_year,
+                "decommission_year": decommission_year,
+                "length_km": None if length_km is None else float(length_km),
+                "url": url,
+            },
+        }
+
+    return {
+        "type": "FeatureCollection",
+        "features": [to_feature(record) for record in records],
+    }
+
+
+def load_cities(conn, us_only=False):
+    """Read ``CITY_TABLE`` rows that have a geometry.
+
+    With ``us_only`` true the query is further restricted to
+    ``country = 'US'``. Returns a list of dicts with keys ``id``, ``city``,
+    ``country``, ``country_name``, ``region``, ``tbps`` (float or ``None``),
+    ``ips`` (int or ``None``), ``lon`` and ``lat``.
+    """
+    conditions = ["geom is not null"]
+    if us_only:
+        conditions.append("country = 'US'")
+    where = "where " + " and ".join(conditions)
+
+    with conn.cursor() as cur:
+        sql = f"""
+            select
+                id,
+                coalesce(city, ''),
+                coalesce(country, ''),
+                coalesce(country_name, ''),
+                coalesce(region, ''),
+                physical_capacity_tbps,
+                logical_dominance_ips,
+                longitude,
+                latitude
+            from {CITY_TABLE}
+            {where}
+            """
+        cur.execute(sql)
+        records = cur.fetchall()
+
+    cities = []
+    for city_id, city, country, country_name, region, tbps, ips, longitude, latitude in records:
+        cities.append(
+            {
+                "id": city_id,
+                "city": city,
+                "country": country,
+                "country_name": country_name,
+                "region": region,
+                "tbps": None if tbps is None else float(tbps),
+                "ips": None if ips is None else int(ips),
+                "lon": float(longitude),
+                "lat": float(latitude),
+            }
+        )
+    return cities
+
+
+def _script_safe_json(value):
+    """Serialize ``value`` as JSON that is safe to inline in a ``<script>`` element."""
+    # "<", ">" and "&" can only occur inside JSON string values, so rewriting
+    # them as \uXXXX escapes leaves the decoded data unchanged while making it
+    # impossible for text such as "</script>" to end the enclosing script.
+    return (
+        json.dumps(value)
+        .replace("<", "\\u003c")
+        .replace(">", "\\u003e")
+        .replace("&", "\\u0026")
+    )
+
+
+def render_html(data_centers, cables_geojson, cities, output_path):
+    """Write a self-contained Leaflet page with three layers to ``output_path``.
+
+    ``data_centers``, ``cables_geojson`` and ``cities`` are inlined into the
+    page as one JavaScript object that replaces the ``__PAYLOAD__`` marker.
+    The object is serialized with ``_script_safe_json`` because names, owners
+    and URLs come from the database and must not be able to break out of the
+    ``<script>`` element.
+    """
+    payload = _script_safe_json(
+        {
+            "data_centers": data_centers,
+            "cables": cables_geojson,
+            "cities": cities,
+        }
+    )
+
+    html = """<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1" />
+  <title>US Data Centers + Submarine Cables</title>
+  <link rel="stylesheet" href="https://unpkg.com/leaflet@1.9.4/dist/leaflet.css" />
+  <style>
+    html, body { height: 100%; margin: 0; font-family: system-ui, -apple-system, Segoe UI, sans-serif; }
+    #layout { display: grid; grid-template-columns: 300px 1fr; height: 100%; }
+    #panel { padding: 14px; border-right: 1px solid #ddd; overflow: auto; background: #f8fafb; font-size: 13px; }
+    #map { height: 100%; width: 100%; }
+    h1 { margin: 0 0 8px; font-size: 18px; }
+    h2 { margin: 14px 0 6px; font-size: 13px; text-transform: uppercase; color: #555; letter-spacing: 0.04em; }
+    .row { display: flex; justify-content: space-between; padding: 2px 0; }
+    .swatch { width: 12px; height: 12px; display: inline-block; margin-right: 8px; vertical-align: middle; border: 1px solid #ccc; }
+    label.toggle { display: block; padding: 3px 0; cursor: pointer; }
+    @media (max-width: 900px) {
+      #layout { grid-template-columns: 1fr; grid-template-rows: 220px 1fr; }
+      #panel { border-right: 0; border-bottom: 1px solid #ddd; }
+    }
+  </style>
+</head>
+<body>
+  <div id="layout">
+    <div id="panel">
+      <h1>Data Centers + Cables</h1>
+      <div class="row"><span>Data centers</span><strong id="dcCount"></strong></div>
+      <div class="row"><span>Submarine cables</span><strong id="cableCount"></strong></div>
+      <div class="row"><span>City dominance pts</span><strong id="cityCount"></strong></div>
+
+      <h2>Layers</h2>
+      <label class="toggle"><input type="checkbox" id="tDc" checked> Data centers</label>
+      <label class="toggle"><input type="checkbox" id="tCables" checked> Submarine cables</label>
+      <label class="toggle"><input type="checkbox" id="tCities" checked> City dominance</label>
+
+      <h2>Data center source</h2>
+      <div class="row"><span><span class="swatch" style="background:#2ca02c"></span>merged (curated + OSM)</span></div>
+      <div class="row"><span><span class="swatch" style="background:#1f77b4"></span>curated only</span></div>
+      <div class="row"><span><span class="swatch" style="background:#ff7f0e"></span>osm only</span></div>
+      <div class="row"><span><span class="swatch" style="background:#7f7f7f"></span>other</span></div>
+
+      <h2>City dominance</h2>
+      <div class="row"><span><span class="swatch" style="background:#9b59b6;border-radius:50%"></span>Sized by physical Tbps</span></div>
+    </div>
+    <div id="map"></div>
+  </div>
+
+  <script src="https://unpkg.com/leaflet@1.9.4/dist/leaflet.js"></script>
+  <script>
+    const DATA = __PAYLOAD__;
+
+    function colorForSource(source) {
+      if (source === 'merged') return '#2ca02c';
+      if (source === 'curated') return '#1f77b4';
+      if (source === 'osm') return '#ff7f0e';
+      return '#7f7f7f';
+    }
+
+    function esc(v) {
+      return String(v == null ? '' : v)
+        .replaceAll('&','&amp;').replaceAll('<','&lt;').replaceAll('>','&gt;')
+        .replaceAll('"','&quot;').replaceAll("'", '&#39;');
+    }
+
+    const map = L.map('map', { preferCanvas: true, worldCopyJump: true }).setView([20, -40], 3);
+
+    L.tileLayer('https://tile.openstreetmap.org/{z}/{x}/{y}.png', {
+      maxZoom: 19,
+      attribution: '&copy; OpenStreetMap contributors'
+    }).addTo(map);
+
+    const cableLayer = L.geoJSON(DATA.cables, {
+      style: f => ({
+        color: f.properties.color || '#888',
+        weight: 1.4,
+        opacity: 0.75,
+      }),
+      onEachFeature: (feature, layer) => {
+        const p = feature.properties;
+        const yrs = [p.rfs_year, p.decommission_year].filter(Boolean).join(' – ');
+        layer.bindPopup(`
+          <strong>${esc(p.name)}</strong><br>
+          ${p.url ? `<a href="${esc(p.url)}" target="_blank" rel="noopener">${esc(p.url)}</a><br>` : ''}
+          Owners: ${esc(p.owners)}<br>
+          ${yrs ? `Years: ${esc(yrs)}<br>` : ''}
+          ${p.length_km ? `Length: ${esc(p.length_km.toLocaleString())} km<br>` : ''}
+          ID: ${esc(p.cable_id || p.feature_id)}
+        `);
+      },
+    }).addTo(map);
+
+    const cityLayer = L.layerGroup();
+    for (const c of DATA.cities) {
+      const tbps = c.tbps || 0;
+      const radius = Math.max(2, Math.min(18, Math.sqrt(tbps) * 1.6));
+      const m = L.circleMarker([c.lat, c.lon], {
+        radius,
+        color: '#6c2a86',
+        fillColor: '#9b59b6',
+        fillOpacity: 0.45,
+        weight: 0.8,
+      });
+      m.bindPopup(`
+        <strong>${esc(c.city)}</strong> (${esc(c.country)})<br>
+        Region: ${esc(c.region)}<br>
+        Physical capacity: ${esc(tbps.toFixed ? tbps.toFixed(2) : tbps)} Tbps<br>
+        Logical dominance IPs: ${esc(c.ips ? c.ips.toLocaleString() : '')}
+      `);
+      cityLayer.addLayer(m);
+    }
+    cityLayer.addTo(map);
+
+    const dcLayer = L.layerGroup();
+    const dcBounds = [];
+    for (const p of DATA.data_centers) {
+      const m = L.circleMarker([p.lat, p.lon], {
+        radius: 3,
+        color: colorForSource(p.source),
+        fillColor: colorForSource(p.source),
+        fillOpacity: 0.85,
+        weight: 0.8,
+      });
+      const title = p.name || p.id;
+      const operator = p.operator || '(unknown operator)';
+      const cityState = [p.city, p.state].filter(Boolean).join(', ');
+      m.bindPopup(`
+        <strong>${esc(title)}</strong><br>
+        Operator: ${esc(operator)}<br>
+        Location: ${esc(cityState)}<br>
+        Source: ${esc(p.source)}
+      `);
+      dcLayer.addLayer(m);
+      dcBounds.push([p.lat, p.lon]);
+    }
+    dcLayer.addTo(map);
+
+    if (dcBounds.length) map.fitBounds(dcBounds, { padding: [30, 30], maxZoom: 5 });
+
+    function toggle(layer, on) {
+      if (on) { if (!map.hasLayer(layer)) layer.addTo(map); }
+      else { if (map.hasLayer(layer)) map.removeLayer(layer); }
+    }
+    document.getElementById('tDc').addEventListener('change', e => toggle(dcLayer, e.target.checked));
+    document.getElementById('tCables').addEventListener('change', e => toggle(cableLayer, e.target.checked));
+    document.getElementById('tCities').addEventListener('change', e => toggle(cityLayer, e.target.checked));
+
+    document.getElementById('dcCount').textContent = DATA.data_centers.length.toLocaleString();
+    document.getElementById('cableCount').textContent = DATA.cables.features.length.toLocaleString();
+    document.getElementById('cityCount').textContent = DATA.cities.length.toLocaleString();
+  </script>
+</body>
+</html>
+"""
+    html = html.replace("__PAYLOAD__", payload)
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(html)
+
+
+def parse_args():
+    """Parse the command line: ``--output`` and the ``--us-cities-only`` flag."""
+    parser = argparse.ArgumentParser(
+        description="Render a Leaflet map combining data centers, submarine cables, and city dominance."
+    )
+    option_specs = (
+        (("--output",), {"default": "data_centers_cables_map.html"}),
+        (
+            ("--us-cities-only",),
+            {
+                "action": "store_true",
+                "help": "Restrict the city-dominance layer to country='US'.",
+            },
+        ),
+    )
+    for flags, options in option_specs:
+        parser.add_argument(*flags, **options)
+    return parser.parse_args()
+
+
+def main():
+    """Load the three layers, render the map page and print a one-line summary.
+
+    The database connection is closed as soon as all three layers are
+    loaded, before any rendering happens.
+    """
+    options = parse_args()
+
+    connection = connect()
+    try:
+        dcs = load_data_centers(connection)
+        cables = load_cables(connection)
+        cities = load_cities(connection, us_only=options.us_cities_only)
+    finally:
+        connection.close()
+
+    render_html(dcs, cables, cities, options.output)
+
+    cable_count = len(cables["features"])
+    summary = (
+        f"wrote {len(dcs)} data centers, "
+        f"{cable_count} cables, "
+        f"{len(cities)} city points -> {options.output}"
+    )
+    print(summary)
+
+
+# Call main() only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()