# data-centers/scripts/analyze_dc_tract_concentration.py

#!/usr/bin/env python3
"""Tract-level analysis of the 'concentrated costs / dispersed benefits' frame
for US data-center siting.

Cost-bearing universe = tracts that host at least one DC
  (public.data_center_census_tracts_2024)
Comparison universe = ACS 2024 5-yr tracts in the selected states
  (census_tract_acs_2024_selected_states.csv)
"""
import csv
import math
import os
import statistics
from collections import Counter
from pathlib import Path

import psycopg2


# Directory one level above the directory that contains this file.
PROJECT_ROOT = Path(__file__).parent.parent
# ACS tract extract that serves as the comparison universe.
CSV_PATH = PROJECT_ROOT.joinpath(
    "data",
    "census_tract_acs_2024_selected_states.csv",
)


def connect():
    """Open a psycopg2 connection to the ``data_centers`` database.

    Host, port, user and password are read from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables. A missing variable raises ``KeyError``.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": "data_centers",
    }
    return psycopg2.connect(**settings)


def gini(values):
    """Return the Gini coefficient of *values*, or ``None`` when undefined.

    ``None`` entries and negative numbers are discarded first. The result is
    ``None`` when nothing remains or when the remaining values sum to zero.
    """
    ranked = [x for x in values if x is not None and x >= 0]
    ranked.sort()
    count = len(ranked)
    total = sum(ranked)
    if count == 0 or total == 0:
        return None
    # Rank-weighted sum over the ascending values, with 1-based ranks.
    weighted = sum(rank * x for rank, x in enumerate(ranked, start=1))
    return (2 * weighted) / (count * total) - (count + 1) / count


def hhi(shares):
    """Return the sum of the squared entries of *shares*.

    With shares expressed as fractions of a whole this is the
    Herfindahl-Hirschman index; no normalisation is applied here.
    """
    squared = [share * share for share in shares]
    return sum(squared)


def median(xs):
    """Return the median of the non-``None`` entries of *xs*.

    Returns ``None`` when *xs* holds no non-``None`` entry.
    """
    present = [x for x in xs if x is not None]
    if not present:
        return None
    return statistics.median(present)


def mean(xs):
    """Return the arithmetic mean of the non-``None`` entries of *xs*.

    Returns ``None`` when *xs* holds no non-``None`` entry.
    """
    present = [x for x in xs if x is not None]
    if not present:
        return None
    return statistics.mean(present)


def wmean(xs, ws):
    """Return the weighted mean of *xs* using weights *ws*.

    Values and weights are paired positionally. A pair is ignored when either
    member is ``None`` or the weight is not strictly positive. Returns
    ``None`` when no pair survives.
    """
    weights = []
    products = []
    for value, weight in zip(xs, ws):
        if value is None or weight is None:
            continue
        if weight > 0:
            weights.append(weight)
            products.append(value * weight)
    if not weights:
        return None
    return sum(products) / sum(weights)


def to_float(s):
    """Parse *s* as a finite float, returning ``None`` when that is impossible.

    ``None`` is returned for values ``float()`` rejects (``None``, empty or
    non-numeric strings) and also for NaN and infinities. The ``median`` and
    ``mean`` helpers in this file only filter out ``None``, so a NaN that
    slipped through would silently corrupt their results.
    """
    try:
        value = float(s)
    except (TypeError, ValueError):
        return None
    return value if math.isfinite(value) else None


def to_int(s):
    """Parse *s* as an integer via ``float``, truncating toward zero.

    Returns ``None`` when *s* cannot be converted: ``None``, empty or
    non-numeric strings, NaN (``ValueError``) and infinities
    (``OverflowError`` from ``int``).
    """
    try:
        return int(float(s))
    except (TypeError, ValueError, OverflowError):
        return None


def _fmt(value, spec, width=0):
    """Format *value* with *spec*; render ``None`` as ``n/a`` right-aligned to *width*."""
    if value is None:
        return "n/a".rjust(width)
    return format(value, spec)


def _opt_float(value):
    """Convert *value* to ``float``, passing ``None`` (SQL NULL) through unchanged."""
    return None if value is None else float(value)


def _skew_word(dc_value, peer_value, above, below):
    """Return *above* if ``dc_value > peer_value`` else *below*; ``n/a`` if either is ``None``."""
    if dc_value is None or peer_value is None:
        return "n/a"
    return above if dc_value > peer_value else below


def _fetch_dc_tracts(cur):
    """Load every row of ``public.data_center_census_tracts_2024`` as a dict."""
    cur.execute(
        """
        select
            geoid,
            statefp,
            data_center_count,
            population,
            households,
            broadband_subscription_pct,
            median_household_income,
            per_capita_income,
            poverty_rate,
            non_hispanic_white_pct,
            non_hispanic_black_pct,
            hispanic_latino_pct,
            non_hispanic_asian_pct,
            primary_industry,
            land_area_sqm,
            industry_information_workers,
            industry_total_workers
        from public.data_center_census_tracts_2024
        """
    )
    return [
        {
            "geoid": r[0],
            "statefp": r[1],
            "dc_count": r[2] or 0,  # NULL count is treated as zero
            "pop": r[3],
            "hh": r[4],
            "broadband_pct": _opt_float(r[5]),
            "mhi": r[6],
            "pci": r[7],
            "poverty": _opt_float(r[8]),
            "white_pct": _opt_float(r[9]),
            "black_pct": _opt_float(r[10]),
            "hisp_pct": _opt_float(r[11]),
            "asian_pct": _opt_float(r[12]),
            "primary_industry": r[13],
            "land_sqm": r[14],
            "info_workers": r[15],
            "total_workers": r[16],
        }
        for r in cur.fetchall()
    ]


def _fetch_cable_distances_km(cur):
    """Map each DC tract geoid to its centroid's distance (km) to the cable union.

    A NULL distance is kept as ``None`` rather than crashing ``float()``.
    """
    cur.execute(
        """
        with cables as (select ST_Union(geom)::geography g from public.internet_cables)
        select t.geoid,
               ST_Distance(ST_Centroid(t.geom)::geography, c.g) / 1000.0
        from public.data_center_census_tracts_2024 t, cables c
        """
    )
    return {geoid: _opt_float(dist) for geoid, dist in cur.fetchall()}


def _load_universe(path):
    """Read the ACS tract CSV at *path* into a list of dicts with parsed numbers."""
    with open(path, newline="", encoding="utf-8") as f:
        return [
            {
                "geoid": row["geoid"],
                "statefp": row["statefp"],
                "pop": to_int(row["population"]),
                "broadband_pct": to_float(row["broadband_subscription_pct"]),
                "mhi": to_int(row["median_household_income"]),
                "pci": to_int(row["per_capita_income"]),
                "poverty": to_float(row["poverty_rate"]),
                "white_pct": to_float(row["non_hispanic_white_pct"]),
                "black_pct": to_float(row["non_hispanic_black_pct"]),
                "hisp_pct": to_float(row["hispanic_latino_pct"]),
                "asian_pct": to_float(row["non_hispanic_asian_pct"]),
            }
            for row in csv.DictReader(f)
        ]


def main():
    """Print the tract-level concentration report to stdout.

    Loads the DC-hosting tracts and their cable distances from the database,
    loads the comparison universe from ``CSV_PATH``, then prints four report
    sections and a bottom-line summary. Statistics that cannot be computed
    (no usable values) are printed as ``n/a`` instead of raising.
    """
    # Release the cursor and connection even if a query raises.
    conn = connect()
    try:
        cur = conn.cursor()
        try:
            dc_tracts = _fetch_dc_tracts(cur)
            dist_by_geoid = _fetch_cable_distances_km(cur)
        finally:
            cur.close()
    finally:
        conn.close()
    for t in dc_tracts:
        t["dist_km"] = dist_by_geoid.get(t["geoid"])

    # Comparison universe from the wider ACS CSV ------------------------
    universe = _load_universe(CSV_PATH)

    dc_geoids = {t["geoid"] for t in dc_tracts}
    non_dc = [u for u in universe if u["geoid"] not in dc_geoids]

    # Restrict comparison to states actually represented in the DC sample
    dc_states = {t["statefp"] for t in dc_tracts}
    universe_in_dc_states = [u for u in universe if u["statefp"] in dc_states]
    non_dc_in_dc_states = [u for u in non_dc if u["statefp"] in dc_states]

    # ============== report ==============
    print("=" * 72)
    print("TRACT-LEVEL CONCENTRATED COSTS / DISPERSED BENEFITS ANALYSIS")
    print("=" * 72)

    total_dc = sum(t["dc_count"] for t in dc_tracts)
    print(f"\nDC-hosting tracts:                  {len(dc_tracts):,}")
    print(f"Data centers in those tracts:       {total_dc:,}")
    print(f"ACS universe (selected states):     {len(universe):,} tracts")
    print(f"States represented in DC sample:    {len(dc_states)}")
    print(f"Universe restricted to DC states:   {len(universe_in_dc_states):,} tracts")

    # Every share below divides by total_dc; with no DCs there is no report.
    if not total_dc:
        print("\nNo data centers found; nothing to analyze.")
        return

    # --- Cost concentration at the tract level ---
    print("\n" + "-" * 72)
    print("1. COST CONCENTRATION (DCs across tracts)")
    print("-" * 72)
    counts = [t["dc_count"] for t in dc_tracts]
    shares = [c / total_dc for c in counts]
    g_dc = gini(counts)
    h_dc = hhi(shares)
    print(f"Gini of DC counts across DC-hosting tracts:       {_fmt(g_dc, '.3f')}")
    print(f"HHI of DC shares across DC-hosting tracts:        {h_dc:.4f}")
    # Top 1% / 5% / 20% of tracts share; each bucket holds at least one tract.
    ranked = sorted(counts, reverse=True)
    top1 = max(1, len(counts) // 100)
    top5 = max(1, len(counts) // 20)
    top20 = max(1, len(counts) // 5)
    top1_pct = sum(ranked[:top1]) / total_dc * 100
    top5_pct = sum(ranked[:top5]) / total_dc * 100
    top20_pct = sum(ranked[:top20]) / total_dc * 100
    print(f"Top  1% of DC-hosting tracts ({top1:>3} tracts) hold "
          f"{top1_pct:5.1f}% of all DCs")
    print(f"Top  5% of DC-hosting tracts ({top5:>3} tracts) hold "
          f"{top5_pct:5.1f}% of all DCs")
    print(f"Top 20% of DC-hosting tracts ({top20:>3} tracts) hold "
          f"{top20_pct:5.1f}% of all DCs")

    # How small a fraction of population lives in a DC tract?
    pop_dc = sum(t["pop"] or 0 for t in dc_tracts)
    pop_universe = sum(u["pop"] or 0 for u in universe_in_dc_states)
    pop_share_pct = pop_dc / pop_universe * 100 if pop_universe else None
    print(f"\nPopulation living in a DC-hosting tract:          {pop_dc:>11,}")
    print(f"Total population (DC-states ACS universe):        {pop_universe:>11,}")
    if pop_share_pct is not None:
        print(f"  -> {pop_share_pct:.2f}% of people in DC-hosting states "
              f"live in a DC-hosting tract")
    # Per-capita DC density (total_dc is known to be non-zero here)
    if pop_dc:
        print(f"  -> 1 DC per {pop_dc/total_dc:,.0f} residents in DC-hosting tracts")
    if pop_universe:
        print(f"     vs. 1 DC per {pop_universe/total_dc:,.0f} residents "
              f"averaged across DC-state population")

    # --- Profile of cost-bearing communities ---
    print("\n" + "-" * 72)
    print("2. WHO BEARS THE COSTS? (ACS profile of DC tracts vs. peer tracts)")
    print("-" * 72)
    fields = [
        ("Median household income ($)",      "mhi",            "{:>10,.0f}"),
        ("Per-capita income ($)",            "pci",            "{:>10,.0f}"),
        ("Broadband subscription (%)",       "broadband_pct",  "{:>10,.1f}"),
        ("Poverty rate (%)",                 "poverty",        "{:>10,.1f}"),
        ("Non-Hispanic White (%)",           "white_pct",      "{:>10,.1f}"),
        ("Non-Hispanic Black (%)",           "black_pct",      "{:>10,.1f}"),
        ("Hispanic/Latino (%)",              "hisp_pct",       "{:>10,.1f}"),
        ("Non-Hispanic Asian (%)",           "asian_pct",      "{:>10,.1f}"),
    ]
    label_w = max(len(lbl) for lbl, *_ in fields)
    print(f"{'Field':<{label_w}}  {'DC tracts':>12}  {'Non-DC peers':>14}  "
          f"{'Δ (DC − peer)':>15}")
    for label, key, fmt in fields:
        dc_med = median([t[key] for t in dc_tracts])
        peer_med = median([u[key] for u in non_dc_in_dc_states])
        # Rows without data on either side are omitted from the table.
        if dc_med is None or peer_med is None:
            continue
        delta = dc_med - peer_med
        cell_dc = fmt.format(dc_med)
        cell_pe = fmt.format(peer_med)
        cell_dl = fmt.format(delta)
        print(f"{label:<{label_w}}  {cell_dc}  {cell_pe}  {cell_dl}")

    print("\nPopulation-weighted means (DC tracts):")
    pops = [t["pop"] for t in dc_tracts]
    for label, key, _ in fields:
        wm = wmean([t[key] for t in dc_tracts], pops)
        if wm is not None:
            print(f"  {label:<{label_w}}  {wm:>12,.1f}")

    print("\nPrimary-industry mix of DC-hosting tracts (count of tracts):")
    for industry, n in Counter(t["primary_industry"] for t in dc_tracts).most_common(10):
        print(f"  {n:>4}  {industry}")

    # --- Cable vs. inland subgroups ---
    print("\n" + "-" * 72)
    print("3. CABLE-ADJACENT vs. INLAND DC TRACTS")
    print("-" * 72)
    # Tracts with no computed distance fall into neither group.
    near = [t for t in dc_tracts if t.get("dist_km") is not None and t["dist_km"] <= 100]
    far = [t for t in dc_tracts if t.get("dist_km") is not None and t["dist_km"] > 100]
    print(f"≤100 km from a submarine cable: {len(near):>3} tracts, "
          f"{sum(t['dc_count'] for t in near):>4} DCs")
    print(f">100 km from a submarine cable: {len(far):>3} tracts, "
          f"{sum(t['dc_count'] for t in far):>4} DCs")
    if near and far:
        for row_label, key, spec in (
            ("  Median MHI", "mhi", ">10,.0f"),
            ("  Median broadband %", "broadband_pct", ">10,.1f"),
            ("  Median DC count", "dc_count", ">10,.0f"),
        ):
            near_med = _fmt(median([t[key] for t in near]), spec, 10)
            far_med = _fmt(median([t[key] for t in far]), spec, 10)
            print(f"{row_label:<28}  near={near_med}  far={far_med}")

    # --- Benefit-side proxy ---
    print("\n" + "-" * 72)
    print("4. BENEFIT DISPERSION (broadband subscribers across all tracts)")
    print("-" * 72)
    # Households are not in the CSV, so they are estimated as population / 2.5;
    # subscribers approx = estimated households * broadband_pct.
    subs = [
        (u["pop"] / 2.5) * u["broadband_pct"] / 100.0
        for u in universe_in_dc_states
        if u["pop"] and u["broadband_pct"] is not None
    ]
    total_subs = sum(subs)
    sg = gini(subs)
    sh = hhi([x / total_subs for x in subs]) if total_subs else None
    ratio_txt = _fmt(h_dc / sh if sh else None, ".0f")
    print(f"Estimated total broadband subscribers (DC states):  {total_subs:>14,.0f}")
    print(f"Gini of subscribers across {len(subs):,} tracts:       {_fmt(sg, '.3f')}")
    print(f"HHI of subscribers across tracts:                   {_fmt(sh, '.5f')}")
    # Compare to DC HHI
    print("\nSide-by-side concentration (lower = more dispersed):")
    print(f"  HHI of DCs across DC-hosting tracts:            {h_dc:.4f}")
    print(f"  HHI of broadband subs across DC-state tracts:   {_fmt(sh, '.5f')}  "
          f"({ratio_txt}x more concentrated for DCs)")

    print("\n" + "=" * 72)
    print("BOTTOM LINE")
    print("=" * 72)
    pop_share_txt = _fmt(pop_share_pct, ".2f")
    wealth_word = _skew_word(
        median([t["mhi"] for t in dc_tracts]),
        median([u["mhi"] for u in non_dc_in_dc_states]),
        "wealthier",
        "poorer",
    )
    broadband_word = _skew_word(
        median([t["broadband_pct"] for t in dc_tracts]),
        median([u["broadband_pct"] for u in non_dc_in_dc_states]),
        "higher",
        "lower",
    )
    print(f"""
- DCs are extremely concentrated at the tract level: top 1% of host tracts
  hold {top1_pct:.0f}% of all DCs; top 5% hold {top5_pct:.0f}%.
- Only {pop_share_txt}% of residents of the DC-hosting states actually
  live in a DC-hosting tract — costs (land use, power draw, water, traffic,
  noise) fall on a tiny minority of communities.
- DC-hosting tracts skew {wealth_word} and {broadband_word} broadband than peer
  tracts. See deltas above for the demographic profile.
- Broadband subscribers (proxy for who consumes cloud services) are far more
  evenly distributed than DCs — HHI roughly {ratio_txt}× lower.
  That asymmetry IS the classic concentrated-cost / dispersed-benefit shape.
""")


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()