Reorganize project into scripts/, docs/, data/, output/ directories

Move all Python scripts to scripts/, documentation to docs/, raw input data to data/, and generated HTML/CSV outputs to output/. Update path references in 8 scripts to use Path(__file__).parent.parent as project root so they work correctly from the new location. Update README links and quick-start commands accordingly. Notebooks remain at root.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 21:57:22 -07:00
parent a2e295d95b
commit ee5856661a
40 changed files with 31 additions and 30 deletions
--- a/scripts/analyze_dc_tract_concentration.py
+++ b/scripts/analyze_dc_tract_concentration.py
@@ -0,0 +1,320 @@
+#!/usr/bin/env python3
+"""Tract-level analysis of the 'concentrated costs / dispersed benefits' frame
+for US data-center siting.
+
+Cost-bearing universe = tracts that host at least one DC
+  (public.data_center_census_tracts_2024)
+Comparison universe = ACS 2024 5-yr tracts in the selected states
+  (census_tract_acs_2024_selected_states.csv)
+"""
+import csv
+import math
+import os
+import statistics
+from collections import Counter
+
+import psycopg2
+
+
+# ACS comparison-universe extract opened by main().
+# NOTE(review): this is a bare relative filename, so open() resolves it against
+# the current working directory, not the script's own location. The commit
+# message says scripts now live in scripts/ and derive the project root from
+# Path(__file__).parent.parent -- confirm where this CSV lives after the move
+# and whether this script should be anchored the same way.
+CSV_PATH = "census_tract_acs_2024_selected_states.csv"
+
+
+def connect():
+    """Open a psycopg2 connection to the ``data_centers`` database.
+
+    Host, port, user and password are read from the ``PGWEB_HOST``,
+    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
+    variables; a missing variable raises ``KeyError``.
+    """
+    env = os.environ
+    settings = {
+        "host": env["PGWEB_HOST"],
+        "port": env["PGWEB_PORT"],
+        "user": env["PGWEB_USER"],
+        "password": env["PGWEB_PASSWORD"],
+        "dbname": "data_centers",
+    }
+    return psycopg2.connect(**settings)
+
+
+def gini(values):
+    """Return the Gini coefficient of *values*, or ``None`` if undefined.
+
+    ``None`` entries and negative numbers are dropped before the calculation.
+    The result is ``None`` when nothing is left or the remaining values sum
+    to zero.
+    """
+    ranked = [x for x in values if x is not None and x >= 0]
+    ranked.sort()
+    count = len(ranked)
+    total = sum(ranked)
+    if count == 0 or total == 0:
+        return None
+    # Rank-weighted sum, using 1-based ranks over the ascending values.
+    weighted = sum(rank * x for rank, x in enumerate(ranked, 1))
+    return (2 * weighted) / (count * total) - (count + 1) / count
+
+
+def hhi(shares):
+    """Return the Herfindahl-Hirschman index: the sum of the squared *shares*."""
+    squared = [share * share for share in shares]
+    return sum(squared)
+
+
+def median(xs):
+    """Return the median of the non-``None`` items of *xs*, or ``None`` if none remain."""
+    present = list(filter(lambda item: item is not None, xs))
+    if not present:
+        return None
+    return statistics.median(present)
+
+
+def mean(xs):
+    """Return the arithmetic mean of the non-``None`` items of *xs*, or ``None`` if none remain."""
+    present = list(filter(lambda item: item is not None, xs))
+    if not present:
+        return None
+    return statistics.mean(present)
+
+
+def wmean(xs, ws):
+    """Return the mean of *xs* weighted by *ws*, or ``None`` if no pair is usable.
+
+    Values and weights are paired positionally. A pair is skipped when the
+    value is ``None``, the weight is ``None``, or the weight is not greater
+    than zero.
+    """
+    usable = []
+    for value, weight in zip(xs, ws):
+        if value is None or weight is None or not weight > 0:
+            continue
+        usable.append((value, weight))
+    if not usable:
+        return None
+    weight_sum = sum(weight for _, weight in usable)
+    weighted_sum = sum(value * weight for value, weight in usable)
+    return weighted_sum / weight_sum
+
+
+def to_float(s):
+    """Parse *s* as a float; return ``None`` when ``float(s)`` raises TypeError or ValueError."""
+    try:
+        parsed = float(s)
+    except (TypeError, ValueError):
+        parsed = None
+    return parsed
+
+
+def to_int(s):
+    """Parse *s* as an integer, truncating any fractional part toward zero.
+
+    The value is converted through ``float`` first, so text such as
+    ``"12.0"`` is accepted. Returns ``None`` for anything that cannot be
+    converted: ``None``, empty or non-numeric text, NaN, and infinity.
+    """
+    try:
+        return int(float(s))
+    except (TypeError, ValueError, OverflowError):
+        # int(nan) raises ValueError; int(+/-inf) raises OverflowError.
+        return None
+
+
+def _fmt(value, spec, width=0):
+    """Format *value* with *spec*; show ``None`` as ``n/a`` right-aligned to *width*."""
+    if value is None:
+        return "n/a".rjust(width)
+    return format(value, spec)
+
+
+def _skew_word(dc_value, peer_value, above, below):
+    """Return *above* if ``dc_value > peer_value`` else *below*; ``n/a`` if either is ``None``."""
+    if dc_value is None or peer_value is None:
+        return "n/a"
+    return above if dc_value > peer_value else below
+
+
+def _fetch_dc_tracts():
+    """Load the DC-hosting tracts from Postgres as a list of dicts.
+
+    Each dict carries the ACS columns of public.data_center_census_tracts_2024
+    plus ``dist_km``: the distance in km from the tract centroid to the union
+    of the geometries in public.internet_cables (``None`` when the database
+    returns no distance). The cursor and connection are closed even if a
+    query fails.
+    """
+
+    def opt_float(value):
+        return float(value) if value is not None else None
+
+    conn = connect()
+    try:
+        cur = conn.cursor()
+        try:
+            # DC-hosting tracts (the cost-bearing universe) --------------
+            cur.execute(
+                """
+                select
+                    geoid,
+                    statefp,
+                    data_center_count,
+                    population,
+                    households,
+                    broadband_subscription_pct,
+                    median_household_income,
+                    per_capita_income,
+                    poverty_rate,
+                    non_hispanic_white_pct,
+                    non_hispanic_black_pct,
+                    hispanic_latino_pct,
+                    non_hispanic_asian_pct,
+                    primary_industry,
+                    land_area_sqm,
+                    industry_information_workers,
+                    industry_total_workers
+                from public.data_center_census_tracts_2024
+                """
+            )
+            dc_tracts = [
+                {
+                    "geoid": r[0],
+                    "statefp": r[1],
+                    "dc_count": r[2] or 0,
+                    "pop": r[3],
+                    "hh": r[4],
+                    "broadband_pct": opt_float(r[5]),
+                    "mhi": r[6],
+                    "pci": r[7],
+                    "poverty": opt_float(r[8]),
+                    "white_pct": opt_float(r[9]),
+                    "black_pct": opt_float(r[10]),
+                    "hisp_pct": opt_float(r[11]),
+                    "asian_pct": opt_float(r[12]),
+                    "primary_industry": r[13],
+                    "land_sqm": r[14],
+                    "info_workers": r[15],
+                    "total_workers": r[16],
+                }
+                for r in cur.fetchall()
+            ]
+
+            # Distance from each DC tract to nearest cable (km) ----------
+            cur.execute(
+                """
+                with cables as (select ST_Union(geom)::geography g from public.internet_cables)
+                select t.geoid,
+                       ST_Distance(ST_Centroid(t.geom)::geography, c.g) / 1000.0
+                from public.data_center_census_tracts_2024 t, cables c
+                """
+            )
+            # A NULL distance is kept as None rather than crashing float().
+            dist_by_geoid = {r[0]: opt_float(r[1]) for r in cur.fetchall()}
+        finally:
+            cur.close()
+    finally:
+        conn.close()
+
+    for t in dc_tracts:
+        t["dist_km"] = dist_by_geoid.get(t["geoid"])
+    return dc_tracts
+
+
+def _load_universe(csv_path):
+    """Read the ACS comparison-universe CSV at *csv_path* into a list of dicts."""
+    with open(csv_path, newline="", encoding="utf-8") as f:
+        return [
+            {
+                "geoid": row["geoid"],
+                "statefp": row["statefp"],
+                "pop": to_int(row["population"]),
+                "broadband_pct": to_float(row["broadband_subscription_pct"]),
+                "mhi": to_int(row["median_household_income"]),
+                "pci": to_int(row["per_capita_income"]),
+                "poverty": to_float(row["poverty_rate"]),
+                "white_pct": to_float(row["non_hispanic_white_pct"]),
+                "black_pct": to_float(row["non_hispanic_black_pct"]),
+                "hisp_pct": to_float(row["hispanic_latino_pct"]),
+                "asian_pct": to_float(row["non_hispanic_asian_pct"]),
+            }
+            for row in csv.DictReader(f)
+        ]
+
+
+def main():
+    """Print the tract-level concentrated-costs / dispersed-benefits report.
+
+    Loads DC-hosting tracts from Postgres and the ACS comparison universe from
+    ``CSV_PATH``, then prints four sections (DC concentration, ACS profile of
+    DC tracts vs. non-DC peers, cable-adjacent vs. inland subgroups, and a
+    broadband-subscriber dispersion proxy) followed by a summary. Statistics
+    that cannot be computed are printed as ``n/a`` instead of raising.
+    """
+    dc_tracts = _fetch_dc_tracts()
+
+    # Comparison universe from the wider ACS CSV ------------------------
+    universe = _load_universe(CSV_PATH)
+
+    dc_geoids = {t["geoid"] for t in dc_tracts}
+    non_dc = [u for u in universe if u["geoid"] not in dc_geoids]
+
+    # Restrict comparison to states actually represented in the DC sample
+    dc_states = {t["statefp"] for t in dc_tracts}
+    universe_in_dc_states = [u for u in universe if u["statefp"] in dc_states]
+    non_dc_in_dc_states = [u for u in non_dc if u["statefp"] in dc_states]
+
+    # ============== report ==============
+    print("=" * 72)
+    print("TRACT-LEVEL CONCENTRATED COSTS / DISPERSED BENEFITS ANALYSIS")
+    print("=" * 72)
+
+    total_dc = sum(t["dc_count"] for t in dc_tracts)
+    print(f"\nDC-hosting tracts:                  {len(dc_tracts):,}")
+    print(f"Data centers in those tracts:       {total_dc:,}")
+    print(f"ACS universe (selected states):     {len(universe):,} tracts")
+    print(f"States represented in DC sample:    {len(dc_states)}")
+    print(f"Universe restricted to DC states:   {len(universe_in_dc_states):,} tracts")
+
+    if total_dc == 0:
+        # Every share and ratio below divides by total_dc.
+        print("\nNo data centers found in public.data_center_census_tracts_2024; "
+              "nothing to analyze.")
+        return
+
+    # --- Cost concentration at the tract level ---
+    print("\n" + "-" * 72)
+    print("1. COST CONCENTRATION (DCs across tracts)")
+    print("-" * 72)
+    counts = [t["dc_count"] for t in dc_tracts]
+    shares = [c / total_dc for c in counts]
+    g_dc = gini(counts)
+    h_dc = hhi(shares)
+    print(f"Gini of DC counts across DC-hosting tracts:       {_fmt(g_dc, '.3f')}")
+    print(f"HHI of DC shares across DC-hosting tracts:        {h_dc:.4f}")
+    # Top 1% / 5% / 20% of tracts share; each bucket holds at least one tract.
+    top1 = max(1, len(counts) // 100)
+    top5 = max(1, len(counts) // 20)
+    top20 = max(1, len(counts) // 5)
+    s = sorted(counts, reverse=True)
+    top1_pct = sum(s[:top1]) / total_dc * 100
+    top5_pct = sum(s[:top5]) / total_dc * 100
+    top20_pct = sum(s[:top20]) / total_dc * 100
+    print(f"Top  1% of DC-hosting tracts ({top1:>3} tracts) hold "
+          f"{top1_pct:5.1f}% of all DCs")
+    print(f"Top  5% of DC-hosting tracts ({top5:>3} tracts) hold "
+          f"{top5_pct:5.1f}% of all DCs")
+    print(f"Top 20% of DC-hosting tracts ({top20:>3} tracts) hold "
+          f"{top20_pct:5.1f}% of all DCs")
+
+    # How small a fraction of population lives in a DC tract?
+    pop_dc = sum(t["pop"] or 0 for t in dc_tracts)
+    pop_universe = sum(u["pop"] or 0 for u in universe_in_dc_states)
+    pop_share_pct = pop_dc / pop_universe * 100 if pop_universe else None
+    print(f"\nPopulation living in a DC-hosting tract:          {pop_dc:>11,}")
+    print(f"Total population (DC-states ACS universe):        {pop_universe:>11,}")
+    if pop_share_pct is not None:
+        print(f"  -> {pop_share_pct:.2f}% of people in DC-hosting states "
+              f"live in a DC-hosting tract")
+    # Per-capita DC density
+    if pop_dc:
+        print(f"  -> 1 DC per {pop_dc/total_dc:,.0f} residents in DC-hosting tracts")
+    if pop_universe:
+        print(f"     vs. 1 DC per {pop_universe/total_dc:,.0f} residents "
+              f"averaged across DC-state population")
+
+    # --- Profile of cost-bearing communities ---
+    print("\n" + "-" * 72)
+    print("2. WHO BEARS THE COSTS? (ACS profile of DC tracts vs. peer tracts)")
+    print("-" * 72)
+    fields = [
+        ("Median household income ($)",      "mhi",            "{:>10,.0f}"),
+        ("Per-capita income ($)",            "pci",            "{:>10,.0f}"),
+        ("Broadband subscription (%)",       "broadband_pct",  "{:>10,.1f}"),
+        ("Poverty rate (%)",                 "poverty",        "{:>10,.1f}"),
+        ("Non-Hispanic White (%)",           "white_pct",      "{:>10,.1f}"),
+        ("Non-Hispanic Black (%)",           "black_pct",      "{:>10,.1f}"),
+        ("Hispanic/Latino (%)",              "hisp_pct",       "{:>10,.1f}"),
+        ("Non-Hispanic Asian (%)",           "asian_pct",      "{:>10,.1f}"),
+    ]
+    # Medians are computed once here and reused by the summary at the end.
+    dc_medians = {
+        key: median([t[key] for t in dc_tracts]) for _, key, _ in fields
+    }
+    peer_medians = {
+        key: median([u[key] for u in non_dc_in_dc_states]) for _, key, _ in fields
+    }
+    label_w = max(len(lbl) for lbl, *_ in fields)
+    print(f"{'Field':<{label_w}}  {'DC tracts':>12}  {'Non-DC peers':>14}  "
+          f"{'Δ (DC − peer)':>15}")
+    for label, key, fmt in fields:
+        dc_med = dc_medians[key]
+        peer_med = peer_medians[key]
+        if dc_med is None or peer_med is None:
+            continue
+        delta = dc_med - peer_med
+        cell_dc = fmt.format(dc_med)
+        cell_pe = fmt.format(peer_med)
+        cell_dl = fmt.format(delta)
+        print(f"{label:<{label_w}}  {cell_dc}  {cell_pe}  {cell_dl}")
+
+    print("\nPopulation-weighted means (DC tracts):")
+    pops = [t["pop"] for t in dc_tracts]
+    for label, key, _ in fields:
+        wm = wmean([t[key] for t in dc_tracts], pops)
+        if wm is not None:
+            print(f"  {label:<{label_w}}  {wm:>12,.1f}")
+
+    print("\nPrimary-industry mix of DC-hosting tracts (count of tracts):")
+    for industry, n in Counter(t["primary_industry"] for t in dc_tracts).most_common(10):
+        print(f"  {n:>4}  {industry}")
+
+    # --- Cable vs. inland subgroups ---
+    print("\n" + "-" * 72)
+    print("3. CABLE-ADJACENT vs. INLAND DC TRACTS")
+    print("-" * 72)
+    near = [t for t in dc_tracts if t.get("dist_km") is not None and t["dist_km"] <= 100]
+    far = [t for t in dc_tracts if t.get("dist_km") is not None and t["dist_km"] > 100]
+    print(f"≤100 km from a submarine cable: {len(near):>3} tracts, "
+          f"{sum(t['dc_count'] for t in near):>4} DCs")
+    print(f">100 km from a submarine cable: {len(far):>3} tracts, "
+          f"{sum(t['dc_count'] for t in far):>4} DCs")
+    if near and far:
+        subgroup_rows = [
+            ("  Median MHI",          "mhi",           ">10,.0f"),
+            ("  Median broadband %",  "broadband_pct", ">10,.1f"),
+            ("  Median DC count",     "dc_count",      ">10,.0f"),
+        ]
+        for label, key, spec in subgroup_rows:
+            # A subgroup whose values are all NULL has no median.
+            near_med = median([t[key] for t in near])
+            far_med = median([t[key] for t in far])
+            print(f"{label:<28}  near={_fmt(near_med, spec, 10)}  "
+                  f"far={_fmt(far_med, spec, 10)}")
+
+    # --- Benefit-side proxy ---
+    print("\n" + "-" * 72)
+    print("4. BENEFIT DISPERSION (broadband subscribers across all tracts)")
+    print("-" * 72)
+    # Total broadband subscribers approx = households * broadband_pct;
+    # households are not in the CSV, so population / 2.5 stands in for them.
+    subs = [
+        (u["pop"] / 2.5) * u["broadband_pct"] / 100.0
+        for u in universe_in_dc_states
+        if u["pop"] and u["broadband_pct"] is not None
+    ]
+    total_subs = sum(subs)
+    sg = gini(subs)
+    sh = hhi([x / total_subs for x in subs]) if total_subs else None
+    hhi_ratio = h_dc / sh if sh else None
+    print(f"Estimated total broadband subscribers (DC states):  {total_subs:>14,.0f}")
+    print(f"Gini of subscribers across {len(subs):,} tracts:       {_fmt(sg, '.3f')}")
+    print(f"HHI of subscribers across tracts:                   {_fmt(sh, '.5f')}")
+    # Compare to DC HHI
+    if hhi_ratio is not None:
+        ratio_note = f"({hhi_ratio:.0f}x more concentrated for DCs)"
+    else:
+        ratio_note = "(ratio unavailable)"
+    print("\nSide-by-side concentration (lower = more dispersed):")
+    print(f"  HHI of DCs across DC-hosting tracts:            {h_dc:.4f}")
+    print(f"  HHI of broadband subs across DC-state tracts:   {_fmt(sh, '.5f')}  "
+          f"{ratio_note}")
+
+    print("\n" + "=" * 72)
+    print("BOTTOM LINE")
+    print("=" * 72)
+    wealth_word = _skew_word(
+        dc_medians["mhi"], peer_medians["mhi"], "wealthier", "poorer"
+    )
+    broadband_word = _skew_word(
+        dc_medians["broadband_pct"], peer_medians["broadband_pct"], "higher", "lower"
+    )
+    print(f"""
+- DCs are extremely concentrated at the tract level: top 1% of host tracts
+  hold {top1_pct:.0f}% of all DCs; top 5% hold {top5_pct:.0f}%.
+- Only {_fmt(pop_share_pct, '.2f')}% of residents of the DC-hosting states actually
+  live in a DC-hosting tract — costs (land use, power draw, water, traffic,
+  noise) fall on a tiny minority of communities.
+- DC-hosting tracts skew {wealth_word} and {broadband_word} broadband than peer
+  tracts. See deltas above for the demographic profile.
+- Broadband subscribers (proxy for who consumes cloud services) are far more
+  evenly distributed than DCs — HHI roughly {_fmt(hhi_ratio, '.0f')}× lower.
+  That asymmetry IS the classic concentrated-cost / dispersed-benefit shape.
+""")
+
+
+# Run the report only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()