#!/usr/bin/env python3
"""Tract-level analysis of the 'concentrated costs / dispersed benefits' frame
for US data-center siting.

Cost-bearing universe = tracts that host at least one DC
                        (public.data_center_census_tracts_2024)
Comparison universe   = ACS 2024 5-yr tracts in the selected states
                        (census_tract_acs_2024_selected_states.csv)
"""

import csv
import math
import os
import statistics
from collections import Counter
from contextlib import closing

try:
    import psycopg2
except ImportError:
    # Keep the pure-statistics helpers and report() importable on machines
    # without the PostgreSQL driver; connect() reports the problem instead.
    psycopg2 = None

CSV_PATH = "census_tract_acs_2024_selected_states.csv"

# The CSV has no household count, so households are approximated as
# population / PERSONS_PER_HOUSEHOLD when estimating broadband subscribers.
PERSONS_PER_HOUSEHOLD = 2.5

# Distance threshold (km) separating "cable-adjacent" from "inland" DC tracts.
CABLE_NEAR_KM = 100

# Placeholder printed wherever a statistic cannot be computed.
MISSING = "n/a"

RULE_WIDTH = 72


def connect():
    """Open a connection to the ``data_centers`` database.

    Host, port, user and password are read from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables.

    Raises:
        RuntimeError: if psycopg2 is not installed.
        KeyError: if one of the environment variables is not set.
    """
    if psycopg2 is None:
        raise RuntimeError(
            "psycopg2 is required to query the data_centers database"
        )
    return psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname="data_centers",
    )


def gini(values):
    """Return the Gini coefficient of the usable entries of *values*.

    ``None`` and negative entries are ignored.  Returns ``None`` when no
    usable entry remains or when the usable entries sum to zero.
    """
    v = sorted(x for x in values if x is not None and x >= 0)
    n = len(v)
    total = sum(v)
    if n == 0 or total == 0:
        return None
    # Rank-weighted sum over the ascending-sorted values (ranks start at 1).
    cum = sum(i * x for i, x in enumerate(v, 1))
    return (2 * cum) / (n * total) - (n + 1) / n


def hhi(shares):
    """Return the Herfindahl-Hirschman index: the sum of squared *shares*."""
    return sum(s * s for s in shares)


def median(xs):
    """Return the median of the non-``None`` entries of *xs*, or ``None``."""
    xs = [x for x in xs if x is not None]
    return statistics.median(xs) if xs else None


def mean(xs):
    """Return the mean of the non-``None`` entries of *xs*, or ``None``."""
    xs = [x for x in xs if x is not None]
    return statistics.mean(xs) if xs else None


def wmean(xs, ws):
    """Return the mean of *xs* weighted by *ws*.

    Pairs whose value is ``None`` or whose weight is ``None`` or not
    positive are dropped.  Returns ``None`` when no pair survives.
    """
    pairs = [
        (x, w)
        for x, w in zip(xs, ws)
        if x is not None and w is not None and w > 0
    ]
    if not pairs:
        return None
    total = sum(w for _, w in pairs)
    return sum(x * w for x, w in pairs) / total


def to_float(s):
    """Parse *s* as a finite float; return ``None`` if that is not possible.

    Non-finite results (``nan``, ``inf``) are treated as missing, because a
    NaN silently breaks the sorting that ``median`` and ``gini`` rely on.
    """
    try:
        value = float(s)
    except (TypeError, ValueError):
        return None
    return value if math.isfinite(value) else None


def to_int(s):
    """Parse *s* as a number and truncate it to ``int``; ``None`` on failure.

    Goes through ``to_float`` so that decimal strings such as ``"12.7"`` are
    accepted and so that ``"inf"`` yields ``None`` instead of raising
    ``OverflowError``.
    """
    value = to_float(s)
    return None if value is None else int(value)


def _opt_float(value):
    """Return ``float(value)``, passing ``None`` through unchanged."""
    return None if value is None else float(value)


def _fmt(value, spec, width=0):
    """Format *value* with *spec*, or right-align ``MISSING`` to *width*."""
    if value is None:
        return MISSING.rjust(width)
    return format(value, spec)


def _top_share(ranked_counts, k, total):
    """Return the percentage of *total* held by the first *k* ranked counts.

    Returns ``None`` when *total* is zero, so callers never divide by zero.
    """
    if not total:
        return None
    return sum(ranked_counts[:k]) / total * 100


def _skew_label(dc_value, peer_value, if_higher, if_not_higher):
    """Pick a label by comparing two medians; ``MISSING`` if either is absent."""
    if dc_value is None or peer_value is None:
        return MISSING
    return if_higher if dc_value > peer_value else if_not_higher


def _section(title):
    """Print a dashed section header."""
    print("\n" + "-" * RULE_WIDTH)
    print(title)
    print("-" * RULE_WIDTH)


def fetch_dc_tracts():
    """Load the DC-hosting tracts (the cost-bearing universe) from the database.

    Each tract is returned as a dict.  ``dist_km`` holds the distance from
    the tract centroid to the union of ``public.internet_cables`` geometries,
    or ``None`` when the database returned no distance for that tract.

    The cursor and connection are closed even if a query fails.
    """
    with closing(connect()) as conn, closing(conn.cursor()) as cur:
        cur.execute(
            """
            select geoid, statefp, data_center_count, population, households,
                   broadband_subscription_pct, median_household_income,
                   per_capita_income, poverty_rate, non_hispanic_white_pct,
                   non_hispanic_black_pct, hispanic_latino_pct,
                   non_hispanic_asian_pct, primary_industry, land_area_sqm,
                   industry_information_workers, industry_total_workers
            from public.data_center_census_tracts_2024
            """
        )
        dc_tracts = [
            {
                "geoid": r[0],
                "statefp": r[1],
                "dc_count": r[2] or 0,
                "pop": r[3],
                "hh": r[4],
                "broadband_pct": _opt_float(r[5]),
                "mhi": r[6],
                "pci": r[7],
                "poverty": _opt_float(r[8]),
                "white_pct": _opt_float(r[9]),
                "black_pct": _opt_float(r[10]),
                "hisp_pct": _opt_float(r[11]),
                "asian_pct": _opt_float(r[12]),
                "primary_industry": r[13],
                "land_sqm": r[14],
                "info_workers": r[15],
                "total_workers": r[16],
            }
            for r in cur.fetchall()
        ]

        # Distance from each DC tract to nearest cable (km).
        cur.execute(
            """
            with cables as (
                select ST_Union(geom)::geography g from public.internet_cables
            )
            select t.geoid,
                   ST_Distance(ST_Centroid(t.geom)::geography, c.g) / 1000.0
            from public.data_center_census_tracts_2024 t, cables c
            """
        )
        # The distance can come back NULL (e.g. a NULL or empty geometry).
        dist_by_geoid = {
            geoid: _opt_float(dist) for geoid, dist in cur.fetchall()
        }

    for t in dc_tracts:
        t["dist_km"] = dist_by_geoid.get(t["geoid"])
    return dc_tracts


def load_universe(path=CSV_PATH):
    """Load the comparison universe of ACS tracts from the CSV at *path*.

    Cells that cannot be parsed as numbers become ``None``.
    """
    with open(path, newline="", encoding="utf-8") as f:
        return [
            {
                "geoid": row["geoid"],
                "statefp": row["statefp"],
                "pop": to_int(row["population"]),
                "broadband_pct": to_float(row["broadband_subscription_pct"]),
                "mhi": to_int(row["median_household_income"]),
                "pci": to_int(row["per_capita_income"]),
                "poverty": to_float(row["poverty_rate"]),
                "white_pct": to_float(row["non_hispanic_white_pct"]),
                "black_pct": to_float(row["non_hispanic_black_pct"]),
                "hisp_pct": to_float(row["hispanic_latino_pct"]),
                "asian_pct": to_float(row["non_hispanic_asian_pct"]),
            }
            for row in csv.DictReader(f)
        ]


def report(dc_tracts, universe):
    """Print the concentrated-costs / dispersed-benefits report.

    Args:
        dc_tracts: tract dicts as produced by ``fetch_dc_tracts``.
        universe: tract dicts as produced by ``load_universe``.

    Any statistic that cannot be computed (no tracts, zero totals, all
    values missing) is printed as ``n/a`` rather than raising.
    """
    dc_geoids = {t["geoid"] for t in dc_tracts}
    non_dc = [u for u in universe if u["geoid"] not in dc_geoids]

    # Restrict comparison to states actually represented in the DC sample.
    dc_states = {t["statefp"] for t in dc_tracts}
    universe_in_dc_states = [u for u in universe if u["statefp"] in dc_states]
    non_dc_in_dc_states = [u for u in non_dc if u["statefp"] in dc_states]

    # ============== report ==============
    print("=" * RULE_WIDTH)
    print("TRACT-LEVEL CONCENTRATED COSTS / DISPERSED BENEFITS ANALYSIS")
    print("=" * RULE_WIDTH)

    counts = [t["dc_count"] for t in dc_tracts]
    total_dc = sum(counts)
    print(f"\nDC-hosting tracts: {len(dc_tracts):,}")
    print(f"Data centers in those tracts: {total_dc:,}")
    print(f"ACS universe (selected states): {len(universe):,} tracts")
    print(f"States represented in DC sample: {len(dc_states)}")
    print(f"Universe restricted to DC states: {len(universe_in_dc_states):,} tracts")

    # --- Cost concentration at the tract level ---
    _section("1. COST CONCENTRATION (DCs across tracts)")
    g_dc = gini(counts)
    h_dc = hhi(c / total_dc for c in counts) if total_dc else None
    print(f"Gini of DC counts across DC-hosting tracts: {_fmt(g_dc, '.3f')}")
    print(f"HHI of DC shares across DC-hosting tracts: {_fmt(h_dc, '.4f')}")

    # Share of all DCs held by the top 1% / 5% / 20% of tracts.  Every
    # bucket is floored at one tract so small samples stay meaningful.
    ranked = sorted(counts, reverse=True)
    top1 = max(1, len(counts) // 100)
    top5 = max(1, len(counts) // 20)
    top20 = max(1, len(counts) // 5)
    top1_share = _top_share(ranked, top1, total_dc)
    top5_share = _top_share(ranked, top5, total_dc)
    top20_share = _top_share(ranked, top20, total_dc)
    for label, k, share in (
        ("Top 1%", top1, top1_share),
        ("Top 5%", top5, top5_share),
        ("Top 20%", top20, top20_share),
    ):
        print(f"{label} of DC-hosting tracts ({k:>3} tracts) hold "
              f"{_fmt(share, '5.1f', 5)}% of all DCs")

    # How small a fraction of population lives in a DC tract?
    pop_dc = sum(t["pop"] or 0 for t in dc_tracts)
    pop_universe = sum(u["pop"] or 0 for u in universe_in_dc_states)
    pop_share = pop_dc / pop_universe * 100 if pop_universe else None
    print(f"\nPopulation living in a DC-hosting tract: {pop_dc:>11,}")
    print(f"Total population (DC-states ACS universe): {pop_universe:>11,}")
    if pop_share is not None:
        print(f" -> {pop_share:.2f}% of people in DC-hosting states "
              f"live in a DC-hosting tract")
    # Per-capita DC density
    if pop_dc and total_dc:
        print(f" -> 1 DC per {pop_dc/total_dc:,.0f} residents in DC-hosting tracts")
    if pop_universe and total_dc:
        print(f" vs. 1 DC per {pop_universe/total_dc:,.0f} residents "
              f"averaged across DC-state population")

    # --- Profile of cost-bearing communities ---
    _section("2. WHO BEARS THE COSTS? (ACS profile of DC tracts vs. peer tracts)")
    fields = [
        ("Median household income ($)", "mhi", ">10,.0f"),
        ("Per-capita income ($)", "pci", ">10,.0f"),
        ("Broadband subscription (%)", "broadband_pct", ">10,.1f"),
        ("Poverty rate (%)", "poverty", ">10,.1f"),
        ("Non-Hispanic White (%)", "white_pct", ">10,.1f"),
        ("Non-Hispanic Black (%)", "black_pct", ">10,.1f"),
        ("Hispanic/Latino (%)", "hisp_pct", ">10,.1f"),
        ("Non-Hispanic Asian (%)", "asian_pct", ">10,.1f"),
    ]
    label_w = max(len(lbl) for lbl, *_ in fields)
    print(f"{'Field':<{label_w}} {'DC tracts':>12} {'Non-DC peers':>14} "
          f"{'Δ (DC − peer)':>15}")
    for label, key, spec in fields:
        dc_med = median([t[key] for t in dc_tracts])
        peer_med = median([u[key] for u in non_dc_in_dc_states])
        if dc_med is None or peer_med is None:
            # No comparison is possible for this field; leave the row out.
            continue
        delta = dc_med - peer_med
        print(f"{label:<{label_w}} {format(dc_med, spec)} "
              f"{format(peer_med, spec)} {format(delta, spec)}")

    print("\nPopulation-weighted means (DC tracts):")
    pops = [t["pop"] for t in dc_tracts]
    for label, key, _ in fields:
        wm = wmean([t[key] for t in dc_tracts], pops)
        if wm is not None:
            print(f" {label:<{label_w}} {wm:>12,.1f}")

    print("\nPrimary-industry mix of DC-hosting tracts (count of tracts):")
    industry_counts = Counter(t["primary_industry"] for t in dc_tracts)
    for industry, n in industry_counts.most_common(10):
        print(f" {n:>4} {industry}")

    # --- Cable vs. inland subgroups ---
    _section("3. CABLE-ADJACENT vs. INLAND DC TRACTS")
    located = [t for t in dc_tracts if t.get("dist_km") is not None]
    near = [t for t in located if t["dist_km"] <= CABLE_NEAR_KM]
    far = [t for t in located if t["dist_km"] > CABLE_NEAR_KM]
    print(f"≤{CABLE_NEAR_KM} km from a submarine cable: {len(near):>3} tracts, "
          f"{sum(t['dc_count'] for t in near):>4} DCs")
    print(f">{CABLE_NEAR_KM} km from a submarine cable: {len(far):>3} tracts, "
          f"{sum(t['dc_count'] for t in far):>4} DCs")
    if near and far:
        for label, key, spec in (
            (" Median MHI", "mhi", ">10,.0f"),
            (" Median broadband %", "broadband_pct", ">10,.1f"),
            (" Median DC count", "dc_count", ">10,.0f"),
        ):
            near_med = median([t[key] for t in near])
            far_med = median([t[key] for t in far])
            print(f"{label:<28} near={_fmt(near_med, spec, 10)} "
                  f"far={_fmt(far_med, spec, 10)}")

    # --- Benefit-side proxy ---
    _section("4. BENEFIT DISPERSION (broadband subscribers across all tracts)")
    # Subscribers per tract ~= estimated households * broadband_pct, with
    # households estimated from population (see PERSONS_PER_HOUSEHOLD).
    subs = [
        u["pop"] / PERSONS_PER_HOUSEHOLD * u["broadband_pct"] / 100.0
        for u in universe_in_dc_states
        if u["pop"] and u["broadband_pct"] is not None
    ]
    total_subs = sum(subs)
    sg = gini(subs)
    sh = hhi(s / total_subs for s in subs) if total_subs else None
    # How many times more concentrated DCs are than subscribers.
    hhi_ratio = h_dc / sh if h_dc is not None and sh else None
    print(f"Estimated total broadband subscribers (DC states): {total_subs:>14,.0f}")
    print(f"Gini of subscribers across {len(subs):,} tracts: {_fmt(sg, '.3f')}")
    print(f"HHI of subscribers across tracts: {_fmt(sh, '.5f')}")

    # Compare to DC HHI
    print("\nSide-by-side concentration (lower = more dispersed):")
    print(f" HHI of DCs across DC-hosting tracts: {_fmt(h_dc, '.4f')}")
    print(f" HHI of broadband subs across DC-state tracts: {_fmt(sh, '.5f')} "
          f"({_fmt(hhi_ratio, '.0f')}x more concentrated for DCs)")

    print("\n" + "=" * RULE_WIDTH)
    print("BOTTOM LINE")
    print("=" * RULE_WIDTH)
    wealth = _skew_label(
        median([t["mhi"] for t in dc_tracts]),
        median([u["mhi"] for u in non_dc_in_dc_states]),
        "wealthier",
        "poorer",
    )
    broadband = _skew_label(
        median([t["broadband_pct"] for t in dc_tracts]),
        median([u["broadband_pct"] for u in non_dc_in_dc_states]),
        "higher",
        "lower",
    )
    print(f"""
- DCs are extremely concentrated at the tract level: top 1% of host tracts hold
  {_fmt(top1_share, '.0f')}% of all DCs; top 5% hold {_fmt(top5_share, '.0f')}%.
- Only {_fmt(pop_share, '.2f')}% of residents of the DC-hosting states actually live in a
  DC-hosting tract — costs (land use, power draw, water, traffic, noise) fall
  on a tiny minority of communities.
- DC-hosting tracts skew {wealth} and {broadband} broadband than peer tracts.
  See deltas above for the demographic profile.
- Broadband subscribers (proxy for who consumes cloud services) are far more
  evenly distributed than DCs — HHI roughly {_fmt(hhi_ratio, '.0f')}× lower.
  That asymmetry IS the classic concentrated-cost / dispersed-benefit shape.
""")


def main():
    """Fetch the DC tracts, load the ACS universe and print the report."""
    dc_tracts = fetch_dc_tracts()
    universe = load_universe(CSV_PATH)
    report(dc_tracts, universe)


if __name__ == "__main__":
    main()