Files
data-centers/scripts/analyze_dc_tract_concentration.py
dadams 6db5e0fff8 Fix path references in scripts after reorganization
Update 8 scripts to use Path(__file__).parent.parent as PROJECT_ROOT
so they resolve data/, output/, and internet_cables/ relative to the
project root rather than the caller's working directory.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 21:57:47 -07:00

323 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Tract-level analysis of the 'concentrated costs / dispersed benefits' frame
for US data-center siting.
Cost-bearing universe = tracts that host at least one DC
(public.data_center_census_tracts_2024)
Comparison universe = ACS 2024 5-yr tracts in the selected states
(census_tract_acs_2024_selected_states.csv)
"""
import csv
import math
import os
import statistics
from collections import Counter
from pathlib import Path
import psycopg2
# Project root is the parent of the directory containing this script.
# Resolve first: if __file__ is a relative path (e.g. the script is launched
# from inside its own directory on interpreters that do not absolutize it),
# Path("script.py").parent.parent collapses to "." and the data path would
# silently point at the wrong directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# ACS tract extract used as the comparison universe.
CSV_PATH = PROJECT_ROOT / "data" / "census_tract_acs_2024_selected_states.csv"
def connect():
    """Open a psycopg2 connection to the ``data_centers`` database.

    Host, port, user and password are read from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables. A missing variable raises ``KeyError``.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": "data_centers",
    }
    return psycopg2.connect(**settings)
def gini(values):
    """Return the Gini coefficient of the non-negative entries of *values*.

    ``None`` and negative entries are ignored. Returns ``None`` when no
    entries remain or when the remaining entries sum to zero.
    """
    ordered = [val for val in values if val is not None and val >= 0]
    ordered.sort()
    count = len(ordered)
    if count == 0:
        return None
    total = sum(ordered)
    if total == 0:
        return None
    # Rank-weighted sum over the ascending values, ranks starting at 1.
    ranked = sum(rank * val for rank, val in enumerate(ordered, start=1))
    return (2 * ranked) / (count * total) - (count + 1) / count
def hhi(shares):
    """Return the Herfindahl-Hirschman index: the sum of squared *shares*."""
    squared = [share * share for share in shares]
    return sum(squared)
def median(xs):
    """Return the median of the non-``None`` entries of *xs*.

    Returns ``None`` when there are no such entries.
    """
    present = [item for item in xs if item is not None]
    if not present:
        return None
    return statistics.median(present)
def mean(xs):
    """Return the arithmetic mean of the non-``None`` entries of *xs*.

    Returns ``None`` when there are no such entries.
    """
    present = [item for item in xs if item is not None]
    if not present:
        return None
    return statistics.mean(present)
def wmean(xs, ws):
    """Return the mean of *xs* weighted by *ws*.

    A value/weight pair is skipped when either element is ``None`` or the
    weight is not strictly positive. Returns ``None`` when no pair survives.
    """
    kept = []
    for value, weight in zip(xs, ws):
        if value is None or weight is None or not weight > 0:
            continue
        kept.append((value, weight))
    if not kept:
        return None
    weight_sum = sum(weight for _, weight in kept)
    weighted_sum = sum(value * weight for value, weight in kept)
    return weighted_sum / weight_sum
def to_float(s):
    """Convert *s* to ``float``; return ``None`` when it cannot be converted.

    Covers ``None`` and other unsupported types (``TypeError``) as well as
    empty or non-numeric strings (``ValueError``).
    """
    try:
        parsed = float(s)
    except (TypeError, ValueError):
        parsed = None
    return parsed
def to_int(s):
    """Convert *s* to ``int`` via ``float``; return ``None`` on failure.

    Going through ``float`` lets decimal strings such as ``"12.0"`` parse;
    the fractional part is truncated toward zero.

    Returns ``None`` for ``None`` or other unsupported types, for empty or
    non-numeric strings, and for NaN or infinite values, which have no
    integer representation.
    """
    try:
        return int(float(s))
    except (TypeError, ValueError, OverflowError):
        # int(float("nan")) raises ValueError; int(float("inf")) raises
        # OverflowError, which previously escaped and aborted the run.
        return None
def _opt_float(value):
    """Return ``float(value)``, passing ``None`` through unchanged."""
    return float(value) if value is not None else None


def _fmt(value, spec, missing="n/a"):
    """Format *value* with format-spec *spec*, or return *missing* for ``None``."""
    return format(value, spec) if value is not None else missing


def _direction(dc_value, peer_value, above, below):
    """Return *above* if ``dc_value > peer_value`` else *below*; ``"n/a"`` if either is ``None``."""
    if dc_value is None or peer_value is None:
        return "n/a"
    return above if dc_value > peer_value else below


def _load_dc_tracts():
    """Fetch the DC-hosting tracts and each tract's distance to the nearest cable.

    Returns a list of dicts, one per row of
    ``public.data_center_census_tracts_2024``, each with a ``dist_km`` entry
    (``None`` when the database returned no distance for that tract).
    """
    conn = connect()
    try:
        # psycopg2 cursors close themselves when used as a context manager;
        # the connection is closed in ``finally`` so a failing query does not
        # leak it.
        with conn.cursor() as cur:
            # DC-hosting tracts (the cost-bearing universe)
            cur.execute(
                """
                select
                geoid,
                statefp,
                data_center_count,
                population,
                households,
                broadband_subscription_pct,
                median_household_income,
                per_capita_income,
                poverty_rate,
                non_hispanic_white_pct,
                non_hispanic_black_pct,
                hispanic_latino_pct,
                non_hispanic_asian_pct,
                primary_industry,
                land_area_sqm,
                industry_information_workers,
                industry_total_workers
                from public.data_center_census_tracts_2024
                """
            )
            dc_tracts = [
                {
                    "geoid": r[0],
                    "statefp": r[1],
                    "dc_count": r[2] or 0,
                    "pop": r[3],
                    "hh": r[4],
                    "broadband_pct": _opt_float(r[5]),
                    "mhi": r[6],
                    "pci": r[7],
                    "poverty": _opt_float(r[8]),
                    "white_pct": _opt_float(r[9]),
                    "black_pct": _opt_float(r[10]),
                    "hisp_pct": _opt_float(r[11]),
                    "asian_pct": _opt_float(r[12]),
                    "primary_industry": r[13],
                    "land_sqm": r[14],
                    "info_workers": r[15],
                    "total_workers": r[16],
                }
                for r in cur.fetchall()
            ]
            # Distance from each DC tract to nearest cable (km)
            cur.execute(
                """
                with cables as (select ST_Union(geom)::geography g from public.internet_cables)
                select t.geoid,
                ST_Distance(ST_Centroid(t.geom)::geography, c.g) / 1000.0
                from public.data_center_census_tracts_2024 t, cables c
                """
            )
            # A NULL distance (e.g. missing geometry) stays None instead of
            # crashing float(); the report already skips tracts without one.
            dist_by_geoid = {r[0]: _opt_float(r[1]) for r in cur.fetchall()}
    finally:
        conn.close()
    for t in dc_tracts:
        t["dist_km"] = dist_by_geoid.get(t["geoid"])
    return dc_tracts


def _load_universe():
    """Read the comparison universe of tracts from ``CSV_PATH``."""
    with open(CSV_PATH, newline="", encoding="utf-8") as f:
        return [
            {
                "geoid": row["geoid"],
                "statefp": row["statefp"],
                "pop": to_int(row["population"]),
                "broadband_pct": to_float(row["broadband_subscription_pct"]),
                "mhi": to_int(row["median_household_income"]),
                "pci": to_int(row["per_capita_income"]),
                "poverty": to_float(row["poverty_rate"]),
                "white_pct": to_float(row["non_hispanic_white_pct"]),
                "black_pct": to_float(row["non_hispanic_black_pct"]),
                "hisp_pct": to_float(row["hispanic_latino_pct"]),
                "asian_pct": to_float(row["non_hispanic_asian_pct"]),
            }
            for row in csv.DictReader(f)
        ]


def main():
    """Print the tract-level concentrated-cost / dispersed-benefit report.

    Loads DC-hosting tracts from the database and the comparison tracts from
    the ACS CSV, then prints four sections (cost concentration, profile of
    host tracts vs. peers, cable-adjacent vs. inland tracts, benefit
    dispersion) followed by a summary. Statistics that cannot be computed
    (no data, zero denominators) are printed as ``n/a`` rather than raising.
    """
    dc_tracts = _load_dc_tracts()
    universe = _load_universe()

    dc_geoids = {t["geoid"] for t in dc_tracts}
    non_dc = [u for u in universe if u["geoid"] not in dc_geoids]
    # Restrict comparison to states actually represented in the DC sample
    dc_states = {t["statefp"] for t in dc_tracts}
    universe_in_dc_states = [u for u in universe if u["statefp"] in dc_states]
    non_dc_in_dc_states = [u for u in non_dc if u["statefp"] in dc_states]

    # ============== report ==============
    print("=" * 72)
    print("TRACT-LEVEL CONCENTRATED COSTS / DISPERSED BENEFITS ANALYSIS")
    print("=" * 72)
    total_dc = sum(t["dc_count"] for t in dc_tracts)
    print(f"\nDC-hosting tracts: {len(dc_tracts):,}")
    print(f"Data centers in those tracts: {total_dc:,}")
    print(f"ACS universe (selected states): {len(universe):,} tracts")
    print(f"States represented in DC sample: {len(dc_states)}")
    print(f"Universe restricted to DC states: {len(universe_in_dc_states):,} tracts")

    if not total_dc:
        # Every statistic below divides by the DC total; stop cleanly instead
        # of failing with ZeroDivisionError.
        print("\nNo data centers found; nothing to analyze.")
        return

    # --- Cost concentration at the tract level ---
    print("\n" + "-" * 72)
    print("1. COST CONCENTRATION (DCs across tracts)")
    print("-" * 72)
    counts = [t["dc_count"] for t in dc_tracts]
    shares = [c / total_dc for c in counts]
    g_dc = gini(counts)
    h_dc = hhi(shares)
    print(f"Gini of DC counts across DC-hosting tracts: {_fmt(g_dc, '.3f')}")
    print(f"HHI of DC shares across DC-hosting tracts: {h_dc:.4f}")
    # Top 1% / 5% / 20% of tracts share; each bucket holds at least one tract
    top1 = max(1, len(counts) // 100)
    top5 = max(1, len(counts) // 20)
    top20 = max(1, len(counts) // 5)
    s = sorted(counts, reverse=True)
    top1_share = sum(s[:top1]) / total_dc * 100
    top5_share = sum(s[:top5]) / total_dc * 100
    top20_share = sum(s[:top20]) / total_dc * 100
    print(f"Top 1% of DC-hosting tracts ({top1:>3} tracts) hold "
          f"{top1_share:5.1f}% of all DCs")
    print(f"Top 5% of DC-hosting tracts ({top5:>3} tracts) hold "
          f"{top5_share:5.1f}% of all DCs")
    print(f"Top 20% of DC-hosting tracts ({top20:>3} tracts) hold "
          f"{top20_share:5.1f}% of all DCs")
    # How small a fraction of population lives in a DC tract?
    pop_dc = sum(t["pop"] or 0 for t in dc_tracts)
    pop_universe = sum(u["pop"] or 0 for u in universe_in_dc_states)
    pop_share = pop_dc / pop_universe * 100 if pop_universe else None
    print(f"\nPopulation living in a DC-hosting tract: {pop_dc:>11,}")
    print(f"Total population (DC-states ACS universe): {pop_universe:>11,}")
    if pop_share is not None:
        print(f" -> {pop_share:.2f}% of people in DC-hosting states "
              f"live in a DC-hosting tract")
    # Per-capita DC density
    if pop_dc:
        print(f" -> 1 DC per {pop_dc/total_dc:,.0f} residents in DC-hosting tracts")
    if pop_universe:
        print(f" vs. 1 DC per {pop_universe/total_dc:,.0f} residents "
              f"averaged across DC-state population")

    # --- Profile of cost-bearing communities ---
    print("\n" + "-" * 72)
    print("2. WHO BEARS THE COSTS? (ACS profile of DC tracts vs. peer tracts)")
    print("-" * 72)
    fields = [
        ("Median household income ($)", "mhi", "{:>10,.0f}"),
        ("Per-capita income ($)", "pci", "{:>10,.0f}"),
        ("Broadband subscription (%)", "broadband_pct", "{:>10,.1f}"),
        ("Poverty rate (%)", "poverty", "{:>10,.1f}"),
        ("Non-Hispanic White (%)", "white_pct", "{:>10,.1f}"),
        ("Non-Hispanic Black (%)", "black_pct", "{:>10,.1f}"),
        ("Hispanic/Latino (%)", "hisp_pct", "{:>10,.1f}"),
        ("Non-Hispanic Asian (%)", "asian_pct", "{:>10,.1f}"),
    ]
    label_w = max(len(lbl) for lbl, *_ in fields)
    print(f"{'Field':<{label_w}} {'DC tracts':>12} {'Non-DC peers':>14} "
          f"{'Δ (DC peer)':>15}")
    # Medians are kept so the bottom-line summary can reuse them.
    dc_medians = {}
    peer_medians = {}
    for label, key, fmt in fields:
        dc_med = median([t[key] for t in dc_tracts])
        peer_med = median([u[key] for u in non_dc_in_dc_states])
        dc_medians[key] = dc_med
        peer_medians[key] = peer_med
        if dc_med is None or peer_med is None:
            continue
        delta = dc_med - peer_med
        cell_dc = fmt.format(dc_med)
        cell_pe = fmt.format(peer_med)
        cell_dl = fmt.format(delta)
        print(f"{label:<{label_w}} {cell_dc} {cell_pe} {cell_dl}")
    print("\nPopulation-weighted means (DC tracts):")
    pops = [t["pop"] for t in dc_tracts]
    for label, key, _ in fields:
        wm = wmean([t[key] for t in dc_tracts], pops)
        if wm is not None:
            print(f" {label:<{label_w}} {wm:>12,.1f}")
    print("\nPrimary-industry mix of DC-hosting tracts (count of tracts):")
    for industry, n in Counter(t["primary_industry"] for t in dc_tracts).most_common(10):
        print(f" {n:>4} {industry}")

    # --- Cable vs. inland subgroups ---
    print("\n" + "-" * 72)
    print("3. CABLE-ADJACENT vs. INLAND DC TRACTS")
    print("-" * 72)
    with_dist = [t for t in dc_tracts if t.get("dist_km") is not None]
    near = [t for t in with_dist if t["dist_km"] <= 100]
    far = [t for t in with_dist if t["dist_km"] > 100]
    print(f"≤100 km from a submarine cable: {len(near):>3} tracts, "
          f"{sum(t['dc_count'] for t in near):>4} DCs")
    print(f">100 km from a submarine cable: {len(far):>3} tracts, "
          f"{sum(t['dc_count'] for t in far):>4} DCs")
    if near and far:
        subgroup_rows = (
            (" Median MHI", "mhi", ">10,.0f"),
            (" Median broadband %", "broadband_pct", ">10,.1f"),
            (" Median DC count", "dc_count", ">10,.0f"),
        )
        for label, key, spec in subgroup_rows:
            # A subgroup whose values are all missing has no median.
            near_med = _fmt(median([t[key] for t in near]), spec)
            far_med = _fmt(median([t[key] for t in far]), spec)
            print(f"{label:<28} near={near_med} far={far_med}")

    # --- Benefit-side proxy ---
    print("\n" + "-" * 72)
    print("4. BENEFIT DISPERSION (broadband subscribers across all tracts)")
    print("-" * 72)
    # Total broadband subscribers approx = households * broadband_pct.
    # Households are not in the CSV, so estimate them as population / 2.5.
    persons_per_household = 2.5
    subs = []
    for u in universe_in_dc_states:
        if u["pop"] and u["broadband_pct"] is not None:
            est_hh = u["pop"] / persons_per_household
            subs.append(est_hh * u["broadband_pct"] / 100.0)
    total_subs = sum(subs)
    sg = gini(subs)
    sh = hhi([x / total_subs for x in subs]) if total_subs else None
    hhi_ratio = h_dc / sh if sh else None
    ratio_txt = _fmt(hhi_ratio, ".0f")
    print(f"Estimated total broadband subscribers (DC states): {total_subs:>14,.0f}")
    print(f"Gini of subscribers across {len(subs):,} tracts: {_fmt(sg, '.3f')}")
    print(f"HHI of subscribers across tracts: {_fmt(sh, '.5f')}")
    # Compare to DC HHI
    print("\nSide-by-side concentration (lower = more dispersed):")
    print(f" HHI of DCs across DC-hosting tracts: {h_dc:.4f}")
    print(f" HHI of broadband subs across DC-state tracts: {_fmt(sh, '.5f')} "
          f"({ratio_txt}x more concentrated for DCs)")

    print("\n" + "=" * 72)
    print("BOTTOM LINE")
    print("=" * 72)
    pop_share_txt = _fmt(pop_share, ".2f")
    income_word = _direction(
        dc_medians["mhi"], peer_medians["mhi"], "wealthier", "poorer"
    )
    broadband_word = _direction(
        dc_medians["broadband_pct"], peer_medians["broadband_pct"], "higher", "lower"
    )
    print(f"""
- DCs are extremely concentrated at the tract level: top 1% of host tracts
hold {top1_share:.0f}% of all DCs; top 5% hold {top5_share:.0f}%.
- Only {pop_share_txt}% of residents of the DC-hosting states actually
live in a DC-hosting tract — costs (land use, power draw, water, traffic,
noise) fall on a tiny minority of communities.
- DC-hosting tracts skew {income_word} and {broadband_word} broadband than peer
tracts. See deltas above for the demographic profile.
- Broadband subscribers (proxy for who consumes cloud services) are far more
evenly distributed than DCs — HHI roughly {ratio_txt}× lower.
That asymmetry IS the classic concentrated-cost / dispersed-benefit shape.
""")


if __name__ == "__main__":
    main()