Update 8 scripts to use Path(__file__).parent.parent as PROJECT_ROOT so they resolve data/, output/, and internet_cables/ relative to the project root rather than the caller's working directory. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
323 lines
13 KiB
Python
323 lines
13 KiB
Python
#!/usr/bin/env python3
|
||
"""Tract-level analysis of the 'concentrated costs / dispersed benefits' frame
for US data-center siting.

Cost-bearing universe = tracts that host at least one DC
    (public.data_center_census_tracts_2024)
Comparison universe = ACS 2024 5-yr tracts in the selected states
    (census_tract_acs_2024_selected_states.csv)
"""
|
||
import csv
|
||
import math
|
||
import os
|
||
import statistics
|
||
from collections import Counter
|
||
from pathlib import Path
|
||
|
||
import psycopg2
|
||
|
||
|
||
# Anchor data paths to the project root (two levels above this file) so the
# script behaves the same regardless of the caller's working directory.
# resolve() makes the path absolute first: when __file__ is a bare relative
# name such as "script.py", Path(__file__).parent.parent collapses to "."
# and the paths would silently become cwd-relative again.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
CSV_PATH = PROJECT_ROOT / "data" / "census_tract_acs_2024_selected_states.csv"
|
||
|
||
|
||
def connect():
    """Open a psycopg2 connection to the ``data_centers`` database.

    Host, port, user and password are read from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables; a missing variable raises ``KeyError``.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": "data_centers",
    }
    return psycopg2.connect(**settings)
|
||
|
||
|
||
def gini(values):
    """Return the Gini coefficient of *values*, or ``None`` if undefined.

    ``None`` entries and negative numbers are ignored.  The result is
    ``None`` when nothing remains or the remaining values sum to zero.
    """
    ordered = [x for x in values if x is not None and x >= 0]
    ordered.sort()
    n = len(ordered)
    total = sum(ordered)
    if n == 0 or total == 0:
        return None
    # Rank-weighted sum, with 1-based ranks over the ascending values.
    rank_weighted = sum(rank * x for rank, x in enumerate(ordered, 1))
    return (2 * rank_weighted) / (n * total) - (n + 1) / n
|
||
|
||
|
||
def hhi(shares):
    """Return the Herfindahl-Hirschman index: the sum of the squared *shares*."""
    squared = [share * share for share in shares]
    return sum(squared)
|
||
|
||
|
||
def median(xs):
    """Return the median of the non-``None`` items of *xs*, or ``None`` if there are none."""
    present = [x for x in xs if x is not None]
    if not present:
        return None
    return statistics.median(present)
|
||
|
||
|
||
def mean(xs):
    """Return the arithmetic mean of the non-``None`` items of *xs*, or ``None`` if there are none."""
    present = list(filter(lambda x: x is not None, xs))
    if not present:
        return None
    return statistics.mean(present)
|
||
|
||
|
||
def wmean(xs, ws):
    """Return the mean of *xs* weighted by *ws*.

    Pairs are taken positionally with ``zip``.  A pair is skipped when the
    value or the weight is ``None`` or when the weight is not positive.
    Returns ``None`` if no pair survives.
    """
    usable = []
    for value, weight in zip(xs, ws):
        if value is not None and weight is not None and weight > 0:
            usable.append((value, weight))
    if not usable:
        return None
    weight_sum = sum(weight for _, weight in usable)
    weighted_sum = sum(value * weight for value, weight in usable)
    return weighted_sum / weight_sum
|
||
|
||
|
||
def to_float(s):
    """Convert *s* with ``float()``; return ``None`` if it raises ``TypeError`` or ``ValueError``."""
    try:
        value = float(s)
    except (TypeError, ValueError):
        value = None
    return value
|
||
|
||
|
||
def to_int(s):
    """Convert *s* to ``int`` via ``float``, truncating toward zero.

    Going through ``float`` lets strings such as ``"12.0"`` or ``"1e3"``
    parse.  Returns ``None`` when *s* cannot be converted: wrong type,
    unparseable text, NaN, or an infinite value.
    """
    try:
        return int(float(s))
    except (TypeError, ValueError, OverflowError):
        # int(nan) raises ValueError; int(inf) raises OverflowError, which
        # previously escaped and aborted the whole load on a single bad cell.
        return None
|
||
|
||
|
||
def _as_float(value):
    """Return ``float(value)``, passing ``None`` through unchanged."""
    return None if value is None else float(value)


def _fmt(value, spec, missing="n/a"):
    """Format *value* with format spec *spec*; return *missing* for ``None``."""
    return missing if value is None else format(value, spec)


def _direction(lhs, rhs, if_greater, otherwise):
    """Return *if_greater* when ``lhs > rhs``, else *otherwise*; ``"n/a"`` if either is ``None``."""
    if lhs is None or rhs is None:
        return "n/a"
    return if_greater if lhs > rhs else otherwise


def _fetch_dc_tracts(cur):
    """Load one dict per row of ``public.data_center_census_tracts_2024``."""
    cur.execute(
        """
        select
            geoid,
            statefp,
            data_center_count,
            population,
            households,
            broadband_subscription_pct,
            median_household_income,
            per_capita_income,
            poverty_rate,
            non_hispanic_white_pct,
            non_hispanic_black_pct,
            hispanic_latino_pct,
            non_hispanic_asian_pct,
            primary_industry,
            land_area_sqm,
            industry_information_workers,
            industry_total_workers
        from public.data_center_census_tracts_2024
        """
    )
    # Percentages and incomes are coerced to float so they can be mixed with
    # the float medians computed from the CSV (a driver may hand numeric
    # columns back as Decimal, and Decimal - float raises TypeError).
    return [
        {
            "geoid": r[0],
            "statefp": r[1],
            "dc_count": r[2] or 0,
            "pop": r[3],
            "hh": r[4],
            "broadband_pct": _as_float(r[5]),
            "mhi": _as_float(r[6]),
            "pci": _as_float(r[7]),
            "poverty": _as_float(r[8]),
            "white_pct": _as_float(r[9]),
            "black_pct": _as_float(r[10]),
            "hisp_pct": _as_float(r[11]),
            "asian_pct": _as_float(r[12]),
            "primary_industry": r[13],
            "land_sqm": r[14],
            "info_workers": r[15],
            "total_workers": r[16],
        }
        for r in cur.fetchall()
    ]


def _fetch_cable_distances(cur):
    """Map each DC-tract geoid to the km distance from its centroid to the nearest cable."""
    cur.execute(
        """
        with cables as (select ST_Union(geom)::geography g from public.internet_cables)
        select t.geoid,
               ST_Distance(ST_Centroid(t.geom)::geography, c.g) / 1000.0
        from public.data_center_census_tracts_2024 t, cables c
        """
    )
    # The distance is NULL when either geometry is NULL; keep it as None
    # instead of crashing on float(None).
    return {geoid: _as_float(dist) for geoid, dist in cur.fetchall()}


def _read_universe(csv_path):
    """Load the ACS comparison tracts from *csv_path* as a list of dicts."""
    with open(csv_path, newline="", encoding="utf-8") as f:
        return [
            {
                "geoid": row["geoid"],
                "statefp": row["statefp"],
                "pop": to_int(row["population"]),
                "broadband_pct": to_float(row["broadband_subscription_pct"]),
                "mhi": to_int(row["median_household_income"]),
                "pci": to_int(row["per_capita_income"]),
                "poverty": to_float(row["poverty_rate"]),
                "white_pct": to_float(row["non_hispanic_white_pct"]),
                "black_pct": to_float(row["non_hispanic_black_pct"]),
                "hisp_pct": to_float(row["hispanic_latino_pct"]),
                "asian_pct": to_float(row["non_hispanic_asian_pct"]),
            }
            for row in csv.DictReader(f)
        ]


def main():
    """Print the tract-level concentrated-cost / dispersed-benefit report.

    Reads the DC-hosting tracts and their cable distances from the database
    returned by ``connect()``, reads the comparison tracts from ``CSV_PATH``,
    and prints four report sections plus a bottom-line summary to stdout.
    Statistics that cannot be computed (no usable data) are shown as ``n/a``.
    """
    # Always release the cursor and connection, even if a query fails.
    conn = connect()
    try:
        cur = conn.cursor()
        try:
            # DC-hosting tracts (the cost-bearing universe)
            dc_tracts = _fetch_dc_tracts(cur)
            # Distance from each DC tract to nearest cable (km)
            dist_by_geoid = _fetch_cable_distances(cur)
        finally:
            cur.close()
    finally:
        conn.close()

    for t in dc_tracts:
        t["dist_km"] = dist_by_geoid.get(t["geoid"])

    # Comparison universe from the wider ACS CSV
    universe = _read_universe(CSV_PATH)

    dc_geoids = {t["geoid"] for t in dc_tracts}
    non_dc = [u for u in universe if u["geoid"] not in dc_geoids]

    # Restrict comparison to states actually represented in the DC sample
    dc_states = {t["statefp"] for t in dc_tracts}
    universe_in_dc_states = [u for u in universe if u["statefp"] in dc_states]
    non_dc_in_dc_states = [u for u in non_dc if u["statefp"] in dc_states]

    # ============== report ==============
    print("=" * 72)
    print("TRACT-LEVEL CONCENTRATED COSTS / DISPERSED BENEFITS ANALYSIS")
    print("=" * 72)

    total_dc = sum(t["dc_count"] for t in dc_tracts)
    print(f"\nDC-hosting tracts: {len(dc_tracts):,}")
    print(f"Data centers in those tracts: {total_dc:,}")
    print(f"ACS universe (selected states): {len(universe):,} tracts")
    print(f"States represented in DC sample: {len(dc_states)}")
    print(f"Universe restricted to DC states: {len(universe_in_dc_states):,} tracts")

    if not total_dc:
        # Every share below divides by total_dc; nothing can be computed.
        print("\nNo data centers found; skipping concentration analysis.")
        return

    # --- Cost concentration at the tract level ---
    print("\n" + "-" * 72)
    print("1. COST CONCENTRATION (DCs across tracts)")
    print("-" * 72)
    counts = [t["dc_count"] for t in dc_tracts]
    shares = [c / total_dc for c in counts]
    g_dc = gini(counts)
    h_dc = hhi(shares)
    print(f"Gini of DC counts across DC-hosting tracts: {_fmt(g_dc, '.3f')}")
    print(f"HHI of DC shares across DC-hosting tracts: {h_dc:.4f}")

    # Top 1% / 5% / 20% of tracts share; each bucket holds at least one tract.
    top1 = max(1, len(counts) // 100)
    top5 = max(1, len(counts) // 20)
    top20 = max(1, len(counts) // 5)
    ranked = sorted(counts, reverse=True)
    top1_share = sum(ranked[:top1]) / total_dc * 100
    top5_share = sum(ranked[:top5]) / total_dc * 100
    top20_share = sum(ranked[:top20]) / total_dc * 100
    print(f"Top 1% of DC-hosting tracts ({top1:>3} tracts) hold "
          f"{top1_share:5.1f}% of all DCs")
    print(f"Top 5% of DC-hosting tracts ({top5:>3} tracts) hold "
          f"{top5_share:5.1f}% of all DCs")
    print(f"Top 20% of DC-hosting tracts ({top20:>3} tracts) hold "
          f"{top20_share:5.1f}% of all DCs")

    # How small a fraction of population lives in a DC tract?
    pop_dc = sum(t["pop"] or 0 for t in dc_tracts)
    pop_universe = sum(u["pop"] or 0 for u in universe_in_dc_states)
    print(f"\nPopulation living in a DC-hosting tract: {pop_dc:>11,}")
    print(f"Total population (DC-states ACS universe): {pop_universe:>11,}")
    if pop_universe:
        print(f" -> {pop_dc/pop_universe*100:.2f}% of people in DC-hosting states "
              f"live in a DC-hosting tract")
    # Per-capita DC density
    if pop_dc:
        print(f" -> 1 DC per {pop_dc/total_dc:,.0f} residents in DC-hosting tracts")
    if pop_universe:
        print(f" vs. 1 DC per {pop_universe/total_dc:,.0f} residents "
              f"averaged across DC-state population")

    # --- Profile of cost-bearing communities ---
    print("\n" + "-" * 72)
    print("2. WHO BEARS THE COSTS? (ACS profile of DC tracts vs. peer tracts)")
    print("-" * 72)
    fields = [
        ("Median household income ($)", "mhi", "{:>10,.0f}"),
        ("Per-capita income ($)", "pci", "{:>10,.0f}"),
        ("Broadband subscription (%)", "broadband_pct", "{:>10,.1f}"),
        ("Poverty rate (%)", "poverty", "{:>10,.1f}"),
        ("Non-Hispanic White (%)", "white_pct", "{:>10,.1f}"),
        ("Non-Hispanic Black (%)", "black_pct", "{:>10,.1f}"),
        ("Hispanic/Latino (%)", "hisp_pct", "{:>10,.1f}"),
        ("Non-Hispanic Asian (%)", "asian_pct", "{:>10,.1f}"),
    ]
    # Medians are computed once here and reused by the bottom-line summary.
    dc_medians = {key: median([t[key] for t in dc_tracts]) for _, key, _ in fields}
    peer_medians = {
        key: median([u[key] for u in non_dc_in_dc_states]) for _, key, _ in fields
    }
    label_w = max(len(lbl) for lbl, *_ in fields)
    print(f"{'Field':<{label_w}} {'DC tracts':>12} {'Non-DC peers':>14} "
          f"{'Δ (DC − peer)':>15}")
    for label, key, fmt in fields:
        dc_med = dc_medians[key]
        peer_med = peer_medians[key]
        if dc_med is None or peer_med is None:
            continue
        delta = dc_med - peer_med
        cell_dc = fmt.format(dc_med)
        cell_pe = fmt.format(peer_med)
        cell_dl = fmt.format(delta)
        print(f"{label:<{label_w}} {cell_dc} {cell_pe} {cell_dl}")

    print("\nPopulation-weighted means (DC tracts):")
    pops = [t["pop"] for t in dc_tracts]
    for label, key, _ in fields:
        wm = wmean([t[key] for t in dc_tracts], pops)
        if wm is not None:
            print(f" {label:<{label_w}} {wm:>12,.1f}")

    print("\nPrimary-industry mix of DC-hosting tracts (count of tracts):")
    for industry, n in Counter(t["primary_industry"] for t in dc_tracts).most_common(10):
        print(f" {n:>4} {industry}")

    # --- Cable vs. inland subgroups ---
    print("\n" + "-" * 72)
    print("3. CABLE-ADJACENT vs. INLAND DC TRACTS")
    print("-" * 72)
    near = [t for t in dc_tracts if t.get("dist_km") is not None and t["dist_km"] <= 100]
    far = [t for t in dc_tracts if t.get("dist_km") is not None and t["dist_km"] > 100]
    print(f"≤100 km from a submarine cable: {len(near):>3} tracts, "
          f"{sum(t['dc_count'] for t in near):>4} DCs")
    print(f">100 km from a submarine cable: {len(far):>3} tracts, "
          f"{sum(t['dc_count'] for t in far):>4} DCs")
    if near and far:
        subgroup_rows = [
            (" Median MHI", "mhi", ">10,.0f"),
            (" Median broadband %", "broadband_pct", ">10,.1f"),
            (" Median DC count", "dc_count", ">10,.0f"),
        ]
        for label, key, spec in subgroup_rows:
            # A subgroup whose values are all missing has no median.
            near_med = median([t[key] for t in near])
            far_med = median([t[key] for t in far])
            print(f"{label:<28} near={_fmt(near_med, spec)} "
                  f"far={_fmt(far_med, spec)}")

    # --- Benefit-side proxy ---
    print("\n" + "-" * 72)
    print("4. BENEFIT DISPERSION (broadband subscribers across all tracts)")
    print("-" * 72)
    # Subscribers per tract ~= households * broadband_pct.  The loaded CSV
    # fields do not include households, so they are estimated as
    # population / 2.5.
    subs = []
    for u in universe_in_dc_states:
        if u["pop"] and u["broadband_pct"] is not None:
            est_hh = u["pop"] / 2.5
            subs.append(est_hh * u["broadband_pct"] / 100.0)
    total_subs = sum(subs)
    sg = gini(subs)
    sh = hhi([x / total_subs for x in subs]) if total_subs else None
    # The ratio is undefined when there are no subscribers to compare against.
    ratio = _fmt(h_dc / sh if sh else None, ".0f")
    print(f"Estimated total broadband subscribers (DC states): {total_subs:>14,.0f}")
    print(f"Gini of subscribers across {len(subs):,} tracts: {_fmt(sg, '.3f')}")
    print(f"HHI of subscribers across tracts: {_fmt(sh, '.5f')}")
    # Compare to DC HHI
    print("\nSide-by-side concentration (lower = more dispersed):")
    print(f" HHI of DCs across DC-hosting tracts: {h_dc:.4f}")
    print(f" HHI of broadband subs across DC-state tracts: {_fmt(sh, '.5f')} "
          f"({ratio}x more concentrated for DCs)")

    print("\n" + "=" * 72)
    print("BOTTOM LINE")
    print("=" * 72)
    pop_share = f"{pop_dc/pop_universe*100:.2f}%" if pop_universe else "n/a"
    income_word = _direction(
        dc_medians["mhi"], peer_medians["mhi"], "wealthier", "poorer"
    )
    broadband_word = _direction(
        dc_medians["broadband_pct"], peer_medians["broadband_pct"], "higher", "lower"
    )
    print(
        "\n"
        "- DCs are extremely concentrated at the tract level: top 1% of host tracts\n"
        f"hold {top1_share:.0f}% of all DCs; top 5% hold {top5_share:.0f}%.\n"
        f"- Only {pop_share} of residents of the DC-hosting states actually\n"
        "live in a DC-hosting tract — costs (land use, power draw, water, traffic,\n"
        "noise) fall on a tiny minority of communities.\n"
        f"- DC-hosting tracts skew {income_word} and {broadband_word} broadband than peer\n"
        "tracts. See deltas above for the demographic profile.\n"
        "- Broadband subscribers (proxy for who consumes cloud services) are far more\n"
        f"evenly distributed than DCs — HHI roughly {ratio}× lower.\n"
        "That asymmetry IS the classic concentrated-cost / dispersed-benefit shape.\n"
    )
|
||
|
||
|
||
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|