Reorganize project into scripts/, docs/, data/, output/ directories
Move all Python scripts to scripts/, documentation to docs/, raw input data to data/, and generated HTML/CSV outputs to output/. Update path references in 8 scripts to use Path(__file__).parent.parent as project root so they work correctly from the new location. Update README links and quick-start commands accordingly. Notebooks remain at root. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
320
scripts/analyze_dc_tract_concentration.py
Normal file
320
scripts/analyze_dc_tract_concentration.py
Normal file
@@ -0,0 +1,320 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tract-level analysis of the 'concentrated costs / dispersed benefits' frame
|
||||
for US data-center siting.
|
||||
|
||||
Cost-bearing universe = tracts that host at least one DC
|
||||
(public.data_center_census_tracts_2024)
|
||||
Comparison universe = ACS 2024 5-yr tracts in the selected states
|
||||
(census_tract_acs_2024_selected_states.csv)
|
||||
"""
|
||||
import csv
|
||||
import math
|
||||
import os
|
||||
import statistics
|
||||
from collections import Counter
|
||||
|
||||
import psycopg2
|
||||
|
||||
|
||||
# ACS comparison-universe CSV read by main().
# NOTE(review): this is a bare relative path, so it is resolved against the
# current working directory rather than this script's location (the file
# lives under scripts/) — confirm where the CSV sits after the reorganisation.
CSV_PATH = "census_tract_acs_2024_selected_states.csv"
|
||||
|
||||
|
||||
def connect():
    """Open a psycopg2 connection to the ``data_centers`` database.

    Host, port, user and password are taken from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables; a missing variable raises ``KeyError``.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": "data_centers",
    }
    return psycopg2.connect(**settings)
|
||||
|
||||
|
||||
def gini(values):
    """Return the Gini coefficient of ``values``.

    ``None`` entries and negative numbers are dropped first.  Returns
    ``None`` when nothing is left or the remaining values sum to zero.
    """
    ordered = [x for x in values if x is not None and x >= 0]
    ordered.sort()
    count = len(ordered)
    total = sum(ordered)
    if count == 0 or total == 0:
        return None
    # Rank-weighted sum over the ascending values, ranks starting at 1.
    rank_weighted = sum(rank * x for rank, x in enumerate(ordered, start=1))
    return (2 * rank_weighted) / (count * total) - (count + 1) / count
|
||||
|
||||
|
||||
def hhi(shares):
    """Return the Herfindahl-Hirschman index: the sum of squared shares."""
    squared = [share * share for share in shares]
    return sum(squared)
|
||||
|
||||
|
||||
def median(xs):
    """Return the median of the non-``None`` items of ``xs``, or ``None`` if there are none."""
    present = list(filter(lambda item: item is not None, xs))
    if not present:
        return None
    return statistics.median(present)
|
||||
|
||||
|
||||
def mean(xs):
    """Return the arithmetic mean of the non-``None`` items of ``xs``, or ``None`` if there are none."""
    present = list(filter(lambda item: item is not None, xs))
    if not present:
        return None
    return statistics.mean(present)
|
||||
|
||||
|
||||
def wmean(xs, ws):
    """Return the weighted mean of ``xs`` using weights ``ws``.

    A value/weight pair is ignored when the value is ``None`` or the weight
    is ``None`` or not strictly positive.  Returns ``None`` when no pair
    survives.
    """
    kept = [
        (value, weight)
        for value, weight in zip(xs, ws)
        if value is not None and weight is not None and weight > 0
    ]
    if not kept:
        return None
    weight_sum = sum(weight for _, weight in kept)
    weighted_sum = sum(value * weight for value, weight in kept)
    return weighted_sum / weight_sum
|
||||
|
||||
|
||||
def to_float(s):
    """Convert ``s`` with ``float()``; return ``None`` if that raises TypeError or ValueError."""
    try:
        parsed = float(s)
    except (TypeError, ValueError):
        parsed = None
    return parsed
|
||||
|
||||
|
||||
def to_int(s):
    """Convert ``s`` to an int by way of ``float()``, truncating toward zero.

    Going through ``float`` lets decimal strings such as ``"12.7"`` parse.
    Returns ``None`` when ``s`` cannot be converted: ``None``, an empty or
    non-numeric string, NaN, or an infinite value.
    """
    try:
        return int(float(s))
    except (TypeError, ValueError, OverflowError):
        # int(float("inf")) raises OverflowError, not ValueError; treat it
        # like any other unparseable cell instead of crashing.
        return None
|
||||
|
||||
|
||||
def _cell(value, spec, width=0):
    """Format ``value`` with ``spec``; render ``None`` as 'n/a' right-aligned to ``width``."""
    if value is None:
        return "n/a".rjust(width)
    return format(value, spec)


def _opt_float(value):
    """Return ``float(value)``, passing ``None`` through unchanged."""
    return None if value is None else float(value)


def _fetch_dc_tracts(cur):
    """Query public.data_center_census_tracts_2024 and return one dict per row."""
    cur.execute(
        """
        select
            geoid,
            statefp,
            data_center_count,
            population,
            households,
            broadband_subscription_pct,
            median_household_income,
            per_capita_income,
            poverty_rate,
            non_hispanic_white_pct,
            non_hispanic_black_pct,
            hispanic_latino_pct,
            non_hispanic_asian_pct,
            primary_industry,
            land_area_sqm,
            industry_information_workers,
            industry_total_workers
        from public.data_center_census_tracts_2024
        """
    )
    return [
        {
            "geoid": r[0],
            "statefp": r[1],
            "dc_count": r[2] or 0,  # NULL count is treated as zero
            "pop": r[3],
            "hh": r[4],
            "broadband_pct": _opt_float(r[5]),
            "mhi": r[6],
            "pci": r[7],
            "poverty": _opt_float(r[8]),
            "white_pct": _opt_float(r[9]),
            "black_pct": _opt_float(r[10]),
            "hisp_pct": _opt_float(r[11]),
            "asian_pct": _opt_float(r[12]),
            "primary_industry": r[13],
            "land_sqm": r[14],
            "info_workers": r[15],
            "total_workers": r[16],
        }
        for r in cur.fetchall()
    ]


def _fetch_cable_distances_km(cur):
    """Return {geoid: distance in km from the tract centroid to the unioned cables}.

    A NULL distance from the query is kept as ``None`` rather than passed to
    ``float()``; the report already skips tracts whose distance is ``None``.
    """
    cur.execute(
        """
        with cables as (select ST_Union(geom)::geography g from public.internet_cables)
        select t.geoid,
               ST_Distance(ST_Centroid(t.geom)::geography, c.g) / 1000.0
        from public.data_center_census_tracts_2024 t, cables c
        """
    )
    return {geoid: _opt_float(dist) for geoid, dist in cur.fetchall()}


def _read_acs_universe(path):
    """Read the ACS tract CSV at ``path`` into a list of dicts with parsed numeric fields."""
    with open(path, newline="", encoding="utf-8") as f:
        return [
            {
                "geoid": row["geoid"],
                "statefp": row["statefp"],
                "pop": to_int(row["population"]),
                "broadband_pct": to_float(row["broadband_subscription_pct"]),
                "mhi": to_int(row["median_household_income"]),
                "pci": to_int(row["per_capita_income"]),
                "poverty": to_float(row["poverty_rate"]),
                "white_pct": to_float(row["non_hispanic_white_pct"]),
                "black_pct": to_float(row["non_hispanic_black_pct"]),
                "hisp_pct": to_float(row["hispanic_latino_pct"]),
                "asian_pct": to_float(row["non_hispanic_asian_pct"]),
            }
            for row in csv.DictReader(f)
        ]


def main():
    """Print the tract-level 'concentrated costs / dispersed benefits' report.

    Loads DC-hosting tracts and their cable distances from the database
    opened by ``connect()``, loads the comparison universe from ``CSV_PATH``,
    and writes four report sections plus a bottom-line summary to stdout.
    Statistics that cannot be computed (no usable values) are printed as
    'n/a'.  If the DC table reports no data centers, the report stops after
    the header counts.
    """
    # Always release the cursor and connection, even if a query fails.
    conn = connect()
    try:
        cur = conn.cursor()
        try:
            # DC-hosting tracts (the cost-bearing universe)
            dc_tracts = _fetch_dc_tracts(cur)
            # Distance from each DC tract to nearest cable (km)
            dist_by_geoid = _fetch_cable_distances_km(cur)
        finally:
            cur.close()
    finally:
        conn.close()

    for t in dc_tracts:
        t["dist_km"] = dist_by_geoid.get(t["geoid"])

    # Comparison universe from the wider ACS CSV
    universe = _read_acs_universe(CSV_PATH)

    dc_geoids = {t["geoid"] for t in dc_tracts}
    non_dc = [u for u in universe if u["geoid"] not in dc_geoids]

    # Restrict comparison to states actually represented in the DC sample
    dc_states = {t["statefp"] for t in dc_tracts}
    universe_in_dc_states = [u for u in universe if u["statefp"] in dc_states]
    non_dc_in_dc_states = [u for u in non_dc if u["statefp"] in dc_states]

    # ============== report ==============
    print("=" * 72)
    print("TRACT-LEVEL CONCENTRATED COSTS / DISPERSED BENEFITS ANALYSIS")
    print("=" * 72)

    total_dc = sum(t["dc_count"] for t in dc_tracts)
    print(f"\nDC-hosting tracts: {len(dc_tracts):,}")
    print(f"Data centers in those tracts: {total_dc:,}")
    print(f"ACS universe (selected states): {len(universe):,} tracts")
    print(f"States represented in DC sample: {len(dc_states)}")
    print(f"Universe restricted to DC states: {len(universe_in_dc_states):,} tracts")

    if not total_dc:
        # Every share below divides by total_dc; with no data centers there
        # is nothing to analyse.
        print("\nNo data centers found in the DC tract table; nothing to analyze.")
        return

    # --- Cost concentration at the tract level ---
    print("\n" + "-" * 72)
    print("1. COST CONCENTRATION (DCs across tracts)")
    print("-" * 72)
    counts = [t["dc_count"] for t in dc_tracts]
    shares = [c / total_dc for c in counts]
    g_dc = gini(counts)
    h_dc = hhi(shares)
    print(f"Gini of DC counts across DC-hosting tracts: {_cell(g_dc, '.3f')}")
    print(f"HHI of DC shares across DC-hosting tracts: {h_dc:.4f}")

    # Top 1% / 5% / 20% of tracts share; each bucket holds at least one tract.
    top1 = max(1, len(counts) // 100)
    top5 = max(1, len(counts) // 20)
    top20 = max(1, len(counts) // 5)
    s = sorted(counts, reverse=True)
    top1_pct = sum(s[:top1]) / total_dc * 100
    top5_pct = sum(s[:top5]) / total_dc * 100
    top20_pct = sum(s[:top20]) / total_dc * 100
    print(f"Top 1% of DC-hosting tracts ({top1:>3} tracts) hold "
          f"{top1_pct:5.1f}% of all DCs")
    print(f"Top 5% of DC-hosting tracts ({top5:>3} tracts) hold "
          f"{top5_pct:5.1f}% of all DCs")
    print(f"Top 20% of DC-hosting tracts ({top20:>3} tracts) hold "
          f"{top20_pct:5.1f}% of all DCs")

    # How small a fraction of population lives in a DC tract?
    pop_dc = sum(t["pop"] or 0 for t in dc_tracts)
    pop_universe = sum(u["pop"] or 0 for u in universe_in_dc_states)
    pop_share = pop_dc / pop_universe * 100 if pop_universe else None
    print(f"\nPopulation living in a DC-hosting tract: {pop_dc:>11,}")
    print(f"Total population (DC-states ACS universe): {pop_universe:>11,}")
    if pop_share is not None:
        print(f" -> {pop_share:.2f}% of people in DC-hosting states "
              f"live in a DC-hosting tract")
    # Per-capita DC density
    if pop_dc:
        print(f" -> 1 DC per {pop_dc/total_dc:,.0f} residents in DC-hosting tracts")
    if pop_universe:
        print(f" vs. 1 DC per {pop_universe/total_dc:,.0f} residents "
              f"averaged across DC-state population")

    # --- Profile of cost-bearing communities ---
    print("\n" + "-" * 72)
    print("2. WHO BEARS THE COSTS? (ACS profile of DC tracts vs. peer tracts)")
    print("-" * 72)
    fields = [
        ("Median household income ($)", "mhi", "{:>10,.0f}"),
        ("Per-capita income ($)", "pci", "{:>10,.0f}"),
        ("Broadband subscription (%)", "broadband_pct", "{:>10,.1f}"),
        ("Poverty rate (%)", "poverty", "{:>10,.1f}"),
        ("Non-Hispanic White (%)", "white_pct", "{:>10,.1f}"),
        ("Non-Hispanic Black (%)", "black_pct", "{:>10,.1f}"),
        ("Hispanic/Latino (%)", "hisp_pct", "{:>10,.1f}"),
        ("Non-Hispanic Asian (%)", "asian_pct", "{:>10,.1f}"),
    ]
    label_w = max(len(lbl) for lbl, *_ in fields)
    print(f"{'Field':<{label_w}} {'DC tracts':>12} {'Non-DC peers':>14} "
          f"{'Δ (DC − peer)':>15}")
    for label, key, fmt in fields:
        dc_med = median([t[key] for t in dc_tracts])
        peer_med = median([u[key] for u in non_dc_in_dc_states])
        if dc_med is None or peer_med is None:
            continue
        delta = dc_med - peer_med
        cell_dc = fmt.format(dc_med)
        cell_pe = fmt.format(peer_med)
        cell_dl = fmt.format(delta)
        print(f"{label:<{label_w}} {cell_dc} {cell_pe} {cell_dl}")

    print("\nPopulation-weighted means (DC tracts):")
    pops = [t["pop"] for t in dc_tracts]
    for label, key, _ in fields:
        wm = wmean([t[key] for t in dc_tracts], pops)
        if wm is not None:
            print(f" {label:<{label_w}} {wm:>12,.1f}")

    print("\nPrimary-industry mix of DC-hosting tracts (count of tracts):")
    for industry, n in Counter(t["primary_industry"] for t in dc_tracts).most_common(10):
        print(f" {n:>4} {industry}")

    # --- Cable vs. inland subgroups ---
    print("\n" + "-" * 72)
    print("3. CABLE-ADJACENT vs. INLAND DC TRACTS")
    print("-" * 72)
    with_dist = [t for t in dc_tracts if t.get("dist_km") is not None]
    near = [t for t in with_dist if t["dist_km"] <= 100]
    far = [t for t in with_dist if t["dist_km"] > 100]
    print(f"≤100 km from a submarine cable: {len(near):>3} tracts, "
          f"{sum(t['dc_count'] for t in near):>4} DCs")
    print(f">100 km from a submarine cable: {len(far):>3} tracts, "
          f"{sum(t['dc_count'] for t in far):>4} DCs")
    if near and far:
        # A subgroup may have no non-NULL values for a field; _cell prints
        # 'n/a' instead of failing on a None median.
        subgroup_rows = [
            (" Median MHI", "mhi", ">10,.0f"),
            (" Median broadband %", "broadband_pct", ">10,.1f"),
            (" Median DC count", "dc_count", ">10,.0f"),
        ]
        for row_label, key, spec in subgroup_rows:
            near_cell = _cell(median([t[key] for t in near]), spec, 10)
            far_cell = _cell(median([t[key] for t in far]), spec, 10)
            print(f"{row_label:<28} near={near_cell} "
                  f"far={far_cell}")

    # --- Benefit-side proxy ---
    print("\n" + "-" * 72)
    print("4. BENEFIT DISPERSION (broadband subscribers across all tracts)")
    print("-" * 72)
    # Households are not in the CSV, so subscribers are approximated as
    # (population / 2.5) * broadband_pct / 100.
    subs = [
        (u["pop"] / 2.5) * u["broadband_pct"] / 100.0
        for u in universe_in_dc_states
        if u["pop"] and u["broadband_pct"] is not None
    ]
    total_subs = sum(subs)
    sg = gini(subs)
    sh = hhi([x / total_subs for x in subs]) if total_subs else None
    ratio = h_dc / sh if sh else None
    sg_txt = _cell(sg, ".3f")
    sh_txt = _cell(sh, ".5f")
    ratio_txt = _cell(ratio, ".0f")
    print(f"Estimated total broadband subscribers (DC states): {total_subs:>14,.0f}")
    print(f"Gini of subscribers across {len(subs):,} tracts: {sg_txt}")
    print(f"HHI of subscribers across tracts: {sh_txt}")
    # Compare to DC HHI
    print("\nSide-by-side concentration (lower = more dispersed):")
    print(f" HHI of DCs across DC-hosting tracts: {h_dc:.4f}")
    print(f" HHI of broadband subs across DC-state tracts: {sh_txt} "
          f"({ratio_txt}x more concentrated for DCs)")

    print("\n" + "=" * 72)
    print("BOTTOM LINE")
    print("=" * 72)
    dc_mhi = median([t["mhi"] for t in dc_tracts])
    peer_mhi = median([u["mhi"] for u in non_dc_in_dc_states])
    if dc_mhi is None or peer_mhi is None:
        income_skew = "n/a"
    else:
        income_skew = "wealthier" if dc_mhi > peer_mhi else "poorer"
    dc_bb = median([t["broadband_pct"] for t in dc_tracts])
    peer_bb = median([u["broadband_pct"] for u in non_dc_in_dc_states])
    if dc_bb is None or peer_bb is None:
        broadband_level = "n/a"
    else:
        broadband_level = "higher" if dc_bb > peer_bb else "lower"
    pop_share_txt = _cell(pop_share, ".2f")
    print(f"""
- DCs are extremely concentrated at the tract level: top 1% of host tracts
hold {top1_pct:.0f}% of all DCs; top 5% hold {top5_pct:.0f}%.
- Only {pop_share_txt}% of residents of the DC-hosting states actually
live in a DC-hosting tract — costs (land use, power draw, water, traffic,
noise) fall on a tiny minority of communities.
- DC-hosting tracts skew {income_skew} and {broadband_level} broadband than peer
tracts. See deltas above for the demographic profile.
- Broadband subscribers (proxy for who consumes cloud services) are far more
evenly distributed than DCs — HHI roughly {ratio_txt}× lower.
That asymmetry IS the classic concentrated-cost / dispersed-benefit shape.
""")
|
||||
|
||||
|
||||
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user