Move all Python scripts to scripts/, documentation to docs/, raw input data to data/, and generated HTML/CSV outputs to output/. Update path references in 8 scripts to use Path(__file__).parent.parent as project root so they work correctly from the new location. Update README links and quick-start commands accordingly. Notebooks remain at root. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
237 lines
8.0 KiB
Python
237 lines
8.0 KiB
Python
#!/usr/bin/env python3
|
|
"""Quick statistical analysis: are US data centers spatially tied to submarine
|
|
cables, and does the resulting pattern look like concentrated costs / dispersed
|
|
benefits?
|
|
"""
|
|
import math
|
|
import os
|
|
import statistics
|
|
from collections import Counter
|
|
|
|
import psycopg2
|
|
|
|
|
|
def connect():
    """Open a psycopg2 connection to the ``data_centers`` database.

    Host, port, user and password are taken from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables. A missing variable raises ``KeyError``.
    """
    env = os.environ
    # Looked up in the same order as before so the first missing variable
    # is the one reported.
    credentials = dict(
        host=env["PGWEB_HOST"],
        port=env["PGWEB_PORT"],
        user=env["PGWEB_USER"],
        password=env["PGWEB_PASSWORD"],
    )
    return psycopg2.connect(dbname="data_centers", **credentials)
|
|
|
|
|
|
def quantiles(xs, qs=(0.1, 0.25, 0.5, 0.75, 0.9)):
    """Return a ``{q: value}`` mapping of quantiles of *xs*.

    The values are sorted and each quantile ``q`` is located at the
    fractional position ``(n - 1) * q``. When that position falls between
    two elements the result is linearly interpolated between them. For
    empty input every requested quantile maps to ``None``.
    """
    ordered = sorted(xs)
    count = len(ordered)
    if count == 0:
        return {q: None for q in qs}

    result = {}
    for q in qs:
        pos = (count - 1) * q
        below = math.floor(pos)
        above = math.ceil(pos)
        if below == above:
            # Position lands exactly on an element: no interpolation needed.
            result[q] = ordered[int(pos)]
            continue
        span = ordered[above] - ordered[below]
        result[q] = ordered[below] + span * (pos - below)
    return result
|
|
|
|
|
|
def gini(values):
    """Compute the Gini coefficient of *values*.

    ``None`` entries and negative numbers are discarded before the
    calculation. Returns ``None`` when nothing remains or when the
    remaining values sum to zero; otherwise returns the coefficient
    computed from the rank-weighted sum of the sorted values.
    """
    kept = [x for x in values if x is not None and x >= 0]
    kept.sort()
    count = len(kept)
    total = sum(kept)
    if count == 0 or total == 0:
        return None

    # Rank-weighted sum: smallest value gets rank 1, largest gets rank n.
    weighted = 0.0
    for rank, value in enumerate(kept, start=1):
        weighted += rank * value
    return (2 * weighted) / (count * total) - (count + 1) / count
|
|
|
|
|
|
def hhi(shares):
    """Return the Herfindahl-Hirschman Index: the sum of squared shares.

    The shares are expected to sum to roughly 1, in which case the result
    lies in [0, 1]. No normalisation is performed here.
    """
    squared = [share * share for share in shares]
    return sum(squared)
|
|
|
|
|
|
def mann_whitney_u_z(xs, ys):
    """Approximate Mann-Whitney U test using the large-sample normal approximation.

    Args:
        xs: First sample (a sized sequence of mutually comparable numbers).
        ys: Second sample (same requirements as *xs*).

    Returns:
        ``(U, z, p_two_sided)`` where ``U`` is the U statistic of *xs*.
        Tied values receive the average of the ranks they span, and the
        variance of U is reduced by the standard tie correction so that
        ``z`` is not understated on heavily tied data. No continuity
        correction is applied. When the variance is zero (an empty sample,
        or every value identical) ``z`` is 0.0 and ``p`` is 1.0.
    """
    combined = [(v, 0) for v in xs] + [(v, 1) for v in ys]
    combined.sort(key=lambda t: t[0])
    n = len(combined)
    ranks = [0.0] * n
    tie_term = 0  # sum of (t**3 - t) over every group of t equal values

    i = 0
    while i < n:
        # Extend j to the last element equal to combined[i].
        j = i
        while j + 1 < n and combined[j + 1][0] == combined[i][0]:
            j += 1
        group_size = j - i + 1
        # Positions i..j are 0-based; ranks are 1-based, hence the +1.
        ranks[i:j + 1] = [(i + j) / 2 + 1] * group_size
        tie_term += group_size ** 3 - group_size
        i = j + 1

    n1, n2 = len(xs), len(ys)
    r1 = sum(r for r, (_, g) in zip(ranks, combined) if g == 0)
    U1 = r1 - n1 * (n1 + 1) / 2
    mu = n1 * n2 / 2

    # Tie-corrected variance of U; without ties tie_term is 0 and this
    # reduces to n1 * n2 * (n + 1) / 12.
    tie_adjust = tie_term / (n * (n - 1)) if n > 1 else 0.0
    variance = n1 * n2 * ((n + 1) - tie_adjust) / 12
    sigma = math.sqrt(max(variance, 0.0))  # clamp float round-off below zero

    z = (U1 - mu) / sigma if sigma else 0.0
    # Two-sided p-value of a standard normal via the complementary error function.
    p = math.erfc(abs(z) / math.sqrt(2))
    return U1, z, p
|
|
|
|
|
|
def _distances_km(cur, sql):
    """Execute *sql* on *cur* and return its first column as a list of floats.

    Rows whose first column is NULL are skipped: ST_Distance yields NULL when
    either geometry is NULL (e.g. missing coordinates or an empty cable table).
    """
    cur.execute(sql)
    return [float(row[0]) for row in cur.fetchall() if row[0] is not None]


def _fmt(value, spec):
    """Format *value* with format spec *spec*, or return ``"n/a"`` for ``None``."""
    return "n/a" if value is None else format(value, spec)


def _print_distance_summary(label, xs):
    """Print mean, quantiles and within-threshold shares for distances *xs* (km)."""
    print(f"\n{label}:")
    if not xs:
        # statistics.mean and the share computation below are undefined here.
        print(" (no rows returned)")
        return
    q = quantiles(xs)
    print(f" mean = {statistics.mean(xs):,.1f} km")
    print(f" median (p50) = {q[0.5]:,.1f} km")
    print(f" p10 / p25 / p75 / p90 = "
          f"{q[0.1]:,.1f} / {q[0.25]:,.1f} / {q[0.75]:,.1f} / {q[0.9]:,.1f} km")
    for thr in (10, 50, 100, 250):
        frac = sum(1 for x in xs if x <= thr) / len(xs)
        print(f" share within {thr:>3} km of a cable: {frac*100:5.1f}%")


def main():
    """Query the database and print the cable-proximity / concentration report.

    Runs four queries (data-center and city distances to the nearest
    submarine cable, data-center counts per state, IP counts per US city),
    closes the connection, then prints distance summaries, a Mann-Whitney
    comparison, and Gini/HHI concentration figures. Empty result sets and
    undefined statistics are reported as such instead of raising.
    """
    conn = connect()
    try:
        # The cursor context manager closes the cursor; the connection is
        # closed in ``finally`` so a failing query does not leak it.
        with conn.cursor() as cur:
            # --- 1. Distance from each US data center to nearest submarine cable ---
            dc_km = _distances_km(
                cur,
                """
                with cables_union as (
                    select ST_Union(geom)::geography as g from public.internet_cables
                )
                select ST_Distance(
                    ST_SetSRID(ST_MakePoint(dc.longitude, dc.latitude), 4326)::geography,
                    cu.g
                ) / 1000.0 -- meters -> km
                from public.us_dc_sample_geocoded dc, cables_union cu
                where dc.longitude is not null and dc.latitude is not null
                  and (dc.country = 'United States' or dc.country is null)
                """,
            )

            # --- 2. Distance from US city-dominance points to nearest cable ---
            # The filter is on geom while the point is built from
            # longitude/latitude, so NULL distances are dropped in Python.
            city_km = _distances_km(
                cur,
                """
                with cables_union as (
                    select ST_Union(geom)::geography as g from public.internet_cables
                )
                select ST_Distance(
                    ST_SetSRID(ST_MakePoint(c.longitude, c.latitude), 4326)::geography,
                    cu.g
                ) / 1000.0
                from public.internet_city_dominance c, cables_union cu
                where c.country = 'US' and c.geom is not null
                """,
            )

            # --- 3. DC distribution by state (cost concentration) ---
            # NOTE(review): unlike query 1 this does not filter on country, so
            # the state totals may not match "N data centers analyzed" - confirm
            # whether that is intended.
            cur.execute(
                """
                select coalesce(nullif(state_code, ''), 'UNK') as st, count(*)
                from public.us_dc_sample_geocoded
                where longitude is not null and latitude is not null
                group by 1
                """
            )
            state_counts = dict(cur.fetchall())

            # --- 4. IP distribution across US cities (benefit dispersion proxy) ---
            cur.execute(
                """
                select city, coalesce(logical_dominance_ips, 0)
                from public.internet_city_dominance
                where country = 'US' and logical_dominance_ips is not null
                """
            )
            city_ips = [(r[0], int(r[1])) for r in cur.fetchall()]
    finally:
        conn.close()

    total_dc = sum(state_counts.values())
    state_shares = {k: v / total_dc for k, v in state_counts.items()}

    total_ips = sum(v for _, v in city_ips)
    ip_shares = [v / total_ips for _, v in city_ips] if total_ips else []

    # --- 5. Where do the people-with-IPs LIVE relative to the DCs? ---
    # Top-N US dominance cities, share of national IPs each captures.
    top_ip_cities = sorted(city_ips, key=lambda t: -t[1])[:10]

    # ======= report =======
    print("=" * 70)
    print("ARE US DATA CENTERS TIED TO SUBMARINE CABLE LANDINGS?")
    print("=" * 70)
    print(f"\nN data centers analyzed: {len(dc_km):,}")
    print(f"N US city-dominance pts: {len(city_km):,}")

    _print_distance_summary("Distance: US DATA CENTERS -> nearest submarine cable", dc_km)
    _print_distance_summary("Distance: US POPULATION CITIES -> nearest submarine cable", city_km)

    if dc_km and city_km:
        U, z, p = mann_whitney_u_z(dc_km, city_km)
        print(f"\nMann-Whitney U (DCs vs. cities, two-sided): U={U:,.0f}, z={z:.2f}, "
              f"p≈{p:.2e}")
        dc_median = statistics.median(dc_km)
        city_median = statistics.median(city_km)
        if dc_median < city_median:
            diff = city_median - dc_median
            print(f" -> DC median is {diff:,.1f} km CLOSER to cables than city median.")
        else:
            print(" -> DCs are not closer to cables than cities.")
    else:
        # The test and the medians are undefined without both samples.
        print("\nMann-Whitney U (DCs vs. cities, two-sided): skipped (empty sample).")

    print("\n" + "=" * 70)
    print("CONCENTRATION OF COSTS (data centers by state)")
    print("=" * 70)
    g_dc = gini(list(state_counts.values()))
    h_dc = hhi(list(state_shares.values()))
    # gini() returns None for empty / all-zero input; render that as "n/a".
    g_dc_txt = _fmt(g_dc, ".3f")
    print(f"States covered: {len(state_counts)}")
    print(f"Gini of DC counts across states: {g_dc_txt} (0=even, 1=one state takes all)")
    print(f"HHI of state shares: {h_dc:.3f} (0.18+ = highly concentrated)")
    top_states = sorted(state_shares.items(), key=lambda t: -t[1])[:8]
    cum = 0.0
    print("\nTop states by share of US data centers:")
    for st, s in top_states:
        cum += s
        print(f" {st}: {s*100:5.1f}% ({state_counts[st]:>4} DCs) cum={cum*100:5.1f}%")

    print("\n" + "=" * 70)
    print("DISPERSION OF BENEFITS (US IPs across cities)")
    print("=" * 70)
    g_ip = gini([v for _, v in city_ips])
    h_ip = hhi(ip_shares)
    g_ip_txt = _fmt(g_ip, ".3f")
    print(f"US cities with IP data: {len(city_ips):,}")
    print(f"Gini of IPs across cities: {g_ip_txt}")
    print(f"HHI of IP shares: {h_ip:.3f}")
    cum = 0.0
    print("\nTop US cities by share of national IPs:")
    for city, ips in top_ip_cities:
        # All-zero IP counts would otherwise divide by zero.
        s = ips / total_ips if total_ips else 0.0
        cum += s
        city_label = city if city is not None else "UNK"
        print(f" {city_label:<30} {s*100:5.2f}% ({ips:>11,} IPs) cum={cum*100:5.2f}%")

    print("\n" + "=" * 70)
    print("INTERPRETATION")
    print("=" * 70)
    # Gini is 0 for a perfectly even distribution, so "benefits more evenly
    # distributed than costs" means the IP Gini is the SMALLER of the two.
    print(f"""
Cost concentration (DCs across states): Gini={g_dc_txt} HHI={h_dc:.3f}
Benefit dispersion (IPs across cities): Gini={g_ip_txt} HHI={h_ip:.3f}

A "concentrated costs / dispersed benefits" pattern requires:
(1) DCs cluster in a few places (high state-level Gini/HHI).
(2) Users they serve span many places (low city-level Gini/HHI, ideally).
(3) That clustering is plausibly tied to fixed infrastructure (cables).

Check signs above:
- DC location vs cable proximity: see Mann-Whitney result.
- Cost concentration: compare DC HHI to a "competitive" benchmark (~0.10).
- Benefit dispersion: IP-share Gini << DC-state Gini would corroborate
the asymmetry (benefits more evenly distributed than costs).
""")
|
|
|
|
|
|
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|