#!/usr/bin/env python3
"""Quick statistical analysis: are US data centers spatially tied to
submarine cables, and does the resulting pattern look like
concentrated costs / dispersed benefits?
"""
import math
import os
import statistics
from collections import Counter

try:
    import psycopg2
except ImportError:
    # The statistics helpers below are pure Python; only connect() needs the
    # driver, so a missing driver is reported there instead of at import time.
    psycopg2 = None


# Distance (km) from each geocoded US data center to the union of all cables.
_DC_DISTANCE_SQL = """
    with cables_union as (
        select ST_Union(geom)::geography as g
        from public.internet_cables
    )
    select ST_Distance(
        ST_SetSRID(ST_MakePoint(dc.longitude, dc.latitude), 4326)::geography,
        cu.g
    ) / 1000.0  -- meters -> km
    from public.us_dc_sample_geocoded dc, cables_union cu
    where dc.longitude is not null
      and dc.latitude is not null
      and (dc.country = 'United States' or dc.country is null)
"""

# Distance (km) from each US city-dominance point to the union of all cables.
_CITY_DISTANCE_SQL = """
    with cables_union as (
        select ST_Union(geom)::geography as g
        from public.internet_cables
    )
    select ST_Distance(
        ST_SetSRID(ST_MakePoint(c.longitude, c.latitude), 4326)::geography,
        cu.g
    ) / 1000.0
    from public.internet_city_dominance c, cables_union cu
    where c.country = 'US'
      and c.geom is not null
"""

# Data-center counts per state; blank/NULL state codes are bucketed as 'UNK'.
# NOTE(review): unlike _DC_DISTANCE_SQL this query has no country filter --
# confirm that us_dc_sample_geocoded only holds US rows.
_STATE_COUNTS_SQL = """
    select coalesce(nullif(state_code, ''), 'UNK') as st, count(*)
    from public.us_dc_sample_geocoded
    where longitude is not null
      and latitude is not null
    group by 1
"""

# IP counts per US city (benefit dispersion proxy).
_CITY_IPS_SQL = """
    select city, coalesce(logical_dominance_ips, 0)
    from public.internet_city_dominance
    where country = 'US'
      and logical_dominance_ips is not null
"""


def connect():
    """Open a connection to the ``data_centers`` PostgreSQL database.

    Host, port, user and password are read from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables.

    Raises:
        ImportError: if psycopg2 is not installed.
        KeyError: if one of the environment variables is not set.
    """
    if psycopg2 is None:
        raise ImportError(
            "psycopg2 is required to query the data_centers database"
        )
    return psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname="data_centers",
    )


def quantiles(xs, qs=(0.1, 0.25, 0.5, 0.75, 0.9)):
    """Return ``{q: value}`` for each requested quantile of *xs*.

    Values are linearly interpolated between the two neighbouring order
    statistics. Every quantile maps to ``None`` when *xs* is empty.
    """
    s = sorted(xs)
    n = len(s)
    if n == 0:
        return {q: None for q in qs}
    out = {}
    for q in qs:
        k = (n - 1) * q  # fractional index into the sorted sample
        lo, hi = math.floor(k), math.ceil(k)
        if lo == hi:
            out[q] = s[lo]
        else:
            out[q] = s[lo] + (s[hi] - s[lo]) * (k - lo)
    return out


def gini(values):
    """Standard Gini coefficient for non-negative values.

    ``None`` entries and negative entries are ignored. Returns ``None`` when
    nothing is left or when the remaining values sum to zero.
    """
    v = sorted(x for x in values if x is not None and x >= 0)
    n = len(v)
    total = sum(v)
    if n == 0 or total == 0:
        return None
    weighted = sum(i * x for i, x in enumerate(v, 1))  # rank-weighted sum
    return (2 * weighted) / (n * total) - (n + 1) / n


def hhi(shares):
    """Herfindahl-Hirschman Index on shares that sum to ~1. Returns value in [0, 1]."""
    return sum(s * s for s in shares)


def mann_whitney_u_z(xs, ys):
    """Approximate Mann-Whitney U test z-score (normal approx, large-n).

    Returns ``(U, z, p_two_sided)`` where ``U`` is the statistic for *xs*.
    Tied observations receive their average rank, and the variance of ``U``
    carries the matching tie correction so that ``z`` is not understated when
    ties are present. No continuity correction is applied.

    If either sample is empty, or all observations are tied, the test is
    degenerate and ``z = 0.0``, ``p = 1.0`` are returned.
    """
    n1, n2 = len(xs), len(ys)
    if n1 == 0 or n2 == 0:
        return 0.0, 0.0, 1.0

    combined = [(v, 0) for v in xs] + [(v, 1) for v in ys]
    combined.sort(key=lambda t: t[0])
    n = n1 + n2

    rank_sum_x = 0.0
    tie_term = 0  # sum of (t**3 - t) over groups of t tied observations
    i = 0
    while i < n:
        j = i
        while j + 1 < n and combined[j + 1][0] == combined[i][0]:
            j += 1
        avg_rank = (i + j) / 2 + 1  # ranks are 1-based
        group_size = j - i + 1
        tie_term += group_size ** 3 - group_size
        x_in_group = sum(1 for k in range(i, j + 1) if combined[k][1] == 0)
        rank_sum_x += avg_rank * x_in_group
        i = j + 1

    u1 = rank_sum_x - n1 * (n1 + 1) / 2
    mu = n1 * n2 / 2
    variance = n1 * n2 / 12 * ((n + 1) - tie_term / (n * (n - 1)))
    sigma = math.sqrt(max(variance, 0.0))  # clamp float noise below zero
    z = (u1 - mu) / sigma if sigma else 0.0
    # Two-sided p via the complementary error function.
    p = math.erfc(abs(z) / math.sqrt(2))
    return u1, z, p


def _fetch_float_column(cur, sql):
    """Run *sql* and return its first column as floats, skipping NULLs."""
    cur.execute(sql)
    return [float(r[0]) for r in cur.fetchall() if r[0] is not None]


def _load_data(cur):
    """Run the four queries and return (dc_km, city_km, state_counts, city_ips)."""
    dc_km = _fetch_float_column(cur, _DC_DISTANCE_SQL)
    city_km = _fetch_float_column(cur, _CITY_DISTANCE_SQL)

    cur.execute(_STATE_COUNTS_SQL)
    state_counts = dict(cur.fetchall())

    cur.execute(_CITY_IPS_SQL)
    city_ips = [(r[0], int(r[1])) for r in cur.fetchall()]
    return dc_km, city_km, state_counts, city_ips


def _banner(title, leading_blank=True):
    """Print a section title framed by two 70-character rules."""
    rule = "=" * 70
    print(("\n" if leading_blank else "") + rule)
    print(title)
    print(rule)


def _fmt3(value):
    """Format a possibly-missing statistic with three decimals."""
    return "n/a" if value is None else f"{value:.3f}"


def _print_distance_summary(label, xs):
    """Print mean, quantiles and within-threshold shares for distances *xs* (km)."""
    print(f"\n{label}:")
    if not xs:
        print(" (no rows returned)")
        return
    q = quantiles(xs)
    print(f" mean = {statistics.mean(xs):,.1f} km")
    print(f" median (p50) = {q[0.5]:,.1f} km")
    print(
        f" p10 / p25 / p75 / p90 = "
        f"{q[0.1]:,.1f} / {q[0.25]:,.1f} / {q[0.75]:,.1f} / {q[0.9]:,.1f} km"
    )
    for thr in (10, 50, 100, 250):
        frac = sum(1 for x in xs if x <= thr) / len(xs)
        print(f" share within {thr:>3} km of a cable: {frac*100:5.1f}%")


def _print_report(dc_km, city_km, state_counts, city_ips):
    """Print the full text report for the already-fetched data."""
    total_dc = sum(state_counts.values())
    state_shares = (
        {k: v / total_dc for k, v in state_counts.items()} if total_dc else {}
    )
    total_ips = sum(v for _, v in city_ips)
    ip_shares = [v / total_ips for _, v in city_ips] if total_ips else []
    # Top-N US dominance cities, share of national IPs each captures.
    top_ip_cities = sorted(city_ips, key=lambda t: -t[1])[:10]

    _banner("ARE US DATA CENTERS TIED TO SUBMARINE CABLE LANDINGS?",
            leading_blank=False)
    print(f"\nN data centers analyzed: {len(dc_km):,}")
    print(f"N US city-dominance pts: {len(city_km):,}")

    _print_distance_summary(
        "Distance: US DATA CENTERS -> nearest submarine cable", dc_km)
    _print_distance_summary(
        "Distance: US POPULATION CITIES -> nearest submarine cable", city_km)

    if dc_km and city_km:
        U, z, p = mann_whitney_u_z(dc_km, city_km)
        print(f"\nMann-Whitney U (DCs vs. cities, two-sided): U={U:,.0f}, z={z:.2f}, "
              f"p≈{p:.2e}")
        dc_median = statistics.median(dc_km)
        city_median = statistics.median(city_km)
        if dc_median < city_median:
            diff = city_median - dc_median
            print(f" -> DC median is {diff:,.1f} km CLOSER to cables than city median.")
        else:
            print(" -> DCs are not closer to cables than cities.")
    else:
        print("\nMann-Whitney U (DCs vs. cities, two-sided): skipped, "
              "at least one sample is empty.")

    _banner("CONCENTRATION OF COSTS (data centers by state)")
    g_dc = _fmt3(gini(list(state_counts.values())))
    h_dc = _fmt3(hhi(state_shares.values()))
    print(f"States covered: {len(state_counts)}")
    print(f"Gini of DC counts across states: {g_dc} (0=even, 1=one state takes all)")
    print(f"HHI of state shares: {h_dc} (0.18+ = highly concentrated)")
    top_states = sorted(state_shares.items(), key=lambda t: -t[1])[:8]
    cum = 0.0
    print("\nTop states by share of US data centers:")
    for st, s in top_states:
        cum += s
        print(f" {st}: {s*100:5.1f}% ({state_counts[st]:>4} DCs) cum={cum*100:5.1f}%")

    _banner("DISPERSION OF BENEFITS (US IPs across cities)")
    g_ip = _fmt3(gini([v for _, v in city_ips]))
    h_ip = _fmt3(hhi(ip_shares))
    print(f"US cities with IP data: {len(city_ips):,}")
    print(f"Gini of IPs across cities: {g_ip}")
    print(f"HHI of IP shares: {h_ip}")
    cum = 0.0
    print("\nTop US cities by share of national IPs:")
    for city, ips in top_ip_cities:
        s = ips / total_ips if total_ips else 0.0
        cum += s
        name = city if city is not None else "UNK"
        print(f" {name:<30} {s*100:5.2f}% ({ips:>11,} IPs) cum={cum*100:5.2f}%")

    _banner("INTERPRETATION")
    # A lower Gini means a more even distribution, so "benefits more evenly
    # distributed than costs" corresponds to IP Gini << DC Gini.
    print(f"""
Cost concentration (DCs across states): Gini={g_dc} HHI={h_dc}
Benefit dispersion (IPs across cities): Gini={g_ip} HHI={h_ip}

A "concentrated costs / dispersed benefits" pattern requires:
  (1) DCs cluster in a few places (high state-level Gini/HHI).
  (2) Users they serve span many places (low city-level Gini/HHI, ideally).
  (3) That clustering is plausibly tied to fixed infrastructure (cables).

Check signs above:
  - DC location vs cable proximity: see Mann-Whitney result.
  - Cost concentration: compare DC HHI to a "competitive" benchmark (~0.10).
  - Benefit dispersion: IP-share Gini << DC-state Gini would corroborate
    the asymmetry (benefits more evenly distributed than costs).
""")


def main():
    """Query the database, then print the cable-proximity / concentration report.

    The cursor and connection are closed even if a query fails; the report is
    printed only after all data has been fetched.
    """
    conn = connect()
    try:
        cur = conn.cursor()
        try:
            dc_km, city_km, state_counts, city_ips = _load_data(cur)
        finally:
            cur.close()
    finally:
        conn.close()

    _print_report(dc_km, city_km, state_counts, city_ips)


if __name__ == "__main__":
    main()