# data-centers/scripts/analyze_cables_concentration.py

#!/usr/bin/env python3
"""Quick statistical analysis: are US data centers spatially tied to submarine
cables, and does the resulting pattern look like concentrated costs / dispersed
benefits?
"""
import math
import os
import statistics
from collections import Counter

import psycopg2


def connect():
    """Open a psycopg2 connection to the ``data_centers`` database.

    Host, port, user and password are taken from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables; a missing variable raises ``KeyError``.
    """
    env = os.environ
    params = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": "data_centers",
    }
    return psycopg2.connect(**params)


def quantiles(xs, qs=(0.1, 0.25, 0.5, 0.75, 0.9)):
    """Return ``{q: value}`` for each requested quantile of *xs*.

    The data is sorted and each quantile is read at fractional position
    ``(n - 1) * q``, interpolating linearly between the two neighbouring
    order statistics when that position is not an integer.  For empty
    input every quantile maps to ``None``.
    """
    ordered = sorted(xs)
    count = len(ordered)
    if count == 0:
        return {q: None for q in qs}

    result = {}
    for q in qs:
        pos = (count - 1) * q
        below = math.floor(pos)
        above = math.ceil(pos)
        if below == above:
            # Position lands exactly on an element: no interpolation needed.
            result[q] = ordered[int(pos)]
            continue
        span = ordered[above] - ordered[below]
        result[q] = ordered[below] + span * (pos - below)
    return result


def gini(values):
    """Gini coefficient of the non-negative entries of *values*.

    ``None`` and negative entries are dropped before computing.  Returns
    ``None`` when nothing usable remains or when the usable values sum to
    zero; otherwise returns
    ``2 * sum(i * x_i) / (n * sum(x)) - (n + 1) / n`` with ``x`` sorted
    ascending and ``i`` counted from 1.
    """
    usable = [x for x in values if x is not None and x >= 0]
    usable.sort()
    count = len(usable)
    total = sum(usable)
    if count == 0 or total == 0:
        return None

    # Rank-weighted sum over the ascending values (ranks start at 1).
    weighted = 0.0
    for rank, x in enumerate(usable, start=1):
        weighted += rank * x
    return (2 * weighted) / (count * total) - (count + 1) / count


def hhi(shares):
    """Herfindahl-Hirschman Index: the sum of squared shares.

    The result lies in [0, 1] when *shares* are non-negative fractions
    summing to about 1; an empty input yields 0.
    """
    squared = (share * share for share in shares)
    return sum(squared)


def mann_whitney_u_z(xs, ys):
    """Approximate two-sided Mann-Whitney U test (normal approximation).

    Tied observations receive the average of the ranks they span, and the
    variance of U carries the matching tie correction::

        var(U) = n1 * n2 / 12 * ((N + 1) - sum(t**3 - t) / (N * (N - 1)))

    where ``N = n1 + n2`` and ``t`` runs over the sizes of the tie groups.
    Without ties this reduces to the usual ``n1 * n2 * (N + 1) / 12``.
    No continuity correction is applied.

    Args:
        xs: First sample (a sized collection of mutually comparable numbers).
        ys: Second sample.

    Returns:
        ``(U, z, p_two_sided)`` where ``U`` is the statistic for *xs*.
        When the variance is zero (an empty sample, or every observation
        tied) ``z`` is ``0.0`` and ``p`` is ``1.0``.
    """
    # Tag each value with its group (0 = xs, 1 = ys) and sort by value only.
    combined = [(v, 0) for v in xs] + [(v, 1) for v in ys]
    combined.sort(key=lambda t: t[0])
    n = len(combined)
    ranks = [0.0] * n
    tie_term = 0  # sum of t**3 - t over tie groups; stays 0 without ties
    i = 0
    while i < n:
        # Extend j to the last index holding the same value as index i.
        j = i
        while j + 1 < n and combined[j + 1][0] == combined[i][0]:
            j += 1
        avg_rank = (i + j) / 2 + 1  # mean of the 1-based ranks i+1 .. j+1
        for k in range(i, j + 1):
            ranks[k] = avg_rank
        group_size = j - i + 1
        tie_term += group_size ** 3 - group_size
        i = j + 1

    r1 = sum(r for r, (_, g) in zip(ranks, combined) if g == 0)
    n1, n2 = len(xs), len(ys)
    U1 = r1 - n1 * (n1 + 1) / 2
    mu = n1 * n2 / 2
    if n > 1:
        variance = n1 * n2 * ((n + 1) - tie_term / (n * (n - 1))) / 12
    else:
        # Fewer than two observations overall: nothing to compare.
        variance = 0.0
    # Clamp guards against a tiny negative value from float rounding.
    sigma = math.sqrt(max(variance, 0.0))
    z = (U1 - mu) / sigma if sigma else 0.0
    # Two-sided p-value of a standard normal via the complementary error function.
    p = math.erfc(abs(z) / math.sqrt(2))
    return U1, z, p


def main():
    """Query the database, compute the statistics, and print the report.

    Reads ``public.internet_cables``, ``public.us_dc_sample_geocoded`` and
    ``public.internet_city_dominance`` and prints four sections:

    1. Distance (km) from data centers and from US city-dominance points to
       the nearest submarine cable, plus a Mann-Whitney comparison.
    2. Concentration of data centers across states (Gini, HHI, top states).
    3. Dispersion of IPs across US cities (Gini, HHI, top cities).
    4. A short interpretation guide.

    The connection and cursor are released even if a query fails, and each
    section degrades to a "no data" / "n/a" message instead of raising when
    its input is empty.
    """
    conn = connect()
    try:
        with conn.cursor() as cur:
            # --- 1. Distance from each US data center to nearest submarine cable ---
            cur.execute(
                """
                with cables_union as (
                    select ST_Union(geom)::geography as g from public.internet_cables
                )
                select ST_Distance(
                    ST_SetSRID(ST_MakePoint(dc.longitude, dc.latitude), 4326)::geography,
                    cu.g
                ) / 1000.0  -- meters -> km
                from public.us_dc_sample_geocoded dc, cables_union cu
                where dc.longitude is not null and dc.latitude is not null
                  and (dc.country = 'United States' or dc.country is null)
                """
            )
            # A NULL distance (e.g. no cable geometry at all) is skipped
            # rather than crashing float().
            dc_km = [float(r[0]) for r in cur.fetchall() if r[0] is not None]

            # --- 2. Distance from US city-dominance points to nearest cable ---
            cur.execute(
                """
                with cables_union as (
                    select ST_Union(geom)::geography as g from public.internet_cables
                )
                select ST_Distance(
                    ST_SetSRID(ST_MakePoint(c.longitude, c.latitude), 4326)::geography,
                    cu.g
                ) / 1000.0
                from public.internet_city_dominance c, cables_union cu
                where c.country = 'US' and c.geom is not null
                """
            )
            # Rows are filtered on geom but measured from longitude/latitude,
            # so a NULL coordinate can still yield a NULL distance here.
            city_km = [float(r[0]) for r in cur.fetchall() if r[0] is not None]

            # --- 3. DC distribution by state (cost concentration) ---
            # Same country filter as query 1 so the state shares describe the
            # same set of data centers as the distance statistics.
            cur.execute(
                """
                select coalesce(nullif(state_code, ''), 'UNK') as st, count(*)
                from public.us_dc_sample_geocoded
                where longitude is not null and latitude is not null
                  and (country = 'United States' or country is null)
                group by 1
                """
            )
            state_counts = dict(cur.fetchall())

            # --- 4. IP distribution across US cities (benefit dispersion proxy) ---
            cur.execute(
                """
                select city, coalesce(logical_dominance_ips, 0)
                from public.internet_city_dominance
                where country = 'US' and logical_dominance_ips is not null
                """
            )
            city_ips = [(r[0], int(r[1])) for r in cur.fetchall()]
    finally:
        conn.close()

    total_dc = sum(state_counts.values())
    state_shares = {k: v / total_dc for k, v in state_counts.items()}

    total_ips = sum(v for _, v in city_ips)
    ip_shares = [v / total_ips for _, v in city_ips] if total_ips else []

    # --- 5. Where do the people-with-IPs LIVE relative to the DCs? ---
    # Top-N US dominance cities, share of national IPs each captures.
    top_ip_cities = sorted(city_ips, key=lambda t: -t[1])[:10]

    def fmt_index(value):
        """Format a Gini/HHI value; gini() returns None for empty or all-zero input."""
        return "n/a" if value is None else f"{value:.3f}"

    def fmt_q(label, xs):
        """Print mean, quantiles and within-threshold shares for distances *xs* (km)."""
        print(f"\n{label}:")
        if not xs:
            print("  (no data)")
            return
        q = quantiles(xs)
        print(f"  mean = {statistics.mean(xs):,.1f} km")
        print(f"  median (p50) = {q[0.5]:,.1f} km")
        print(f"  p10 / p25 / p75 / p90 = "
              f"{q[0.1]:,.1f} / {q[0.25]:,.1f} / {q[0.75]:,.1f} / {q[0.9]:,.1f} km")
        for thr in (10, 50, 100, 250):
            frac = sum(1 for x in xs if x <= thr) / len(xs)
            print(f"  share within {thr:>3} km of a cable: {frac*100:5.1f}%")

    # ======= report =======
    print("=" * 70)
    print("ARE US DATA CENTERS TIED TO SUBMARINE CABLE LANDINGS?")
    print("=" * 70)
    print(f"\nN data centers analyzed: {len(dc_km):,}")
    print(f"N US city-dominance pts: {len(city_km):,}")

    fmt_q("Distance: US DATA CENTERS -> nearest submarine cable", dc_km)
    fmt_q("Distance: US POPULATION CITIES -> nearest submarine cable", city_km)

    if dc_km and city_km:
        u_stat, z, p = mann_whitney_u_z(dc_km, city_km)
        print(f"\nMann-Whitney U (DCs vs. cities, two-sided): U={u_stat:,.0f}, z={z:.2f}, "
              f"p≈{p:.2e}")
        dc_median = statistics.median(dc_km)
        city_median = statistics.median(city_km)
        if dc_median < city_median:
            diff = city_median - dc_median
            print(f"  -> DC median is {diff:,.1f} km CLOSER to cables than city median.")
        else:
            print("  -> DCs are not closer to cables than cities.")
    else:
        print("\nMann-Whitney U (DCs vs. cities, two-sided): skipped (empty sample).")

    print("\n" + "=" * 70)
    print("CONCENTRATION OF COSTS (data centers by state)")
    print("=" * 70)
    g_dc = gini(list(state_counts.values()))
    h_dc = hhi(list(state_shares.values()))
    print(f"States covered: {len(state_counts)}")
    print(f"Gini of DC counts across states: {fmt_index(g_dc)}  (0=even, 1=one state takes all)")
    print(f"HHI of state shares:              {h_dc:.3f}  (0.18+ = highly concentrated)")
    top_states = sorted(state_shares.items(), key=lambda t: -t[1])[:8]
    cum = 0.0
    print("\nTop states by share of US data centers:")
    for st, s in top_states:
        cum += s
        print(f"  {st}: {s*100:5.1f}%  ({state_counts[st]:>4} DCs)  cum={cum*100:5.1f}%")

    print("\n" + "=" * 70)
    print("DISPERSION OF BENEFITS (US IPs across cities)")
    print("=" * 70)
    g_ip = gini([v for _, v in city_ips])
    h_ip = hhi(ip_shares)
    print(f"US cities with IP data: {len(city_ips):,}")
    print(f"Gini of IPs across cities: {fmt_index(g_ip)}")
    print(f"HHI of IP shares:          {h_ip:.3f}")
    cum = 0.0
    print("\nTop US cities by share of national IPs:")
    for city, ips in top_ip_cities:
        # All-zero IP counts would otherwise divide by zero.
        s = ips / total_ips if total_ips else 0.0
        cum += s
        print(f"  {city:<30}  {s*100:5.2f}%  ({ips:>11,} IPs)  cum={cum*100:5.2f}%")

    print("\n" + "=" * 70)
    print("INTERPRETATION")
    print("=" * 70)
    # A lower Gini means a more even distribution, so "dispersed benefits"
    # shows up as the IP Gini being well BELOW the DC-state Gini.
    print(f"""
Cost concentration (DCs across states):   Gini={fmt_index(g_dc)}  HHI={h_dc:.3f}
Benefit dispersion (IPs across cities):   Gini={fmt_index(g_ip)}  HHI={h_ip:.3f}

A "concentrated costs / dispersed benefits" pattern requires:
  (1) DCs cluster in a few places (high state-level Gini/HHI).
  (2) Users they serve span many places (low city-level Gini/HHI, ideally).
  (3) That clustering is plausibly tied to fixed infrastructure (cables).

Check signs above:
  - DC location vs cable proximity: see Mann-Whitney result.
  - Cost concentration: compare DC HHI to a "competitive" benchmark (~0.10).
  - Benefit dispersion: IP-share Gini << DC-state Gini would corroborate
    the asymmetry (benefits more evenly distributed than costs).
""")


if __name__ == "__main__":
    main()