Reorganize project into scripts/, docs/, data/, output/ directories

Move all Python scripts to scripts/, documentation to docs/, raw input data to data/, and generated HTML/CSV outputs to output/. Update path references in 8 scripts to use Path(__file__).parent.parent as project root so they work correctly from the new location. Update README links and quick-start commands accordingly. Notebooks remain at root.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 21:57:22 -07:00
parent a2e295d95b
commit ee5856661a
40 changed files with 31 additions and 30 deletions
--- a/scripts/analyze_cables_concentration.py
+++ b/scripts/analyze_cables_concentration.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+"""Quick statistical analysis: are US data centers spatially tied to submarine
+cables, and does the resulting pattern look like concentrated costs / dispersed
+benefits?
+"""
+import math
+import os
+import statistics
+from collections import Counter
+
+import psycopg2
+
+
+def connect():
+    """Open a psycopg2 connection to the ``data_centers`` database.
+
+    Host, port, user and password come from the ``PGWEB_HOST``, ``PGWEB_PORT``,
+    ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment variables; a missing
+    variable raises ``KeyError``.
+    """
+    env = os.environ
+    settings = {
+        "host": env["PGWEB_HOST"],
+        "port": env["PGWEB_PORT"],
+        "user": env["PGWEB_USER"],
+        "password": env["PGWEB_PASSWORD"],
+        "dbname": "data_centers",
+    }
+    return psycopg2.connect(**settings)
+
+
+def quantiles(xs, qs=(0.1, 0.25, 0.5, 0.75, 0.9)):
+    """Return ``{q: value}`` for each requested quantile of *xs*.
+
+    The data is sorted and each quantile is located at fractional position
+    ``(n - 1) * q``; positions that fall between two elements are linearly
+    interpolated.  For empty input every quantile maps to ``None``.
+    """
+    ordered = sorted(xs)
+    count = len(ordered)
+    if count == 0:
+        return {q: None for q in qs}
+
+    result = {}
+    for q in qs:
+        pos = (count - 1) * q
+        below = math.floor(pos)
+        above = math.ceil(pos)
+        if below == above:
+            # Position lands exactly on an element: return it unchanged.
+            result[q] = ordered[below]
+        else:
+            step = ordered[above] - ordered[below]
+            result[q] = ordered[below] + step * (pos - below)
+    return result
+
+
+def gini(values):
+    """Gini coefficient of the non-negative entries of *values*.
+
+    ``None`` and negative entries are dropped before the calculation.  Returns
+    ``None`` when nothing remains or when the remaining values sum to zero.
+    """
+    kept = [x for x in values if x is not None and x >= 0]
+    kept.sort()
+    size = len(kept)
+    total = sum(kept)
+    if size == 0 or total == 0:
+        return None
+
+    # Rank-weighted sum over the ascending values (ranks start at 1).
+    weighted = 0.0
+    for rank, amount in enumerate(kept, start=1):
+        weighted += rank * amount
+    return (2 * weighted) / (size * total) - (size + 1) / size
+
+
+def hhi(shares):
+    """Herfindahl-Hirschman Index: the sum of squared shares.
+
+    For shares that sum to ~1 the result lies in [0, 1].
+    """
+    squared = [share * share for share in shares]
+    return sum(squared)
+
+
+def mann_whitney_u_z(xs, ys):
+    """Mann-Whitney U test using the normal approximation (large-n).
+
+    Both samples are pooled and ranked from 1; tied values share the average of
+    the ranks they span.  U is computed for ``xs`` from its rank sum.  The
+    standard deviation of U includes the tie correction, so heavily tied data
+    does not get an understated |z|; with no ties it equals
+    ``sqrt(n1 * n2 * (n1 + n2 + 1) / 12)``.  No continuity correction is applied.
+
+    Args:
+        xs: First sample, a sized collection of mutually comparable numbers.
+        ys: Second sample, same requirements.
+
+    Returns:
+        ``(U, z, p_two_sided)``.  When the variance is zero (an empty sample or
+        all pooled values equal) ``z`` is 0.0 and ``p`` is 1.0.
+    """
+    combined = [(v, 0) for v in xs] + [(v, 1) for v in ys]
+    combined.sort(key=lambda t: t[0])
+    n = len(combined)
+    ranks = [0.0] * n
+    tie_term = 0  # sum of (t**3 - t) over groups of t tied values
+    i = 0
+    while i < n:
+        # Extend j to the last index holding the same value as index i.
+        j = i
+        while j + 1 < n and combined[j + 1][0] == combined[i][0]:
+            j += 1
+        avg_rank = (i + j) / 2 + 1  # mean of the 1-based ranks i+1 .. j+1
+        for k in range(i, j + 1):
+            ranks[k] = avg_rank
+        t = j - i + 1
+        tie_term += t ** 3 - t
+        i = j + 1
+    r1 = sum(r for r, (_, g) in zip(ranks, combined) if g == 0)
+    n1, n2 = len(xs), len(ys)
+    U1 = r1 - n1 * (n1 + 1) / 2
+    mu = n1 * n2 / 2
+    if n > 1:
+        variance = n1 * n2 / 12 * ((n + 1) - tie_term / (n * (n - 1)))
+    else:
+        variance = 0.0
+    # Rounding can leave an all-tied variance a hair below zero.
+    sigma = math.sqrt(variance) if variance > 0 else 0.0
+    z = (U1 - mu) / sigma if sigma else 0.0
+    # Two-sided p via the complementary error function
+    p = math.erfc(abs(z) / math.sqrt(2))
+    return U1, z, p
+
+
+def _fetch_inputs(cur):
+    """Run the report's four queries on *cur* and convert the rows.
+
+    Returns:
+        ``(dc_km, city_km, state_counts, city_ips)`` -- two lists of distances
+        in km, a ``{state_code: count}`` dict, and a list of ``(city, ips)``
+        tuples.  Rows whose computed distance is NULL are skipped.
+    """
+    # --- 1. Distance from each US data center to nearest submarine cable ---
+    cur.execute(
+        """
+        with cables_union as (
+            select ST_Union(geom)::geography as g from public.internet_cables
+        )
+        select ST_Distance(
+            ST_SetSRID(ST_MakePoint(dc.longitude, dc.latitude), 4326)::geography,
+            cu.g
+        ) / 1000.0  -- meters -> km
+        from public.us_dc_sample_geocoded dc, cables_union cu
+        where dc.longitude is not null and dc.latitude is not null
+          and (dc.country = 'United States' or dc.country is null)
+        """
+    )
+    dc_km = [float(r[0]) for r in cur.fetchall() if r[0] is not None]
+
+    # --- 2. Distance from US city-dominance points to nearest cable ---
+    cur.execute(
+        """
+        with cables_union as (
+            select ST_Union(geom)::geography as g from public.internet_cables
+        )
+        select ST_Distance(
+            ST_SetSRID(ST_MakePoint(c.longitude, c.latitude), 4326)::geography,
+            cu.g
+        ) / 1000.0
+        from public.internet_city_dominance c, cables_union cu
+        where c.country = 'US' and c.geom is not null
+        """
+    )
+    city_km = [float(r[0]) for r in cur.fetchall() if r[0] is not None]
+
+    # --- 3. DC distribution by state (cost concentration) ---
+    # NOTE(review): unlike query 1 this does not filter on `country`, so the
+    # two sections may cover different row sets -- confirm that is intended.
+    cur.execute(
+        """
+        select coalesce(nullif(state_code, ''), 'UNK') as st, count(*)
+        from public.us_dc_sample_geocoded
+        where longitude is not null and latitude is not null
+        group by 1
+        """
+    )
+    state_counts = dict(cur.fetchall())
+
+    # --- 4. IP distribution across US cities (benefit dispersion proxy) ---
+    cur.execute(
+        """
+        select city, coalesce(logical_dominance_ips, 0)
+        from public.internet_city_dominance
+        where country = 'US' and logical_dominance_ips is not null
+        """
+    )
+    city_ips = [(r[0], int(r[1])) for r in cur.fetchall()]
+
+    return dc_km, city_km, state_counts, city_ips
+
+
+def _fmt_stat(value):
+    """Format a Gini/HHI figure to three decimals, or ``n/a`` when it is None."""
+    return "n/a" if value is None else f"{value:.3f}"
+
+
+def _print_distance_summary(label, xs):
+    """Print mean, quantiles and within-threshold shares for distances *xs* (km)."""
+    print(f"\n{label}:")
+    if not xs:
+        # statistics.mean() raises on empty input and the shares below would
+        # divide by zero, so report the gap instead.
+        print("  (no rows)")
+        return
+    q = quantiles(xs)
+    print(f"  mean = {statistics.mean(xs):,.1f} km")
+    print(f"  median (p50) = {q[0.5]:,.1f} km")
+    print(f"  p10 / p25 / p75 / p90 = "
+          f"{q[0.1]:,.1f} / {q[0.25]:,.1f} / {q[0.75]:,.1f} / {q[0.9]:,.1f} km")
+    for thr in (10, 50, 100, 250):
+        frac = sum(1 for x in xs if x <= thr) / len(xs)
+        print(f"  share within {thr:>3} km of a cable: {frac*100:5.1f}%")
+
+
+def main():
+    """Query the database and print the cable-proximity / concentration report.
+
+    Loads distances (km) from data centers and from US city-dominance points to
+    the unioned submarine-cable geometry, data-center counts per state, and IP
+    counts per US city.  Prints distance summaries, a Mann-Whitney comparison,
+    Gini/HHI concentration figures and an interpretation guide.
+
+    The cursor and connection are released even when a query fails, and
+    sections with no usable rows print a placeholder instead of raising.
+    """
+    conn = connect()
+    try:
+        # psycopg2 cursors are context managers; the cursor is closed on exit.
+        with conn.cursor() as cur:
+            dc_km, city_km, state_counts, city_ips = _fetch_inputs(cur)
+    finally:
+        conn.close()
+
+    total_dc = sum(state_counts.values())
+    state_shares = {k: v / total_dc for k, v in state_counts.items()}
+    total_ips = sum(v for _, v in city_ips)
+    ip_shares = [v / total_ips for _, v in city_ips] if total_ips else []
+
+    # --- 5. Where do the people-with-IPs LIVE relative to the DCs? ---
+    # Top-N US dominance cities, share of national IPs each captures.
+    top_ip_cities = sorted(city_ips, key=lambda t: -t[1])[:10]
+
+    # ======= report =======
+    print("=" * 70)
+    print("ARE US DATA CENTERS TIED TO SUBMARINE CABLE LANDINGS?")
+    print("=" * 70)
+    print(f"\nN data centers analyzed: {len(dc_km):,}")
+    print(f"N US city-dominance pts: {len(city_km):,}")
+
+    _print_distance_summary(
+        "Distance: US DATA CENTERS -> nearest submarine cable", dc_km)
+    _print_distance_summary(
+        "Distance: US POPULATION CITIES -> nearest submarine cable", city_km)
+
+    if dc_km and city_km:
+        U, z, p = mann_whitney_u_z(dc_km, city_km)
+        print(f"\nMann-Whitney U (DCs vs. cities, two-sided): U={U:,.0f}, z={z:.2f}, "
+              f"p≈{p:.2e}")
+        dc_median = statistics.median(dc_km)
+        city_median = statistics.median(city_km)
+        if dc_median < city_median:
+            diff = city_median - dc_median
+            print(f"  -> DC median is {diff:,.1f} km CLOSER to cables than city median.")
+        else:
+            print("  -> DCs are not closer to cables than cities.")
+    else:
+        # The test and the median comparison need both samples.
+        print("\nMann-Whitney U (DCs vs. cities, two-sided): skipped (empty sample)")
+
+    print("\n" + "=" * 70)
+    print("CONCENTRATION OF COSTS (data centers by state)")
+    print("=" * 70)
+    g_dc = gini(list(state_counts.values()))
+    h_dc = hhi(list(state_shares.values()))
+    print(f"States covered: {len(state_counts)}")
+    print(f"Gini of DC counts across states: {_fmt_stat(g_dc)}  (0=even, 1=one state takes all)")
+    print(f"HHI of state shares:              {_fmt_stat(h_dc)}  (0.18+ = highly concentrated)")
+    top_states = sorted(state_shares.items(), key=lambda t: -t[1])[:8]
+    cum = 0.0
+    print("\nTop states by share of US data centers:")
+    for st, s in top_states:
+        cum += s
+        print(f"  {st}: {s*100:5.1f}%  ({state_counts[st]:>4} DCs)  cum={cum*100:5.1f}%")
+
+    print("\n" + "=" * 70)
+    print("DISPERSION OF BENEFITS (US IPs across cities)")
+    print("=" * 70)
+    g_ip = gini([v for _, v in city_ips])
+    h_ip = hhi(ip_shares)
+    print(f"US cities with IP data: {len(city_ips):,}")
+    print(f"Gini of IPs across cities: {_fmt_stat(g_ip)}")
+    print(f"HHI of IP shares:          {_fmt_stat(h_ip)}")
+    cum = 0.0
+    print("\nTop US cities by share of national IPs:")
+    for city, ips in top_ip_cities:
+        # total_ips can be zero when every listed city reports 0 IPs.
+        s = ips / total_ips if total_ips else 0.0
+        cum += s
+        print(f"  {city:<30}  {s*100:5.2f}%  ({ips:>11,} IPs)  cum={cum*100:5.2f}%")
+
+    print("\n" + "=" * 70)
+    print("INTERPRETATION")
+    print("=" * 70)
+    # A lower Gini means a more even spread, so dispersed benefits show up as
+    # an IP-share Gini well BELOW the DC-state Gini.
+    print(f"""
+Cost concentration (DCs across states):   Gini={_fmt_stat(g_dc)}  HHI={_fmt_stat(h_dc)}
+Benefit dispersion (IPs across cities):   Gini={_fmt_stat(g_ip)}  HHI={_fmt_stat(h_ip)}
+
+A "concentrated costs / dispersed benefits" pattern requires:
+  (1) DCs cluster in a few places (high state-level Gini/HHI).
+  (2) Users they serve span many places (low city-level Gini/HHI, ideally).
+  (3) That clustering is plausibly tied to fixed infrastructure (cables).
+
+Check signs above:
+  - DC location vs cable proximity: see Mann-Whitney result.
+  - Cost concentration: compare DC HHI to a "competitive" benchmark (~0.10).
+  - Benefit dispersion: IP-share Gini << DC-state Gini would corroborate
+    the asymmetry (benefits more evenly distributed than costs).
+""")
+
+
+if __name__ == "__main__":
+    main()