Reorganize project into scripts/, docs/, data/, output/ directories

Move all Python scripts to scripts/, documentation to docs/, raw input
data to data/, and generated HTML/CSV outputs to output/. Update path
references in 8 scripts to use Path(__file__).parent.parent as project
root so they work correctly from the new location. Update README links
and quick-start commands accordingly. Notebooks remain at root.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 21:57:22 -07:00
parent a2e295d95b
commit ee5856661a
40 changed files with 31 additions and 30 deletions

View File

@@ -0,0 +1,236 @@
#!/usr/bin/env python3
"""Quick statistical analysis: are US data centers spatially tied to submarine
cables, and does the resulting pattern look like concentrated costs / dispersed
benefits?
"""
import math
import os
import statistics
from collections import Counter
import psycopg2
def connect():
    """Open a psycopg2 connection to the ``data_centers`` database.

    Host, port, user and password are read from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables; a missing variable raises ``KeyError``.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": "data_centers",
    }
    return psycopg2.connect(**settings)
def quantiles(xs, qs=(0.1, 0.25, 0.5, 0.75, 0.9)):
    """Return ``{q: value}`` for each requested quantile of *xs*.

    Quantiles are located at fractional index ``(n - 1) * q`` of the sorted
    data and linearly interpolated between the two neighbouring elements.
    When the index is a whole number the element itself is returned.
    For empty input every quantile maps to ``None``.
    """
    ordered = sorted(xs)
    count = len(ordered)
    if count == 0:
        return {q: None for q in qs}

    result = {}
    for q in qs:
        position = (count - 1) * q
        below = math.floor(position)
        above = math.ceil(position)
        if below != above:
            # Position falls between two elements: interpolate linearly.
            span = ordered[above] - ordered[below]
            result[q] = ordered[below] + span * (position - below)
        else:
            result[q] = ordered[below]
    return result
def gini(values):
    """Compute the Gini coefficient of the non-negative entries of *values*.

    ``None`` and negative entries are dropped before the calculation.
    Returns ``None`` when nothing remains or the remaining values sum to
    zero; otherwise returns a float.
    """
    ranked = sorted(x for x in values if x is not None and x >= 0)
    count = len(ranked)
    total = sum(ranked)
    if count == 0 or total == 0:
        return None

    # Rank-weighted sum: the i-th smallest value (1-based) is weighted by i.
    weighted = 0.0
    for rank, amount in zip(range(1, count + 1), ranked):
        weighted += rank * amount
    return (2 * weighted) / (count * total) - (count + 1) / count
def hhi(shares):
    """Return the Herfindahl-Hirschman Index: the sum of squared shares.

    The shares are expected to sum to roughly 1, which puts the result in
    [0, 1]. No normalisation is performed here.
    """
    squared = [share * share for share in shares]
    return sum(squared)
def mann_whitney_u_z(xs, ys):
    """Approximate Mann-Whitney U test via the large-sample normal approximation.

    Tied observations receive the average of the ranks they span, and the
    variance of U is reduced by the standard tie correction so that the
    z-score is not understated when ties are present. No continuity
    correction is applied.

    Args:
        xs: First sample (sequence of comparable numbers).
        ys: Second sample (sequence of comparable numbers).

    Returns:
        ``(U, z, p_two_sided)`` where ``U`` is the U statistic of *xs*.
        When the variance is zero (an empty sample, or every observation
        tied) ``z`` is 0.0 and ``p`` is 1.0.
    """
    n1, n2 = len(xs), len(ys)
    combined = [(v, 0) for v in xs] + [(v, 1) for v in ys]
    combined.sort(key=lambda t: t[0])
    n = len(combined)
    ranks = [0.0] * n
    tie_term = 0  # sum of (t**3 - t) over every run of t equal values
    i = 0
    while i < n:
        # Extend j to the last index holding the same value as index i.
        j = i
        while j + 1 < n and combined[j + 1][0] == combined[i][0]:
            j += 1
        avg_rank = (i + j) / 2 + 1  # ranks are 1-based
        for k in range(i, j + 1):
            ranks[k] = avg_rank
        run = j - i + 1
        tie_term += run ** 3 - run
        i = j + 1

    r1 = sum(r for r, (_, g) in zip(ranks, combined) if g == 0)
    U1 = r1 - n1 * (n1 + 1) / 2
    mu = n1 * n2 / 2

    # Var(U) = n1*n2/12 * ((n + 1) - sum(t^3 - t) / (n * (n - 1))).
    # Without ties this reduces to the textbook n1*n2*(n+1)/12.
    variance = n1 * n2 * (n + 1) / 12
    if tie_term:  # a non-zero tie term implies n >= 2, so no zero division
        variance -= n1 * n2 * tie_term / (12 * n * (n - 1))
    sigma = math.sqrt(variance) if variance > 0 else 0.0

    z = (U1 - mu) / sigma if sigma else 0.0
    # Two-sided p-value from the standard normal via the complementary
    # error function.
    p = math.erfc(abs(z) / math.sqrt(2))
    return U1, z, p
def main():
    """Query the database, compute the statistics, and print the text report.

    Data gathered, in order:
      1. Distance (km) from each geocoded data center to the nearest
         submarine cable.
      2. The same distance for US rows of ``internet_city_dominance``.
      3. Data-center counts per state code.
      4. ``logical_dominance_ips`` per US city.
      5. The ten cities with the most IPs.

    The cursor and connection are closed even when a query fails. Report
    sections whose inputs are empty print a placeholder instead of raising.
    """
    conn = connect()
    try:
        cur = conn.cursor()
        try:
            # --- 1. Distance from each US data center to nearest submarine cable ---
            cur.execute(
                """
                with cables_union as (
                    select ST_Union(geom)::geography as g from public.internet_cables
                )
                select ST_Distance(
                    ST_SetSRID(ST_MakePoint(dc.longitude, dc.latitude), 4326)::geography,
                    cu.g
                ) / 1000.0 -- meters -> km
                from public.us_dc_sample_geocoded dc, cables_union cu
                where dc.longitude is not null and dc.latitude is not null
                and (dc.country = 'United States' or dc.country is null)
                """
            )
            # Skip NULL distances (e.g. if the cable table is empty) rather
            # than failing on float(None).
            dc_km = [float(r[0]) for r in cur.fetchall() if r[0] is not None]

            # --- 2. Distance from US city-dominance points to nearest cable ---
            cur.execute(
                """
                with cables_union as (
                    select ST_Union(geom)::geography as g from public.internet_cables
                )
                select ST_Distance(
                    ST_SetSRID(ST_MakePoint(c.longitude, c.latitude), 4326)::geography,
                    cu.g
                ) / 1000.0
                from public.internet_city_dominance c, cables_union cu
                where c.country = 'US' and c.geom is not null
                """
            )
            city_km = [float(r[0]) for r in cur.fetchall() if r[0] is not None]

            # --- 3. DC distribution by state (cost concentration) ---
            # NOTE(review): unlike query 1 this one has no country filter;
            # confirm the table only holds US rows.
            cur.execute(
                """
                select coalesce(nullif(state_code, ''), 'UNK') as st, count(*)
                from public.us_dc_sample_geocoded
                where longitude is not null and latitude is not null
                group by 1
                """
            )
            state_counts = dict(cur.fetchall())

            # --- 4. IP distribution across US cities (benefit dispersion proxy) ---
            cur.execute(
                """
                select city, coalesce(logical_dominance_ips, 0)
                from public.internet_city_dominance
                where country = 'US' and logical_dominance_ips is not null
                """
            )
            city_ips = [(r[0], int(r[1])) for r in cur.fetchall()]
        finally:
            cur.close()
    finally:
        conn.close()

    total_dc = sum(state_counts.values())
    state_shares = {k: v / total_dc for k, v in state_counts.items()}
    total_ips = sum(v for _, v in city_ips)
    ip_shares = [v / total_ips for _, v in city_ips] if total_ips else []

    # --- 5. Where do the people-with-IPs LIVE relative to the DCs? ---
    # Top-N US dominance cities, share of national IPs each captures.
    top_ip_cities = sorted(city_ips, key=lambda t: -t[1])[:10]

    def fmt_index(value):
        """Format a Gini/HHI value, tolerating ``None`` (no data)."""
        return "n/a" if value is None else f"{value:.3f}"

    def fmt_q(label, xs):
        """Print mean, quantiles and within-threshold shares for *xs*."""
        print(f"\n{label}:")
        if not xs:
            print(" (no rows returned)")
            return
        q = quantiles(xs)
        print(f" mean = {statistics.mean(xs):,.1f} km")
        print(f" median (p50) = {q[0.5]:,.1f} km")
        print(f" p10 / p25 / p75 / p90 = "
              f"{q[0.1]:,.1f} / {q[0.25]:,.1f} / {q[0.75]:,.1f} / {q[0.9]:,.1f} km")
        for thr in (10, 50, 100, 250):
            frac = sum(1 for x in xs if x <= thr) / len(xs)
            print(f" share within {thr:>3} km of a cable: {frac*100:5.1f}%")

    # ======= report =======
    print("=" * 70)
    print("ARE US DATA CENTERS TIED TO SUBMARINE CABLE LANDINGS?")
    print("=" * 70)
    print(f"\nN data centers analyzed: {len(dc_km):,}")
    print(f"N US city-dominance pts: {len(city_km):,}")

    fmt_q("Distance: US DATA CENTERS -> nearest submarine cable", dc_km)
    fmt_q("Distance: US POPULATION CITIES -> nearest submarine cable", city_km)

    if dc_km and city_km:
        U, z, p = mann_whitney_u_z(dc_km, city_km)
        print(f"\nMann-Whitney U (DCs vs. cities, two-sided): U={U:,.0f}, z={z:.2f}, "
              f"p≈{p:.2e}")
        dc_median = statistics.median(dc_km)
        city_median = statistics.median(city_km)
        if dc_median < city_median:
            diff = city_median - dc_median
            print(f" -> DC median is {diff:,.1f} km CLOSER to cables than city median.")
        else:
            print(" -> DCs are not closer to cables than cities.")
    else:
        # The test and the median comparison need both samples.
        print("\nMann-Whitney U (DCs vs. cities, two-sided): skipped (empty sample).")

    print("\n" + "=" * 70)
    print("CONCENTRATION OF COSTS (data centers by state)")
    print("=" * 70)
    g_dc = gini(list(state_counts.values()))
    h_dc = hhi(list(state_shares.values()))
    print(f"States covered: {len(state_counts)}")
    print(f"Gini of DC counts across states: {fmt_index(g_dc)} (0=even, 1=one state takes all)")
    print(f"HHI of state shares: {fmt_index(h_dc)} (0.18+ = highly concentrated)")
    top_states = sorted(state_shares.items(), key=lambda t: -t[1])[:8]
    cum = 0.0
    print("\nTop states by share of US data centers:")
    for st, s in top_states:
        cum += s
        print(f" {st}: {s*100:5.1f}% ({state_counts[st]:>4} DCs) cum={cum*100:5.1f}%")

    print("\n" + "=" * 70)
    print("DISPERSION OF BENEFITS (US IPs across cities)")
    print("=" * 70)
    g_ip = gini([v for _, v in city_ips])
    h_ip = hhi(ip_shares)
    print(f"US cities with IP data: {len(city_ips):,}")
    print(f"Gini of IPs across cities: {fmt_index(g_ip)}")
    print(f"HHI of IP shares: {fmt_index(h_ip)}")
    cum = 0.0
    print("\nTop US cities by share of national IPs:")
    for city, ips in top_ip_cities:
        # total_ips is 0 when every listed city reports 0 IPs.
        s = ips / total_ips if total_ips else 0.0
        cum += s
        print(f" {city:<30} {s*100:5.2f}% ({ips:>11,} IPs) cum={cum*100:5.2f}%")

    print("\n" + "=" * 70)
    print("INTERPRETATION")
    print("=" * 70)
    # A lower Gini means a more even spread, so evenly distributed benefits
    # show up as an IP Gini well BELOW the DC Gini.
    print(f"""
Cost concentration (DCs across states): Gini={fmt_index(g_dc)} HHI={fmt_index(h_dc)}
Benefit dispersion (IPs across cities): Gini={fmt_index(g_ip)} HHI={fmt_index(h_ip)}
A "concentrated costs / dispersed benefits" pattern requires:
(1) DCs cluster in a few places (high state-level Gini/HHI).
(2) Users they serve span many places (low city-level Gini/HHI, ideally).
(3) That clustering is plausibly tied to fixed infrastructure (cables).
Check signs above:
- DC location vs cable proximity: see Mann-Whitney result.
- Cost concentration: compare DC HHI to a "competitive" benchmark (~0.10).
- Benefit dispersion: IP-share Gini << DC-state Gini would corroborate
the asymmetry (benefits more evenly distributed than costs).
""")
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()