Reorganize project into scripts/, docs/, data/, output/ directories

Move all Python scripts to scripts/, documentation to docs/, raw input
data to data/, and generated HTML/CSV outputs to output/. Update path
references in 8 scripts to use Path(__file__).parent.parent as project
root so they work correctly from the new location. Update README links
and quick-start commands accordingly. Notebooks remain at root.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 21:57:22 -07:00
parent a2e295d95b
commit ee5856661a
40 changed files with 31 additions and 30 deletions

View File

@@ -0,0 +1,236 @@
#!/usr/bin/env python3
"""Quick statistical analysis: are US data centers spatially tied to submarine
cables, and does the resulting pattern look like concentrated costs / dispersed
benefits?
"""
import math
import os
import statistics
from collections import Counter
import psycopg2
def connect():
    """Open a psycopg2 connection to the ``data_centers`` database.

    Host, port, user and password come from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables; a missing variable raises ``KeyError``.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": "data_centers",
    }
    return psycopg2.connect(**settings)
def quantiles(xs, qs=(0.1, 0.25, 0.5, 0.75, 0.9)):
    """Return a ``{q: value}`` dict with one entry per quantile in *qs*.

    Each quantile is located at position ``(n - 1) * q`` in the sorted data
    and linearly interpolated between the two neighbouring values. When *xs*
    is empty every quantile maps to ``None``.
    """
    ordered = sorted(xs)
    count = len(ordered)
    if count == 0:
        return {q: None for q in qs}

    result = {}
    for q in qs:
        pos = (count - 1) * q
        below = math.floor(pos)
        above = math.ceil(pos)
        if below == above:
            # Position lands exactly on an element; no interpolation needed.
            result[q] = ordered[int(pos)]
            continue
        span = ordered[above] - ordered[below]
        result[q] = ordered[below] + span * (pos - below)
    return result
def gini(values):
    """Gini coefficient of the non-negative entries of *values*.

    ``None`` and negative entries are ignored. Returns ``None`` when nothing
    is left or when the remaining values sum to zero.
    """
    ordered = sorted(x for x in values if x is not None and x >= 0)
    count = len(ordered)
    if count == 0:
        return None
    total = sum(ordered)
    if total == 0:
        return None
    # Rank-weighted sum over the ascending values (ranks start at 1).
    weighted = 0.0
    for rank, value in enumerate(ordered, start=1):
        weighted += rank * value
    return (2 * weighted) / (count * total) - (count + 1) / count
def hhi(shares):
    """Herfindahl-Hirschman Index: the sum of the squared *shares*.

    For shares that sum to ~1 the result lies in [0, 1].
    """
    squared = [share * share for share in shares]
    return sum(squared)
def mann_whitney_u_z(xs, ys):
    """Approximate Mann-Whitney U test (normal approximation, large n).

    Tied values receive the average of the ranks they span, and the variance
    of U is reduced by the standard tie correction
    ``sum(t**3 - t) / (N * (N - 1))`` so that z is not understated when the
    samples contain ties. No continuity correction is applied.

    Args:
        xs: First sample (sized sequence of mutually comparable values).
        ys: Second sample.

    Returns:
        ``(U, z, p_two_sided)`` where U is the statistic for *xs*. When the
        variance is zero (an empty sample, or every value tied) z is 0.0 and
        p is 1.0.
    """
    combined = sorted(
        [(v, 0) for v in xs] + [(v, 1) for v in ys],
        key=lambda t: t[0],
    )
    n = len(combined)
    ranks = [0.0] * n
    tie_term = 0.0  # sum of t^3 - t over every run of t equal values
    i = 0
    while i < n:
        # Extend j to the last index holding the same value as position i.
        j = i
        while j + 1 < n and combined[j + 1][0] == combined[i][0]:
            j += 1
        avg_rank = (i + j) / 2 + 1
        for k in range(i, j + 1):
            ranks[k] = avg_rank
        run = j - i + 1
        tie_term += run ** 3 - run
        i = j + 1

    r1 = sum(r for r, (_, g) in zip(ranks, combined) if g == 0)
    n1, n2 = len(xs), len(ys)
    u1 = r1 - n1 * (n1 + 1) / 2
    mu = n1 * n2 / 2
    if n > 1:
        variance = n1 * n2 / 12 * ((n + 1) - tie_term / (n * (n - 1)))
    else:
        variance = 0.0
    # Rounding can leave a tiny negative variance when every value is tied.
    sigma = math.sqrt(max(variance, 0.0))
    z = (u1 - mu) / sigma if sigma else 0.0
    # Two-sided p via the complementary error function.
    p = math.erfc(abs(z) / math.sqrt(2))
    return u1, z, p
def main():
    """Print the cable-proximity and cost/benefit concentration report.

    Queries the ``data_centers`` database for (1) the distance from each US
    data center to the unioned submarine-cable geometry, (2) the same
    distance for US city-dominance points, (3) data-center counts per state
    and (4) IP counts per US city, then prints quantiles, a Mann-Whitney
    comparison and Gini/HHI concentration figures to stdout.

    The connection is closed even if a query fails, NULL distances are
    skipped, and statistics that cannot be computed from an empty result set
    are printed as "n/a" instead of raising.
    """

    def fmt_num(value, spec):
        """Format *value* with *spec*; an undefined (None) statistic -> 'n/a'."""
        return "n/a" if value is None else format(value, spec)

    conn = connect()
    try:
        with conn.cursor() as cur:
            # --- 1. Distance from each US data center to nearest submarine cable ---
            cur.execute(
                """
with cables_union as (
select ST_Union(geom)::geography as g from public.internet_cables
)
select ST_Distance(
ST_SetSRID(ST_MakePoint(dc.longitude, dc.latitude), 4326)::geography,
cu.g
) / 1000.0 -- meters -> km
from public.us_dc_sample_geocoded dc, cables_union cu
where dc.longitude is not null and dc.latitude is not null
and (dc.country = 'United States' or dc.country is null)
"""
            )
            dc_km = [float(r[0]) for r in cur.fetchall() if r[0] is not None]

            # --- 2. Distance from US city-dominance points to nearest cable ---
            cur.execute(
                """
with cables_union as (
select ST_Union(geom)::geography as g from public.internet_cables
)
select ST_Distance(
ST_SetSRID(ST_MakePoint(c.longitude, c.latitude), 4326)::geography,
cu.g
) / 1000.0
from public.internet_city_dominance c, cables_union cu
where c.country = 'US' and c.geom is not null
"""
            )
            city_km = [float(r[0]) for r in cur.fetchall() if r[0] is not None]

            # --- 3. DC distribution by state (cost concentration) ---
            cur.execute(
                """
select coalesce(nullif(state_code, ''), 'UNK') as st, count(*)
from public.us_dc_sample_geocoded
where longitude is not null and latitude is not null
group by 1
"""
            )
            state_counts = dict(cur.fetchall())

            # --- 4. IP distribution across US cities (benefit dispersion proxy) ---
            cur.execute(
                """
select city, coalesce(logical_dominance_ips, 0)
from public.internet_city_dominance
where country = 'US' and logical_dominance_ips is not null
"""
            )
            city_ips = [(r[0], int(r[1])) for r in cur.fetchall()]
    finally:
        conn.close()

    total_dc = sum(state_counts.values())
    state_shares = (
        {k: v / total_dc for k, v in state_counts.items()} if total_dc else {}
    )
    total_ips = sum(v for _, v in city_ips)
    ip_shares = [v / total_ips for _, v in city_ips] if total_ips else []
    # --- 5. Where do the people-with-IPs LIVE relative to the DCs? ---
    # Top-N US dominance cities, share of national IPs each captures.
    top_ip_cities = sorted(city_ips, key=lambda t: -t[1])[:10]

    # ======= report =======
    print("=" * 70)
    print("ARE US DATA CENTERS TIED TO SUBMARINE CABLE LANDINGS?")
    print("=" * 70)
    print(f"\nN data centers analyzed: {len(dc_km):,}")
    print(f"N US city-dominance pts: {len(city_km):,}")

    def fmt_q(label, xs):
        """Print mean, quantiles and within-threshold shares for distances *xs*."""
        print(f"\n{label}:")
        if not xs:
            # mean/quantiles/shares are undefined for an empty sample.
            print(" (no rows)")
            return
        q = quantiles(xs)
        print(f" mean = {statistics.mean(xs):,.1f} km")
        print(f" median (p50) = {q[0.5]:,.1f} km")
        print(f" p10 / p25 / p75 / p90 = "
              f"{q[0.1]:,.1f} / {q[0.25]:,.1f} / {q[0.75]:,.1f} / {q[0.9]:,.1f} km")
        for thr in (10, 50, 100, 250):
            frac = sum(1 for x in xs if x <= thr) / len(xs)
            print(f" share within {thr:>3} km of a cable: {frac*100:5.1f}%")

    fmt_q("Distance: US DATA CENTERS -> nearest submarine cable", dc_km)
    fmt_q("Distance: US POPULATION CITIES -> nearest submarine cable", city_km)

    if dc_km and city_km:
        U, z, p = mann_whitney_u_z(dc_km, city_km)
        print(f"\nMann-Whitney U (DCs vs. cities, two-sided): U={U:,.0f}, z={z:.2f}, "
              f"p≈{p:.2e}")
        dc_median = statistics.median(dc_km)
        city_median = statistics.median(city_km)
        if dc_median < city_median:
            diff = city_median - dc_median
            print(f" -> DC median is {diff:,.1f} km CLOSER to cables than city median.")
        else:
            print(" -> DCs are not closer to cables than cities.")
    else:
        print("\nMann-Whitney U (DCs vs. cities): skipped, at least one sample is empty.")

    print("\n" + "=" * 70)
    print("CONCENTRATION OF COSTS (data centers by state)")
    print("=" * 70)
    g_dc = gini(list(state_counts.values()))
    h_dc = hhi(list(state_shares.values()))
    print(f"States covered: {len(state_counts)}")
    print(f"Gini of DC counts across states: {fmt_num(g_dc, '.3f')} (0=even, 1=one state takes all)")
    print(f"HHI of state shares: {h_dc:.3f} (0.18+ = highly concentrated)")
    top_states = sorted(state_shares.items(), key=lambda t: -t[1])[:8]
    cum = 0.0
    print("\nTop states by share of US data centers:")
    for st, s in top_states:
        cum += s
        print(f" {st}: {s*100:5.1f}% ({state_counts[st]:>4} DCs) cum={cum*100:5.1f}%")

    print("\n" + "=" * 70)
    print("DISPERSION OF BENEFITS (US IPs across cities)")
    print("=" * 70)
    g_ip = gini([v for _, v in city_ips])
    h_ip = hhi(ip_shares)
    print(f"US cities with IP data: {len(city_ips):,}")
    print(f"Gini of IPs across cities: {fmt_num(g_ip, '.3f')}")
    print(f"HHI of IP shares: {h_ip:.3f}")
    cum = 0.0
    print("\nTop US cities by share of national IPs:")
    for city, ips in top_ip_cities:
        s = ips / total_ips if total_ips else 0.0
        cum += s
        print(f" {city:<30} {s*100:5.2f}% ({ips:>11,} IPs) cum={cum*100:5.2f}%")

    print("\n" + "=" * 70)
    print("INTERPRETATION")
    print("=" * 70)
    # A lower Gini means a more even spread, so dispersed benefits show up as
    # an IP-share Gini well BELOW the DC-state Gini.
    print(f"""
Cost concentration (DCs across states): Gini={fmt_num(g_dc, '.3f')} HHI={h_dc:.3f}
Benefit dispersion (IPs across cities): Gini={fmt_num(g_ip, '.3f')} HHI={h_ip:.3f}
A "concentrated costs / dispersed benefits" pattern requires:
(1) DCs cluster in a few places (high state-level Gini/HHI).
(2) Users they serve span many places (low city-level Gini/HHI, ideally).
(3) That clustering is plausibly tied to fixed infrastructure (cables).
Check signs above:
- DC location vs cable proximity: see Mann-Whitney result.
- Cost concentration: compare DC HHI to a "competitive" benchmark (~0.10).
- Benefit dispersion: IP-share Gini << DC-state Gini would corroborate
the asymmetry (benefits more evenly distributed than costs).
""")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,320 @@
#!/usr/bin/env python3
"""Tract-level analysis of the 'concentrated costs / dispersed benefits' frame
for US data-center siting.
Cost-bearing universe = tracts that host at least one DC
(public.data_center_census_tracts_2024)
Comparison universe = ACS 2024 5-yr tracts in the selected states
(census_tract_acs_2024_selected_states.csv)
"""
import csv
import math
import os
import statistics
from collections import Counter
import psycopg2
# ACS comparison-universe CSV opened by main().
# NOTE(review): this path is relative to the current working directory, not
# to the project root — confirm it still resolves now that scripts live in
# scripts/ (the commit message says path references were updated to use
# Path(__file__).parent.parent).
CSV_PATH = "census_tract_acs_2024_selected_states.csv"
def connect():
    """Return a new psycopg2 connection to the ``data_centers`` database.

    Connection details are taken from the ``PGWEB_*`` environment variables;
    ``KeyError`` is raised if one of them is not set.
    """
    host = os.environ["PGWEB_HOST"]
    port = os.environ["PGWEB_PORT"]
    user = os.environ["PGWEB_USER"]
    password = os.environ["PGWEB_PASSWORD"]
    return psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname="data_centers",
    )
def gini(values):
    """Gini coefficient of the non-negative entries of *values*.

    ``None`` and negative entries are dropped; returns ``None`` when no
    values remain or they sum to zero.
    """
    ordered = sorted(x for x in values if x is not None and x >= 0)
    count = len(ordered)
    if count == 0:
        return None
    total = sum(ordered)
    if total == 0:
        return None
    # Rank-weighted sum over the ascending values (ranks start at 1).
    weighted = sum(rank * value for rank, value in enumerate(ordered, 1))
    return (2 * weighted) / (count * total) - (count + 1) / count
def hhi(shares):
    """Return the Herfindahl-Hirschman Index: the sum of squared *shares*."""
    squared = [share * share for share in shares]
    return sum(squared)
def median(xs):
    """Median of the non-``None`` entries of *xs*, or ``None`` if there are none."""
    present = [x for x in xs if x is not None]
    if not present:
        return None
    return statistics.median(present)
def mean(xs):
    """Arithmetic mean of the non-``None`` entries of *xs*, or ``None`` if empty."""
    present = [x for x in xs if x is not None]
    if not present:
        return None
    return statistics.mean(present)
def wmean(xs, ws):
    """Weighted mean of *xs* using weights *ws*.

    A pair is used only when both value and weight are not ``None`` and the
    weight is positive. Returns ``None`` when no pair qualifies.
    """
    weights = []
    products = []
    for x, w in zip(xs, ws):
        if x is None or w is None:
            continue
        if w > 0:
            weights.append(w)
            products.append(x * w)
    if not weights:
        return None
    return sum(products) / sum(weights)
def to_float(s):
    """Convert *s* to ``float``; return ``None`` when it cannot be converted."""
    try:
        number = float(s)
    except (TypeError, ValueError):
        number = None
    return number
def to_int(s):
    """Convert *s* to ``int`` via ``float``, truncating toward zero.

    Returns ``None`` when *s* is ``None``, empty, non-numeric, NaN or
    infinite. ``int(float("inf"))`` raises ``OverflowError`` rather than
    ``ValueError``, so that is treated as "no value" as well.
    """
    try:
        return int(float(s))
    except (TypeError, ValueError, OverflowError):
        return None
def main():
    """Print the tract-level concentrated-costs / dispersed-benefits report.

    Loads the DC-hosting tracts and their distance to the unioned cable
    geometry from the database, loads the ACS comparison universe from
    ``CSV_PATH``, and prints four report sections plus a summary to stdout.

    The connection is closed even if a query fails. If no data centers are
    found the report stops after the header; any other statistic that is
    undefined for the loaded data is printed as "n/a" instead of raising.
    """

    def show(value, spec):
        """Format *value* with *spec*; an undefined (None) statistic -> 'n/a'."""
        return "n/a" if value is None else format(value, spec)

    def opt_float(value):
        """Convert a possibly-NULL numeric column to float."""
        return float(value) if value is not None else None

    def pick(left, right, if_greater, otherwise):
        """Return the word describing how *left* compares with *right*."""
        if left is None or right is None:
            return "n/a"
        return if_greater if left > right else otherwise

    conn = connect()
    try:
        with conn.cursor() as cur:
            # DC-hosting tracts (the cost-bearing universe) ----------------------
            cur.execute(
                """
select
geoid,
statefp,
data_center_count,
population,
households,
broadband_subscription_pct,
median_household_income,
per_capita_income,
poverty_rate,
non_hispanic_white_pct,
non_hispanic_black_pct,
hispanic_latino_pct,
non_hispanic_asian_pct,
primary_industry,
land_area_sqm,
industry_information_workers,
industry_total_workers
from public.data_center_census_tracts_2024
"""
            )
            dc_tracts = [
                {
                    "geoid": r[0],
                    "statefp": r[1],
                    "dc_count": r[2] or 0,
                    "pop": r[3],
                    "hh": r[4],
                    "broadband_pct": opt_float(r[5]),
                    "mhi": r[6],
                    "pci": r[7],
                    "poverty": opt_float(r[8]),
                    "white_pct": opt_float(r[9]),
                    "black_pct": opt_float(r[10]),
                    "hisp_pct": opt_float(r[11]),
                    "asian_pct": opt_float(r[12]),
                    "primary_industry": r[13],
                    "land_sqm": r[14],
                    "info_workers": r[15],
                    "total_workers": r[16],
                }
                for r in cur.fetchall()
            ]
            # Distance from each DC tract to nearest cable (km) ----------------
            cur.execute(
                """
with cables as (select ST_Union(geom)::geography g from public.internet_cables)
select t.geoid,
ST_Distance(ST_Centroid(t.geom)::geography, c.g) / 1000.0
from public.data_center_census_tracts_2024 t, cables c
"""
            )
            # A NULL distance is left out so the tract gets dist_km=None below.
            dist_by_geoid = {
                r[0]: float(r[1]) for r in cur.fetchall() if r[1] is not None
            }
    finally:
        conn.close()
    for t in dc_tracts:
        t["dist_km"] = dist_by_geoid.get(t["geoid"])

    # Comparison universe from the wider ACS CSV ------------------------
    universe = []
    with open(CSV_PATH, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            universe.append(
                {
                    "geoid": row["geoid"],
                    "statefp": row["statefp"],
                    "pop": to_int(row["population"]),
                    "broadband_pct": to_float(row["broadband_subscription_pct"]),
                    "mhi": to_int(row["median_household_income"]),
                    "pci": to_int(row["per_capita_income"]),
                    "poverty": to_float(row["poverty_rate"]),
                    "white_pct": to_float(row["non_hispanic_white_pct"]),
                    "black_pct": to_float(row["non_hispanic_black_pct"]),
                    "hisp_pct": to_float(row["hispanic_latino_pct"]),
                    "asian_pct": to_float(row["non_hispanic_asian_pct"]),
                }
            )
    dc_geoids = {t["geoid"] for t in dc_tracts}
    non_dc = [u for u in universe if u["geoid"] not in dc_geoids]
    # Restrict comparison to states actually represented in the DC sample
    dc_states = {t["statefp"] for t in dc_tracts}
    universe_in_dc_states = [u for u in universe if u["statefp"] in dc_states]
    non_dc_in_dc_states = [u for u in non_dc if u["statefp"] in dc_states]

    # ============== report ==============
    print("=" * 72)
    print("TRACT-LEVEL CONCENTRATED COSTS / DISPERSED BENEFITS ANALYSIS")
    print("=" * 72)
    total_dc = sum(t["dc_count"] for t in dc_tracts)
    print(f"\nDC-hosting tracts: {len(dc_tracts):,}")
    print(f"Data centers in those tracts: {total_dc:,}")
    print(f"ACS universe (selected states): {len(universe):,} tracts")
    print(f"States represented in DC sample: {len(dc_states)}")
    print(f"Universe restricted to DC states: {len(universe_in_dc_states):,} tracts")
    if not total_dc:
        # Every share below divides by total_dc; nothing meaningful to report.
        print("\nNo data centers found; nothing further to report.")
        return

    # --- Cost concentration at the tract level ---
    print("\n" + "-" * 72)
    print("1. COST CONCENTRATION (DCs across tracts)")
    print("-" * 72)
    counts = [t["dc_count"] for t in dc_tracts]
    shares = [c / total_dc for c in counts]
    g_dc = gini(counts)
    h_dc = hhi(shares)
    print(f"Gini of DC counts across DC-hosting tracts: {show(g_dc, '.3f')}")
    print(f"HHI of DC shares across DC-hosting tracts: {h_dc:.4f}")
    # Top 1% / 5% / 20% of tracts share; each group holds at least one tract.
    top1 = max(1, len(counts) // 100)
    top5 = max(1, len(counts) // 20)
    top20 = max(1, len(counts) // 5)
    ranked = sorted(counts, reverse=True)
    top1_pct = sum(ranked[:top1]) / total_dc * 100
    top5_pct = sum(ranked[:top5]) / total_dc * 100
    top20_pct = sum(ranked[:top20]) / total_dc * 100
    print(f"Top 1% of DC-hosting tracts ({top1:>3} tracts) hold "
          f"{top1_pct:5.1f}% of all DCs")
    print(f"Top 5% of DC-hosting tracts ({top5:>3} tracts) hold "
          f"{top5_pct:5.1f}% of all DCs")
    print(f"Top 20% of DC-hosting tracts ({top20:>3} tracts) hold "
          f"{top20_pct:5.1f}% of all DCs")
    # How small a fraction of population lives in a DC tract?
    pop_dc = sum(t["pop"] or 0 for t in dc_tracts)
    pop_universe = sum(u["pop"] or 0 for u in universe_in_dc_states)
    pop_share_pct = pop_dc / pop_universe * 100 if pop_universe else None
    print(f"\nPopulation living in a DC-hosting tract: {pop_dc:>11,}")
    print(f"Total population (DC-states ACS universe): {pop_universe:>11,}")
    if pop_universe:
        print(f" -> {pop_share_pct:.2f}% of people in DC-hosting states "
              f"live in a DC-hosting tract")
    # Per-capita DC density
    if pop_dc:
        print(f" -> 1 DC per {pop_dc/total_dc:,.0f} residents in DC-hosting tracts")
    if pop_universe:
        print(f" vs. 1 DC per {pop_universe/total_dc:,.0f} residents "
              f"averaged across DC-state population")

    # --- Profile of cost-bearing communities ---
    print("\n" + "-" * 72)
    print("2. WHO BEARS THE COSTS? (ACS profile of DC tracts vs. peer tracts)")
    print("-" * 72)
    fields = [
        ("Median household income ($)", "mhi", "{:>10,.0f}"),
        ("Per-capita income ($)", "pci", "{:>10,.0f}"),
        ("Broadband subscription (%)", "broadband_pct", "{:>10,.1f}"),
        ("Poverty rate (%)", "poverty", "{:>10,.1f}"),
        ("Non-Hispanic White (%)", "white_pct", "{:>10,.1f}"),
        ("Non-Hispanic Black (%)", "black_pct", "{:>10,.1f}"),
        ("Hispanic/Latino (%)", "hisp_pct", "{:>10,.1f}"),
        ("Non-Hispanic Asian (%)", "asian_pct", "{:>10,.1f}"),
    ]
    label_w = max(len(lbl) for lbl, *_ in fields)
    # The delta column is DC median minus peer median (see `delta` below).
    print(f"{'Field':<{label_w}} {'DC tracts':>12} {'Non-DC peers':>14} "
          f"{'Δ (DC - peer)':>15}")
    for label, key, fmt in fields:
        dc_med = median([t[key] for t in dc_tracts])
        peer_med = median([u[key] for u in non_dc_in_dc_states])
        if dc_med is None or peer_med is None:
            continue
        delta = dc_med - peer_med
        cell_dc = fmt.format(dc_med)
        cell_pe = fmt.format(peer_med)
        cell_dl = fmt.format(delta)
        print(f"{label:<{label_w}} {cell_dc} {cell_pe} {cell_dl}")
    print("\nPopulation-weighted means (DC tracts):")
    pops = [t["pop"] for t in dc_tracts]
    for label, key, _ in fields:
        wm = wmean([t[key] for t in dc_tracts], pops)
        if wm is not None:
            print(f" {label:<{label_w}} {wm:>12,.1f}")
    print("\nPrimary-industry mix of DC-hosting tracts (count of tracts):")
    for industry, n in Counter(t["primary_industry"] for t in dc_tracts).most_common(10):
        print(f" {n:>4} {industry}")

    # --- Cable vs. inland subgroups ---
    print("\n" + "-" * 72)
    print("3. CABLE-ADJACENT vs. INLAND DC TRACTS")
    print("-" * 72)
    near = [t for t in dc_tracts if t.get("dist_km") is not None and t["dist_km"] <= 100]
    far = [t for t in dc_tracts if t.get("dist_km") is not None and t["dist_km"] > 100]
    print(f"≤100 km from a submarine cable: {len(near):>3} tracts, "
          f"{sum(t['dc_count'] for t in near):>4} DCs")
    print(f">100 km from a submarine cable: {len(far):>3} tracts, "
          f"{sum(t['dc_count'] for t in far):>4} DCs")
    if near and far:
        for row_label, key, spec in (
            (" Median MHI", "mhi", ">10,.0f"),
            (" Median broadband %", "broadband_pct", ">10,.1f"),
            (" Median DC count", "dc_count", ">10,.0f"),
        ):
            near_med = median([t[key] for t in near])
            far_med = median([t[key] for t in far])
            print(f"{row_label:<28} near={show(near_med, spec)} "
                  f"far={show(far_med, spec)}")

    # --- Benefit-side proxy ---
    print("\n" + "-" * 72)
    print("4. BENEFIT DISPERSION (broadband subscribers across all tracts)")
    print("-" * 72)
    # Total broadband subscribers approx = households * broadband_pct.
    # Households are not in the CSV, so population / 2.5 is used instead.
    subs = []
    for u in universe_in_dc_states:
        if u["pop"] and u["broadband_pct"] is not None:
            est_hh = u["pop"] / 2.5
            subs.append(est_hh * u["broadband_pct"] / 100.0)
    total_subs = sum(subs)
    sg = gini(subs)
    sh = hhi([x / total_subs for x in subs]) if total_subs else None
    # How many times more concentrated DCs are than subscribers (None if undefined).
    hhi_ratio = h_dc / sh if sh else None
    print(f"Estimated total broadband subscribers (DC states): {total_subs:>14,.0f}")
    print(f"Gini of subscribers across {len(subs):,} tracts: {show(sg, '.3f')}")
    print(f"HHI of subscribers across tracts: {show(sh, '.5f')}")
    # Compare to DC HHI
    print("\nSide-by-side concentration (lower = more dispersed):")
    print(f" HHI of DCs across DC-hosting tracts: {h_dc:.4f}")
    print(f" HHI of broadband subs across DC-state tracts: {show(sh, '.5f')} "
          f"({show(hhi_ratio, '.0f')}x more concentrated for DCs)")

    print("\n" + "=" * 72)
    print("BOTTOM LINE")
    print("=" * 72)
    wealth_word = pick(
        median([t["mhi"] for t in dc_tracts]),
        median([u["mhi"] for u in non_dc_in_dc_states]),
        "wealthier",
        "poorer",
    )
    broadband_word = pick(
        median([t["broadband_pct"] for t in dc_tracts]),
        median([u["broadband_pct"] for u in non_dc_in_dc_states]),
        "higher",
        "lower",
    )
    print(f"""
- DCs are extremely concentrated at the tract level: top 1% of host tracts
hold {top1_pct:.0f}% of all DCs; top 5% hold {top5_pct:.0f}%.
- Only {show(pop_share_pct, '.2f')}% of residents of the DC-hosting states actually
live in a DC-hosting tract — costs (land use, power draw, water, traffic,
noise) fall on a tiny minority of communities.
- DC-hosting tracts skew {wealth_word} and {broadband_word} broadband than peer
tracts. See deltas above for the demographic profile.
- Broadband subscribers (proxy for who consumes cloud services) are far more
evenly distributed than DCs — HHI roughly {show(hhi_ratio, '.0f')}× lower.
That asymmetry IS the classic concentrated-cost / dispersed-benefit shape.
""")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,397 @@
#!/usr/bin/env python3
"""Build data-center broadband connection tables.
Creates a per-data-center broadband connection table and, when FCC BDC API
credentials are available, stores the FCC BDC public download catalog.
Required DB env vars:
PGWEB_HOST, PGWEB_PORT, PGWEB_USER, PGWEB_PASSWORD
FCC API env vars:
FCC_USERNAME or FCC_BDC_USERNAME - FCC User Registration username/email
FCC_API_KEY or FCC_HASH_VALUE - BDC public API hash_value token
"""
from __future__ import annotations
import argparse
import json
import os
import subprocess
import sys
from datetime import date, datetime
from pathlib import Path
from typing import Any
import psycopg2
import requests
from psycopg2.extras import Json, execute_values
# PostgreSQL database name (the same value get_conn passes as dbname).
DB_NAME = "data_centers"
# Source table of data centers; rebuild_connection_base selects from it.
MASTER_TABLE = "public.master_data_centers"
# Census-tract table joined on geoid for the broadband subscription percentage.
TRACT_TABLE = "public.data_center_census_tracts_2024"
# Tables created and filled by this script.
AS_OF_TABLE = "public.fcc_bdc_api_as_of_dates"
FILES_TABLE = "public.fcc_bdc_availability_files"
CONNECTION_TABLE = "public.data_center_broadband_connection"
# Base URL that fcc_get prefixes to every request path.
FCC_BASE_URL = "https://broadbandmap.fcc.gov/api/public"
# Sent as the user-agent header on FCC requests.
USER_AGENT = "data-center-fcc-bdc-loader/1.0"
def load_zsh_secrets() -> None:
    """Copy variables exported by ``~/.zsh_secrets`` into ``os.environ``.

    Does nothing when the file is absent. Otherwise a login ``zsh`` sources
    the file and dumps its environment with ``env``; every ``KEY=VALUE`` line
    whose key is not already set in this process is added. Values are never
    printed. A failing shell raises ``subprocess.CalledProcessError``.
    """
    secrets_file = Path.home() / ".zsh_secrets"
    if not secrets_file.exists():
        return
    completed = subprocess.run(
        ["zsh", "-lc", "source ~/.zsh_secrets >/dev/null 2>&1; env"],
        check=True,
        capture_output=True,
        text=True,
    )
    for entry in completed.stdout.splitlines():
        name, sep, value = entry.partition("=")
        # Skip lines without "=" or with an empty key; never override
        # a variable this process already has.
        if sep and name:
            os.environ.setdefault(name, value)
def require_env(keys: list[str]) -> None:
    """Raise ``RuntimeError`` listing every variable in *keys* that is unset or empty."""
    absent = []
    for name in keys:
        if not os.getenv(name):
            absent.append(name)
    if not absent:
        return
    raise RuntimeError("Missing required env vars: " + ", ".join(absent))
def get_conn():
    """Open a psycopg2 connection to the ``DB_NAME`` database.

    Host, port, user and password are read from the ``PGWEB_*`` environment
    variables (``KeyError`` if one is missing). The database name comes from
    the module-level ``DB_NAME`` constant rather than a repeated literal.
    """
    return psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname=DB_NAME,
    )
def fcc_credentials() -> tuple[str | None, str | None]:
    """Return ``(username, hash_value)`` for the FCC BDC API from the environment.

    The username is ``FCC_USERNAME``, falling back to ``FCC_BDC_USERNAME``;
    the token is ``FCC_API_KEY``, falling back to ``FCC_HASH_VALUE``. An
    unset or empty primary variable triggers the fallback.
    """

    def first_set(primary: str, fallback: str) -> str | None:
        value = os.getenv(primary)
        return value if value else os.getenv(fallback)

    return (
        first_set("FCC_USERNAME", "FCC_BDC_USERNAME"),
        first_set("FCC_API_KEY", "FCC_HASH_VALUE"),
    )
def fcc_get(path: str, *, params: dict[str, Any] | None = None) -> dict[str, Any]:
    """GET ``FCC_BASE_URL + path`` and return the decoded JSON payload.

    Credentials from ``fcc_credentials()`` are sent as request headers.

    Raises:
        RuntimeError: if credentials are missing, or if the payload reports
            status_code 401/403 or ``status == "fail"``.
        requests.HTTPError: from ``raise_for_status()`` on an HTTP error.
    """
    username, hash_value = fcc_credentials()
    if not (username and hash_value):
        raise RuntimeError(
            "FCC BDC API requires FCC_USERNAME or FCC_BDC_USERNAME plus "
            "FCC_API_KEY or FCC_HASH_VALUE."
        )
    response = requests.get(
        f"{FCC_BASE_URL}{path}",
        headers={
            "username": username,
            "hash_value": hash_value,
            "user-agent": USER_AGENT,
            "accept": "application/json",
        },
        params=params or {},
        timeout=60,
    )
    response.raise_for_status()
    payload = response.json()
    # The payload can carry its own failure indicators; check those as well.
    denied = str(payload.get("status_code")) in {"401", "403"}
    if denied or payload.get("status") == "fail":
        raise RuntimeError(f"FCC API error for {path}: {payload}")
    return payload
def parse_date(value: Any) -> date | None:
    """Coerce *value* to a date.

    ``None`` and ``""`` give ``None``; ``date`` instances (including
    ``datetime``) are returned unchanged; anything else is stringified and
    its first 10 characters are parsed as ``YYYY-MM-DD`` (``ValueError`` if
    they do not match).
    """
    if value in (None, ""):
        return None
    if isinstance(value, date):
        return value
    day_text = str(value)[:10]
    parsed = datetime.strptime(day_text, "%Y-%m-%d")
    return parsed.date()
def to_int(value: Any) -> int | None:
    """Parse *value* as an integer, ignoring thousands separators.

    ``None``, ``""`` and anything that is not an integer literal after
    commas are removed give ``None``.
    """
    if value in (None, ""):
        return None
    digits = str(value).replace(",", "")
    try:
        number = int(digits)
    except (TypeError, ValueError):
        return None
    return number
def create_tables(cur) -> None:
    """Create the PostGIS extension, tables and indexes if they do not exist.

    Creates ``AS_OF_TABLE``, ``FILES_TABLE`` and ``CONNECTION_TABLE`` plus
    their indexes using ``if not exists`` DDL, so the function can be re-run.
    The foreign key on ``CONNECTION_TABLE.master_id`` references
    ``MASTER_TABLE`` through the module constant, matching the table that
    ``rebuild_connection_base`` reads from.

    Args:
        cur: Open database cursor used to execute the statements.
    """
    cur.execute("create extension if not exists postgis")
    cur.execute(
        f"""
create table if not exists {AS_OF_TABLE} (
data_type text not null,
as_of_date date not null,
raw jsonb not null,
fetched_at timestamptz not null default now(),
primary key (data_type, as_of_date)
)
"""
    )
    cur.execute(
        f"""
create table if not exists {FILES_TABLE} (
as_of_date date not null,
file_id bigint not null,
category text,
subcategory text,
technology_type text,
technology_code text,
technology_code_desc text,
speed_tier text,
state_fips text,
state_name text,
provider_id bigint,
provider_name text,
file_type text,
file_name text,
record_count bigint,
raw jsonb not null,
fetched_at timestamptz not null default now(),
primary key (as_of_date, file_id)
)
"""
    )
    cur.execute(
        f"create index if not exists fcc_bdc_availability_files_category_idx "
        f"on {FILES_TABLE} (category, subcategory)"
    )
    cur.execute(
        f"create index if not exists fcc_bdc_availability_files_state_idx "
        f"on {FILES_TABLE} (state_fips)"
    )
    cur.execute(
        f"create index if not exists fcc_bdc_availability_files_provider_idx "
        f"on {FILES_TABLE} (provider_id)"
    )
    cur.execute(
        f"""
create table if not exists {CONNECTION_TABLE} (
master_id text primary key references {MASTER_TABLE}(master_id) on delete cascade,
source text,
name text,
operator text,
city text,
state text,
country text,
longitude double precision,
latitude double precision,
geom geometry(Point, 4326),
census_tract_geoid text,
census_broadband_subscription_pct numeric,
fcc_bdc_status text not null,
fcc_bdc_as_of_date date,
fcc_bdc_geography_type text,
fcc_bdc_geoid text,
fcc_provider_count integer,
fcc_fiber_provider_count integer,
fcc_cable_provider_count integer,
fcc_fixed_wireless_provider_count integer,
fcc_max_advertised_download_mbps numeric,
fcc_max_advertised_upload_mbps numeric,
fcc_100_20_provider_count integer,
fcc_summary_json jsonb,
fetched_at timestamptz not null default now(),
updated_at timestamptz not null default now()
)
"""
    )
    cur.execute(
        f"create index if not exists data_center_broadband_connection_geom_gix "
        f"on {CONNECTION_TABLE} using gist (geom)"
    )
    cur.execute(
        f"create index if not exists data_center_broadband_connection_tract_idx "
        f"on {CONNECTION_TABLE} (census_tract_geoid)"
    )
    cur.execute(
        f"create index if not exists data_center_broadband_connection_status_idx "
        f"on {CONNECTION_TABLE} (fcc_bdc_status)"
    )
def rebuild_connection_base(cur, status: str) -> int:
    """Empty ``CONNECTION_TABLE`` and refill it from ``MASTER_TABLE``.

    Each data center is left-joined to ``TRACT_TABLE`` on geoid to pick up
    the tract's broadband subscription percentage, and every inserted row
    gets *status* as its ``fcc_bdc_status``.

    Returns:
        The number of rows in ``CONNECTION_TABLE`` after the rebuild.
    """
    cur.execute(f"truncate {CONNECTION_TABLE}")
    insert_sql = f"""
insert into {CONNECTION_TABLE} (
master_id, source, name, operator, city, state, country,
longitude, latitude, geom,
census_tract_geoid, census_broadband_subscription_pct,
fcc_bdc_status
)
select
dc.master_id, dc.source, dc.name, dc.operator, dc.city, dc.state, dc.country,
dc.longitude, dc.latitude, dc.geom,
dc.geoid as census_tract_geoid,
tr.broadband_subscription_pct as census_broadband_subscription_pct,
%s as fcc_bdc_status
from {MASTER_TABLE} dc
left join {TRACT_TABLE} tr on tr.geoid::text = dc.geoid::text
"""
    cur.execute(insert_sql, (status,))
    cur.execute(f"select count(*) from {CONNECTION_TABLE}")
    count_row = cur.fetchone()
    return count_row[0]
def latest_availability_date(rows: list[dict[str, Any]]) -> date | None:
    """Return the newest ``as_of_date`` among availability rows, or ``None``.

    A row counts as an availability row when its ``data_type``, lower-cased,
    is ``"availability"`` or ``"availability data"``. Rows whose date parses
    to ``None`` are ignored.
    """
    wanted = {"availability", "availability data"}
    newest = None
    for row in rows:
        if str(row.get("data_type", "")).lower() not in wanted:
            continue
        parsed = parse_date(row.get("as_of_date"))
        if parsed is None:
            continue
        if newest is None or parsed > newest:
            newest = parsed
    return newest
def load_as_of_dates(cur) -> date:
    """Fetch the FCC as-of dates, upsert them, and return the latest availability date.

    Every row of ``/map/listAsOfDates`` with a parseable ``as_of_date`` is
    upserted into ``AS_OF_TABLE`` keyed on ``(data_type, as_of_date)``.

    Raises:
        RuntimeError: if the response contains no availability as-of date.
    """
    rows = fcc_get("/map/listAsOfDates").get("data") or []
    records = []
    for row in rows:
        parsed = parse_date(row.get("as_of_date"))
        if parsed:
            records.append((row.get("data_type"), parsed, Json(row)))
    if records:
        upsert_sql = f"""
insert into {AS_OF_TABLE} (data_type, as_of_date, raw)
values %s
on conflict (data_type, as_of_date) do update set
raw = excluded.raw,
fetched_at = now()
"""
        execute_values(cur, upsert_sql, records, page_size=1000)
    newest = latest_availability_date(rows)
    if newest is not None:
        return newest
    raise RuntimeError(f"Could not find an availability as_of_date in FCC response: {rows}")
def load_availability_file_catalog(cur, as_of_date: date) -> int:
    """Replace the stored Fixed Broadband file catalog for *as_of_date*.

    Fetches the availability download list for the date, skips rows without
    an integer ``file_id``, and, if any rows remain, deletes the existing
    ``FILES_TABLE`` rows for that date before inserting the new ones.

    Returns:
        The number of catalog rows inserted (0 leaves the table untouched).
    """
    payload = fcc_get(
        f"/map/downloads/listAvailabilityData/{as_of_date:%Y-%m-%d}",
        params={"technology_type": "Fixed Broadband"},
    )
    records = []
    for row in payload.get("data") or []:
        file_id = to_int(row.get("file_id"))
        if file_id is None:
            continue
        field = row.get
        records.append(
            (
                as_of_date,
                file_id,
                field("category"),
                field("subcategory"),
                field("technology_type"),
                field("technology_code"),
                field("technology_code_desc"),
                field("speed_tier"),
                field("state_fips"),
                field("state_name"),
                to_int(field("provider_id")),
                field("provider_name"),
                field("file_type"),
                field("file_name"),
                to_int(field("record_count")),
                Json(row),
            )
        )
    if records:
        cur.execute(f"delete from {FILES_TABLE} where as_of_date = %s", (as_of_date,))
        insert_sql = f"""
insert into {FILES_TABLE} (
as_of_date, file_id, category, subcategory, technology_type,
technology_code, technology_code_desc, speed_tier, state_fips,
state_name, provider_id, provider_name, file_type, file_name,
record_count, raw
)
values %s
"""
        execute_values(cur, insert_sql, records, page_size=1000)
    return len(records)
def main() -> int:
    """Build the base connection table and, unless skipped, load the FCC catalog.

    Returns:
        0 on success (including ``--skip-fcc``), 2 when FCC credentials are
        missing and only the base table was rebuilt.

    The connection is closed explicitly: psycopg2's ``with conn`` block only
    commits or rolls back the transaction and leaves the connection open.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--skip-fcc", action="store_true", help="Only create/rebuild the base connection table.")
    parser.add_argument("--as-of-date", help="FCC BDC availability as-of date, YYYY-MM-DD. Defaults to latest.")
    args = parser.parse_args()
    load_zsh_secrets()
    require_env(["PGWEB_HOST", "PGWEB_PORT", "PGWEB_USER", "PGWEB_PASSWORD"])
    username, hash_value = fcc_credentials()
    status = "pending_fcc_username" if hash_value and not username else "pending_fcc_catalog"
    if args.skip_fcc:
        status = "fcc_skipped"

    conn = get_conn()
    try:
        with conn:
            with conn.cursor() as cur:
                create_tables(cur)
                n_connection = rebuild_connection_base(cur, status)
                print(f"{CONNECTION_TABLE}: {n_connection:,} base rows")
                if args.skip_fcc:
                    conn.commit()
                    return 0
                if not username or not hash_value:
                    print(
                        "FCC catalog not loaded: set FCC_USERNAME or FCC_BDC_USERNAME "
                        "alongside FCC_API_KEY/FCC_HASH_VALUE in ~/.zsh_secrets.",
                        file=sys.stderr,
                    )
                    conn.commit()
                    return 2
                as_of_date = parse_date(args.as_of_date) if args.as_of_date else load_as_of_dates(cur)
                n_files = load_availability_file_catalog(cur, as_of_date)
                cur.execute(
                    f"""
update {CONNECTION_TABLE}
set fcc_bdc_status = 'fcc_catalog_loaded',
fcc_bdc_as_of_date = %s,
updated_at = now()
""",
                    (as_of_date,),
                )
            conn.commit()
            if args.as_of_date:
                # AS_OF_TABLE is not touched when the date comes from the CLI.
                print(f"Using requested availability date {as_of_date}")
            else:
                print(f"{AS_OF_TABLE}: loaded latest availability date {as_of_date}")
            print(f"{FILES_TABLE}: {n_files:,} fixed-broadband file catalog rows")
            return 0
    finally:
        conn.close()
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,806 @@
#!/usr/bin/env python3
"""Build FCC BDC provider aggregates for data-center counties and tracts.
This script uses FCC BDC State / Location Coverage files. Those files are
provider/location-level and include block GEOIDs, so they can be aggregated to
county and tract provider counts for only the geographies that contain data
centers.
"""
from __future__ import annotations
import argparse
import os
import tempfile
import zipfile
from collections.abc import Iterable
from datetime import date
from pathlib import Path
from typing import Any
import pandas as pd
import requests
from psycopg2.extras import execute_values
from build_fcc_bdc_broadband_connection_table import (
CONNECTION_TABLE,
FCC_BASE_URL,
FILES_TABLE,
USER_AGENT,
fcc_credentials,
get_conn,
load_zsh_secrets,
parse_date,
require_env,
)
# Tables managed by this script. DETAIL_TABLE, AGG_TABLE and PROGRESS_TABLE
# are created in create_tables; CROSSWALK_TABLE's use is outside this view.
DETAIL_TABLE = "public.fcc_bdc_location_provider_geography_provider"
AGG_TABLE = "public.fcc_bdc_location_provider_aggregate"
PROGRESS_TABLE = "public.fcc_bdc_location_provider_file_progress"
CROSSWALK_TABLE = "public.fcc_bdc_geoid_crosswalk"
# FCC BDC technology codes treated as terrestrial.
# NOTE(review): presumably 10=copper, 40=cable, 50=fiber, 70-72=fixed
# wireless — confirm against the FCC BDC data specification.
TERRESTRIAL_TECHNOLOGY_CODES = ("10", "40", "50", "70", "71", "72")
# Subset of the codes above classified as fixed wireless.
FIXED_WIRELESS_CODES = {"70", "71", "72"}
# Column names selected from the coverage CSV files
# (presumably passed to pandas as usecols — confirm at the call site).
CSV_USECOLS = [
    "provider_id",
    "block_geoid",
    "technology",
    "max_advertised_download_speed",
    "max_advertised_upload_speed",
    "business_residential_code",
]
# Tract GEOID remapping for six Connecticut tracts (state FIPS 09).
# NOTE(review): by its name this maps planning-region-based GEOIDs to the
# legacy county-based ones — confirm direction and completeness.
CT_PLANNING_TO_LEGACY_TRACT_GEOIDS = {
    "09110520302": "09003520302",
    "09120090500": "09001090500",
    "09170175800": "09009175800",
    "09190020101": "09001020101",
    "09190020900": "09001020900",
    "09190044300": "09001044300",
}
def fcc_download_headers() -> dict[str, str]:
    """Build the request headers for downloading FCC BDC files.

    Returns the credential headers from ``fcc_credentials()`` together with
    the user agent and an ``accept`` value that prefers zip content.

    Raises:
        RuntimeError: if the username or the hash value is missing.
    """
    username, hash_value = fcc_credentials()
    if username and hash_value:
        return {
            "username": username,
            "hash_value": hash_value,
            "user-agent": USER_AGENT,
            "accept": "application/zip,*/*",
        }
    raise RuntimeError(
        "FCC BDC API requires FCC_USERNAME or FCC_BDC_USERNAME plus "
        "FCC_API_KEY or FCC_HASH_VALUE."
    )
def normalize_codes(values: Iterable[str]) -> tuple[str, ...]:
    """Return *values* stringified and stripped, dropping entries that end up empty."""
    stripped = (str(v).strip() for v in values)
    return tuple(code for code in stripped if code)
def create_tables(cur) -> None:
    """Create the location-provider tables and connection-table columns if missing.

    Every statement uses ``if not exists``, so the function can be run
    repeatedly. Nothing is committed here; the caller owns the transaction.

    Args:
        cur: Open database cursor used to execute the DDL.
    """
    ddl_statements = (
        # Per-file, per-geography, per-provider detail rows.
        f"""
        create table if not exists {DETAIL_TABLE} (
            as_of_date date not null,
            file_id bigint not null,
            geography_type text not null check (geography_type in ('County', 'Tract')),
            geoid text not null,
            provider_id bigint not null,
            has_fiber boolean not null default false,
            has_cable boolean not null default false,
            has_fixed_wireless boolean not null default false,
            has_copper boolean not null default false,
            has_100_20 boolean not null default false,
            has_business boolean not null default false,
            has_business_fiber boolean not null default false,
            has_business_100_20 boolean not null default false,
            max_advertised_download_mbps numeric,
            max_advertised_upload_mbps numeric,
            matched_location_rows bigint not null default 0,
            updated_at timestamptz not null default now(),
            primary key (as_of_date, file_id, geography_type, geoid, provider_id)
        )
        """,
        f"create index if not exists fcc_bdc_location_provider_geo_idx "
        f"on {DETAIL_TABLE} (as_of_date, geography_type, geoid)",
        # Per-geography provider counts.
        f"""
        create table if not exists {AGG_TABLE} (
            as_of_date date not null,
            geography_type text not null check (geography_type in ('County', 'Tract')),
            geoid text not null,
            provider_count integer not null,
            fiber_provider_count integer not null,
            cable_provider_count integer not null,
            fixed_wireless_provider_count integer not null,
            copper_provider_count integer not null,
            provider_100_20_count integer not null,
            business_provider_count integer not null,
            business_fiber_provider_count integer not null,
            business_100_20_provider_count integer not null,
            max_advertised_download_mbps numeric,
            max_advertised_upload_mbps numeric,
            matched_location_rows bigint not null,
            provider_file_rows bigint not null,
            updated_at timestamptz not null default now(),
            primary key (as_of_date, geography_type, geoid)
        )
        """,
        # Per-file completion records.
        f"""
        create table if not exists {PROGRESS_TABLE} (
            as_of_date date not null,
            file_id bigint not null,
            state_fips text not null,
            technology_code text,
            technology_code_desc text,
            record_count bigint,
            matched_location_rows bigint not null,
            provider_geo_rows bigint not null,
            processed_at timestamptz not null default now(),
            primary key (as_of_date, file_id)
        )
        """,
        # Source GEOID -> FCC GEOID mapping.
        f"""
        create table if not exists {CROSSWALK_TABLE} (
            source_geography_type text not null,
            source_geoid text not null,
            fcc_geography_type text not null,
            fcc_geoid text not null,
            method text not null,
            notes text,
            updated_at timestamptz not null default now(),
            primary key (source_geography_type, source_geoid, fcc_geography_type)
        )
        """,
    )
    for statement in ddl_statements:
        cur.execute(statement)

    # The county and tract column families share the same suffixes and types.
    metric_definitions = (
        "provider_count integer",
        "fiber_provider_count integer",
        "cable_provider_count integer",
        "fixed_wireless_provider_count integer",
        "100_20_provider_count integer",
        "business_provider_count integer",
        "business_fiber_provider_count integer",
        "business_100_20_provider_count integer",
        "max_advertised_download_mbps numeric",
        "max_advertised_upload_mbps numeric",
    )
    column_definitions = [
        "fcc_provider_geography_type text",
        "fcc_provider_geoid text",
    ]
    for level in ("county", "tract"):
        for metric in metric_definitions:
            column_definitions.append(f"fcc_{level}_{metric}")

    for definition in column_definitions:
        cur.execute(f"alter table {CONNECTION_TABLE} add column if not exists {definition}")
def seed_geoid_crosswalk(cur) -> None:
    """Upsert the Connecticut tract GEOID mappings into the crosswalk table.

    One tract-to-tract row is written per entry of
    ``CT_PLANNING_TO_LEGACY_TRACT_GEOIDS``. Rows that already exist have
    their FCC GEOID, method, notes and ``updated_at`` overwritten.

    Args:
        cur: Open database cursor; the caller is responsible for committing.
    """
    method = "ct_planning_region_to_legacy_county_same_tractce"
    notes = (
        "Connecticut 2024 tract GEOIDs use planning-region county equivalents; "
        "FCC BDC block GEOIDs use legacy county codes."
    )
    crosswalk_rows = []
    for source_geoid, fcc_geoid in CT_PLANNING_TO_LEGACY_TRACT_GEOIDS.items():
        crosswalk_rows.append(("Tract", source_geoid, "Tract", fcc_geoid, method, notes))

    upsert_sql = f"""
        insert into {CROSSWALK_TABLE} (
            source_geography_type, source_geoid, fcc_geography_type,
            fcc_geoid, method, notes
        )
        values %s
        on conflict (source_geography_type, source_geoid, fcc_geography_type)
        do update set
            fcc_geoid = excluded.fcc_geoid,
            method = excluded.method,
            notes = excluded.notes,
            updated_at = now()
        """
    execute_values(cur, upsert_sql, crosswalk_rows)
def latest_catalog_date(cur) -> date:
    """Return the most recent ``as_of_date`` present in the FCC file catalog.

    Raises:
        RuntimeError: If the catalog table holds no as-of dates at all.
    """
    cur.execute(f"select max(as_of_date) from {FILES_TABLE}")
    row = cur.fetchone()
    newest = row[0]
    if newest is not None:
        return newest
    raise RuntimeError(f"No FCC catalog rows found in {FILES_TABLE}. Run the FCC catalog load first.")
def target_geographies(cur, states: tuple[str, ...] | None = None) -> tuple[set[str], set[str], set[str]]:
    """Collect the state, county and tract GEOIDs that contain data centers.

    GEOIDs are taken as prefixes (2, 5 and 11 characters) of
    ``census_tract_geoid`` in the connection table. Tracts that have a
    crosswalk entry also contribute their FCC tract GEOID and its 5-character
    county prefix, so FCC-side identifiers are matched as well.

    Args:
        cur: Open database cursor.
        states: Optional state FIPS codes to restrict to; a falsy value
            means no restriction.

    Returns:
        ``(states_found, counties, tracts)`` as sets of GEOID strings.
    """
    restrict = bool(states)
    where_clause = "where left(census_tract_geoid, 2) = any(%s)" if restrict else ""
    query_args: list[Any] = [list(states)] if restrict else []
    cur.execute(
        f"""
        select distinct
            left(census_tract_geoid, 2) as state_fips,
            left(census_tract_geoid, 5) as county_geoid,
            left(census_tract_geoid, 11) as tract_geoid
        from {CONNECTION_TABLE}
        {where_clause}
        """,
        query_args,
    )

    states_found: set[str] = set()
    counties: set[str] = set()
    tracts: set[str] = set()
    for state_fips, county_geoid, tract_geoid in cur.fetchall():
        # Null or empty prefixes are ignored.
        if state_fips:
            states_found.add(state_fips)
        if county_geoid:
            counties.add(county_geoid)
        if tract_geoid:
            tracts.add(tract_geoid)

    if not tracts:
        return states_found, counties, tracts

    cur.execute(
        f"""
        select fcc_geoid
        from {CROSSWALK_TABLE}
        where source_geography_type = 'Tract'
          and fcc_geography_type = 'Tract'
          and source_geoid = any(%s)
        """,
        (list(tracts),),
    )
    fcc_tracts = {row[0] for row in cur.fetchall() if row[0]}
    tracts |= fcc_tracts
    counties |= {fcc_tract[:5] for fcc_tract in fcc_tracts}
    return states_found, counties, tracts
def catalog_files(
    cur,
    as_of_date: date,
    states: set[str],
    technology_codes: tuple[str, ...],
    limit: int | None,
) -> list[dict[str, Any]]:
    """List the State / Location Coverage catalog files to process.

    Args:
        cur: Open database cursor.
        as_of_date: Catalog as-of date to select.
        states: State FIPS codes to include.
        technology_codes: Technology codes to include.
        limit: Keep only the first ``limit`` files; ``None`` keeps all.

    Returns:
        One dict per file, ordered by state, technology code and file id,
        with ``file_id`` converted to ``int`` and ``technology_code`` to ``str``.
    """
    cur.execute(
        f"""
        select file_id, state_fips, technology_code, technology_code_desc, file_name, record_count
        from {FILES_TABLE}
        where as_of_date = %s
          and category = 'State'
          and subcategory = 'Location Coverage'
          and state_fips = any(%s)
          and technology_code = any(%s)
        order by state_fips, technology_code, file_id
        """,
        (as_of_date, list(states), list(technology_codes)),
    )
    selected: list[dict[str, Any]] = []
    for file_id, state_fips, technology_code, technology_code_desc, file_name, record_count in cur.fetchall():
        selected.append(
            {
                "file_id": int(file_id),
                "state_fips": state_fips,
                "technology_code": str(technology_code),
                "technology_code_desc": technology_code_desc,
                "file_name": file_name,
                "record_count": record_count,
            }
        )
    if limit is None:
        return selected
    return selected[:limit]
def progress_done(cur, as_of_date: date, file_id: int) -> bool:
    """Return True when a progress row exists for this as-of date and file."""
    lookup_sql = f"select 1 from {PROGRESS_TABLE} where as_of_date = %s and file_id = %s"
    cur.execute(lookup_sql, (as_of_date, file_id))
    found = cur.fetchone()
    return found is not None
def download_file(file_id: int, dest_dir: Path) -> Path:
    """Stream one FCC availability ZIP into *dest_dir* and return its path.

    The response body is written in 1 MiB chunks so the whole archive is
    never held in memory.

    Raises:
        requests.HTTPError: If the FCC endpoint answers with an error status.
    """
    endpoint = f"{FCC_BASE_URL}/map/downloads/downloadFile/availability/{file_id}"
    target = dest_dir / f"fcc_bdc_availability_{file_id}.zip"
    with requests.get(
        endpoint,
        headers=fcc_download_headers(),
        stream=True,
        timeout=(15, 300),
    ) as response:
        response.raise_for_status()
        with target.open("wb") as sink:
            # Empty chunks are skipped, as before.
            sink.writelines(
                piece for piece in response.iter_content(chunk_size=1024 * 1024) if piece
            )
    return target
def normalize_block_geoid(series: pd.Series) -> pd.Series:
    """Return block GEOIDs as 15-character, zero-padded strings.

    Values are converted to pandas ``string`` dtype, a trailing ``.0`` is
    removed, and the result is left-padded with zeros to width 15.
    """
    as_text = series.astype("string")
    without_float_suffix = as_text.str.replace(r"\.0$", "", regex=True)
    return without_float_suffix.str.zfill(15)
def summarize_matches(
    chunk: pd.DataFrame,
    geography_type: str,
    target_geoids: set[str],
) -> tuple[pd.DataFrame, int]:
    """Reduce one CSV chunk to per-(geoid, provider) rows for the target geographies.

    Args:
        chunk: Rows from a coverage CSV, including a ``block_geoid_norm`` column.
        geography_type: ``"County"`` uses the first 5 GEOID characters; any
            other value uses the first 11.
        target_geoids: GEOIDs to keep.

    Returns:
        ``(grouped, matched_count)``. ``grouped`` has one row per
        ``(geoid, provider_id_num)`` with OR-ed capability flags, maximum
        advertised speeds and the number of contributing rows.
        ``matched_count`` is the number of kept rows that had a numeric
        provider id. An empty frame and ``0`` are returned when nothing
        qualifies.
    """
    prefix_length = 5 if geography_type == "County" else 11
    geoid_prefix = chunk["block_geoid_norm"].str[:prefix_length]

    hits = chunk[geoid_prefix.isin(target_geoids)].copy()
    if hits.empty:
        return pd.DataFrame(), 0

    hits["geoid"] = geoid_prefix[hits.index]
    hits["provider_id_num"] = pd.to_numeric(hits["provider_id"], errors="coerce")
    # Rows whose provider id is not numeric cannot be keyed and are dropped.
    hits = hits[hits["provider_id_num"].notna()].copy()
    if hits.empty:
        return pd.DataFrame(), 0
    hits["provider_id_num"] = hits["provider_id_num"].astype("int64")

    technology = hits["technology"].astype("string").str.replace(r"\.0$", "", regex=True)
    download = pd.to_numeric(hits["max_advertised_download_speed"], errors="coerce")
    upload = pd.to_numeric(hits["max_advertised_upload_speed"], errors="coerce")
    segment = hits["business_residential_code"].astype("string").str.upper().fillna("")
    is_business = segment.isin(["B", "X"])
    is_fiber = technology.eq("50")
    meets_100_20 = download.ge(100) & upload.ge(20)

    hits["has_fiber"] = is_fiber
    hits["has_cable"] = technology.eq("40")
    hits["has_fixed_wireless"] = technology.isin(FIXED_WIRELESS_CODES)
    hits["has_copper"] = technology.eq("10")
    hits["has_100_20"] = meets_100_20
    hits["has_business"] = is_business
    hits["has_business_fiber"] = is_business & is_fiber
    hits["has_business_100_20"] = is_business & meets_100_20
    hits["max_down"] = download
    hits["max_up"] = upload
    hits["matched_location_rows"] = 1

    # "max" over a boolean flag acts as a logical OR within the group.
    max_columns = (
        "has_fiber",
        "has_cable",
        "has_fixed_wireless",
        "has_copper",
        "has_100_20",
        "has_business",
        "has_business_fiber",
        "has_business_100_20",
        "max_down",
        "max_up",
    )
    aggregations = {column: (column, "max") for column in max_columns}
    aggregations["matched_location_rows"] = ("matched_location_rows", "sum")

    grouped = hits.groupby(["geoid", "provider_id_num"], as_index=False).agg(**aggregations)
    return grouped, len(hits)
def upsert_detail(
    cur,
    as_of_date: date,
    file_id: int,
    geography_type: str,
    grouped: pd.DataFrame,
) -> int:
    """Merge grouped provider rows into the detail table.

    On a key collision the boolean flags are OR-ed, the maximum speeds take
    the greater value, and ``matched_location_rows`` is added to the stored
    count, so successive chunks of one file accumulate into the same row.

    Args:
        cur: Open database cursor; the caller commits.
        as_of_date: FCC as-of date of the file.
        file_id: FCC file id.
        geography_type: Geography type stored with each row.
        grouped: Output of ``summarize_matches``.

    Returns:
        The number of rows sent to the database (0 for an empty frame).
    """
    if grouped.empty:
        return 0

    def nullable_float(value):
        # Missing speeds are stored as SQL NULL.
        return None if pd.isna(value) else float(value)

    records = []
    for row in grouped.itertuples(index=False):
        records.append(
            (
                as_of_date,
                file_id,
                geography_type,
                row.geoid,
                int(row.provider_id_num),
                bool(row.has_fiber),
                bool(row.has_cable),
                bool(row.has_fixed_wireless),
                bool(row.has_copper),
                bool(row.has_100_20),
                bool(row.has_business),
                bool(row.has_business_fiber),
                bool(row.has_business_100_20),
                nullable_float(row.max_down),
                nullable_float(row.max_up),
                int(row.matched_location_rows),
            )
        )

    upsert_sql = f"""
        insert into {DETAIL_TABLE} (
            as_of_date, file_id, geography_type, geoid, provider_id,
            has_fiber, has_cable, has_fixed_wireless, has_copper,
            has_100_20, has_business, has_business_fiber, has_business_100_20,
            max_advertised_download_mbps, max_advertised_upload_mbps,
            matched_location_rows
        )
        values %s
        on conflict (as_of_date, file_id, geography_type, geoid, provider_id)
        do update set
            has_fiber = {DETAIL_TABLE}.has_fiber or excluded.has_fiber,
            has_cable = {DETAIL_TABLE}.has_cable or excluded.has_cable,
            has_fixed_wireless = {DETAIL_TABLE}.has_fixed_wireless or excluded.has_fixed_wireless,
            has_copper = {DETAIL_TABLE}.has_copper or excluded.has_copper,
            has_100_20 = {DETAIL_TABLE}.has_100_20 or excluded.has_100_20,
            has_business = {DETAIL_TABLE}.has_business or excluded.has_business,
            has_business_fiber = {DETAIL_TABLE}.has_business_fiber or excluded.has_business_fiber,
            has_business_100_20 = {DETAIL_TABLE}.has_business_100_20 or excluded.has_business_100_20,
            max_advertised_download_mbps = greatest(
                {DETAIL_TABLE}.max_advertised_download_mbps,
                excluded.max_advertised_download_mbps
            ),
            max_advertised_upload_mbps = greatest(
                {DETAIL_TABLE}.max_advertised_upload_mbps,
                excluded.max_advertised_upload_mbps
            ),
            matched_location_rows = {DETAIL_TABLE}.matched_location_rows + excluded.matched_location_rows,
            updated_at = now()
        """
    execute_values(cur, upsert_sql, records, page_size=1000)
    return len(records)
def process_file(
    conn,
    file_row: dict[str, Any],
    as_of_date: date,
    county_geoids: set[str],
    tract_geoids: set[str],
    chunksize: int,
    temp_dir: Path,
) -> tuple[int, int]:
    """Download one FCC coverage file and load its matches into the detail table.

    Existing detail and progress rows for the file are deleted first. The
    first CSV in the archive is then read in chunks; each chunk is summarized
    for counties and tracts, upserted, and committed. A progress row is
    written only after every chunk has been handled, so an interrupted file
    is not marked complete. The downloaded ZIP is always removed afterwards.

    Args:
        conn: Open database connection; this function commits on it.
        file_row: Catalog entry as produced by ``catalog_files``.
        as_of_date: FCC as-of date being processed.
        county_geoids: Target county GEOIDs.
        tract_geoids: Target tract GEOIDs.
        chunksize: Number of CSV rows per chunk.
        temp_dir: Directory that receives the downloaded ZIP.

    Returns:
        ``(matched_rows, provider_geo_rows)``: matched CSV row-events (county
        plus tract) and the number of detail rows sent to the database.

    Raises:
        RuntimeError: If the archive contains no CSV member.
    """
    file_id = file_row["file_id"]
    zip_path = download_file(file_id, temp_dir)
    file_key = (as_of_date, file_id)
    targets = (("County", county_geoids), ("Tract", tract_geoids))
    matched_total = 0
    detail_total = 0
    try:
        with zipfile.ZipFile(zip_path) as archive:
            csv_names = [name for name in archive.namelist() if name.lower().endswith(".csv")]
            if not csv_names:
                raise RuntimeError(f"FCC file_id={file_id} did not contain a CSV: {archive.namelist()}")
            with archive.open(csv_names[0]) as csv_file:
                reader = pd.read_csv(
                    csv_file,
                    usecols=CSV_USECOLS,
                    dtype="string",
                    chunksize=chunksize,
                    low_memory=False,
                )
                # Start from a clean slate so a re-run never double counts.
                with conn.cursor() as cur:
                    for table in (DETAIL_TABLE, PROGRESS_TABLE):
                        cur.execute(
                            f"delete from {table} where as_of_date = %s and file_id = %s",
                            file_key,
                        )
                conn.commit()

                for chunk_number, chunk in enumerate(reader, start=1):
                    chunk["block_geoid_norm"] = normalize_block_geoid(chunk["block_geoid"])
                    summaries = [
                        (geography_type, *summarize_matches(chunk, geography_type, geoids))
                        for geography_type, geoids in targets
                    ]
                    with conn.cursor() as cur:
                        for geography_type, grouped, _ in summaries:
                            detail_total += upsert_detail(
                                cur, as_of_date, file_id, geography_type, grouped
                            )
                    conn.commit()
                    matched_total += sum(match_count for _, _, match_count in summaries)
                    if matched_total and chunk_number % 10 == 0:
                        print(f" file_id={file_id}: chunk {chunk_number:,}, matched row-events={matched_total:,}")

        progress_sql = f"""
            insert into {PROGRESS_TABLE} (
                as_of_date, file_id, state_fips, technology_code,
                technology_code_desc, record_count, matched_location_rows, provider_geo_rows
            )
            values (%s, %s, %s, %s, %s, %s, %s, %s)
            on conflict (as_of_date, file_id) do update set
                state_fips = excluded.state_fips,
                technology_code = excluded.technology_code,
                technology_code_desc = excluded.technology_code_desc,
                record_count = excluded.record_count,
                matched_location_rows = excluded.matched_location_rows,
                provider_geo_rows = excluded.provider_geo_rows,
                processed_at = now()
            """
        progress_args = (
            as_of_date,
            file_id,
            file_row["state_fips"],
            file_row["technology_code"],
            file_row["technology_code_desc"],
            file_row["record_count"],
            matched_total,
            detail_total,
        )
        with conn.cursor() as cur:
            cur.execute(progress_sql, progress_args)
        conn.commit()
        return matched_total, detail_total
    finally:
        zip_path.unlink(missing_ok=True)
def rebuild_aggregate(cur, as_of_date: date) -> int:
    """Rebuild the per-geography aggregate rows for one as-of date.

    Existing aggregate rows for the date are deleted. Detail rows are then
    collapsed to one row per (geography, provider) across files, and those
    are counted per geography.

    Args:
        cur: Open database cursor; the caller commits.
        as_of_date: As-of date to rebuild.

    Returns:
        ``cur.rowcount`` after the insert, i.e. the number of aggregate rows
        written.
    """
    purge_sql = f"delete from {AGG_TABLE} where as_of_date = %s"
    rebuild_sql = f"""
        insert into {AGG_TABLE} (
            as_of_date, geography_type, geoid,
            provider_count, fiber_provider_count, cable_provider_count,
            fixed_wireless_provider_count, copper_provider_count,
            provider_100_20_count, business_provider_count,
            business_fiber_provider_count, business_100_20_provider_count,
            max_advertised_download_mbps, max_advertised_upload_mbps,
            matched_location_rows, provider_file_rows
        )
        with per_provider as (
            select
                as_of_date,
                geography_type,
                geoid,
                provider_id,
                bool_or(has_fiber) as has_fiber,
                bool_or(has_cable) as has_cable,
                bool_or(has_fixed_wireless) as has_fixed_wireless,
                bool_or(has_copper) as has_copper,
                bool_or(has_100_20) as has_100_20,
                bool_or(has_business) as has_business,
                bool_or(has_business_fiber) as has_business_fiber,
                bool_or(has_business_100_20) as has_business_100_20,
                max(max_advertised_download_mbps) as max_advertised_download_mbps,
                max(max_advertised_upload_mbps) as max_advertised_upload_mbps,
                sum(matched_location_rows) as matched_location_rows,
                count(*) as provider_file_rows
            from {DETAIL_TABLE}
            where as_of_date = %s
            group by 1, 2, 3, 4
        )
        select
            as_of_date,
            geography_type,
            geoid,
            count(*)::integer as provider_count,
            count(*) filter (where has_fiber)::integer as fiber_provider_count,
            count(*) filter (where has_cable)::integer as cable_provider_count,
            count(*) filter (where has_fixed_wireless)::integer as fixed_wireless_provider_count,
            count(*) filter (where has_copper)::integer as copper_provider_count,
            count(*) filter (where has_100_20)::integer as provider_100_20_count,
            count(*) filter (where has_business)::integer as business_provider_count,
            count(*) filter (where has_business_fiber)::integer as business_fiber_provider_count,
            count(*) filter (where has_business_100_20)::integer as business_100_20_provider_count,
            max(max_advertised_download_mbps) as max_advertised_download_mbps,
            max(max_advertised_upload_mbps) as max_advertised_upload_mbps,
            sum(matched_location_rows)::bigint as matched_location_rows,
            sum(provider_file_rows)::bigint as provider_file_rows
        from per_provider
        group by 1, 2, 3
        """
    date_arg = (as_of_date,)
    cur.execute(purge_sql, date_arg)
    cur.execute(rebuild_sql, date_arg)
    return cur.rowcount
def update_connection_table(cur, as_of_date: date) -> int:
    """Copy county and tract provider aggregates onto the connection table.

    Each connection row is joined to the aggregate rows for its county and
    tract, using the crosswalk's FCC GEOID where one exists. The generic
    ``fcc_*`` columns prefer the tract value and fall back to the county
    value; the ``fcc_county_*`` and ``fcc_tract_*`` columns are set from
    their own level. The ``location_provider_aggregate`` key of
    ``fcc_summary_json`` and ``fcc_bdc_status`` are updated as well. Rows
    with neither a county nor a tract aggregate are left untouched.

    Args:
        cur: Open database cursor; the caller commits.
        as_of_date: As-of date of the aggregates to join.

    Returns:
        ``cur.rowcount`` after the update.
    """
    update_sql = f"""
        with joined as (
            select
                c.master_id,
                coalesce(x.fcc_geoid, left(c.census_tract_geoid, 11)) as provider_tract_geoid,
                coalesce(left(x.fcc_geoid, 5), left(c.census_tract_geoid, 5)) as provider_county_geoid,
                county.geoid as county_geoid,
                tract.geoid as tract_geoid,
                county.provider_count as county_provider_count,
                county.fiber_provider_count as county_fiber_provider_count,
                county.cable_provider_count as county_cable_provider_count,
                county.fixed_wireless_provider_count as county_fixed_wireless_provider_count,
                county.provider_100_20_count as county_100_20_provider_count,
                county.business_provider_count as county_business_provider_count,
                county.business_fiber_provider_count as county_business_fiber_provider_count,
                county.business_100_20_provider_count as county_business_100_20_provider_count,
                county.max_advertised_download_mbps as county_max_down,
                county.max_advertised_upload_mbps as county_max_up,
                tract.provider_count as tract_provider_count,
                tract.fiber_provider_count as tract_fiber_provider_count,
                tract.cable_provider_count as tract_cable_provider_count,
                tract.fixed_wireless_provider_count as tract_fixed_wireless_provider_count,
                tract.provider_100_20_count as tract_100_20_provider_count,
                tract.business_provider_count as tract_business_provider_count,
                tract.business_fiber_provider_count as tract_business_fiber_provider_count,
                tract.business_100_20_provider_count as tract_business_100_20_provider_count,
                tract.max_advertised_download_mbps as tract_max_down,
                tract.max_advertised_upload_mbps as tract_max_up
            from {CONNECTION_TABLE} c
            left join {CROSSWALK_TABLE} x
                on x.source_geography_type = 'Tract'
                and x.fcc_geography_type = 'Tract'
                and x.source_geoid = c.census_tract_geoid
            left join {AGG_TABLE} county
                on county.as_of_date = %s
                and county.geography_type = 'County'
                and county.geoid = coalesce(left(x.fcc_geoid, 5), left(c.census_tract_geoid, 5))
            left join {AGG_TABLE} tract
                on tract.as_of_date = %s
                and tract.geography_type = 'Tract'
                and tract.geoid = coalesce(x.fcc_geoid, left(c.census_tract_geoid, 11))
        )
        update {CONNECTION_TABLE} c
        set
            fcc_provider_geography_type = case
                when j.tract_geoid is not null then 'Tract'
                when j.county_geoid is not null then 'County'
                else c.fcc_provider_geography_type
            end,
            fcc_provider_geoid = coalesce(j.tract_geoid, j.county_geoid, c.fcc_provider_geoid),
            fcc_provider_count = coalesce(j.tract_provider_count, j.county_provider_count),
            fcc_fiber_provider_count = coalesce(j.tract_fiber_provider_count, j.county_fiber_provider_count),
            fcc_cable_provider_count = coalesce(j.tract_cable_provider_count, j.county_cable_provider_count),
            fcc_fixed_wireless_provider_count = coalesce(j.tract_fixed_wireless_provider_count, j.county_fixed_wireless_provider_count),
            fcc_100_20_provider_count = coalesce(j.tract_100_20_provider_count, j.county_100_20_provider_count),
            fcc_max_advertised_download_mbps = coalesce(j.tract_max_down, j.county_max_down, c.fcc_max_advertised_download_mbps),
            fcc_max_advertised_upload_mbps = coalesce(j.tract_max_up, j.county_max_up, c.fcc_max_advertised_upload_mbps),
            fcc_county_provider_count = j.county_provider_count,
            fcc_county_fiber_provider_count = j.county_fiber_provider_count,
            fcc_county_cable_provider_count = j.county_cable_provider_count,
            fcc_county_fixed_wireless_provider_count = j.county_fixed_wireless_provider_count,
            fcc_county_100_20_provider_count = j.county_100_20_provider_count,
            fcc_county_business_provider_count = j.county_business_provider_count,
            fcc_county_business_fiber_provider_count = j.county_business_fiber_provider_count,
            fcc_county_business_100_20_provider_count = j.county_business_100_20_provider_count,
            fcc_county_max_advertised_download_mbps = j.county_max_down,
            fcc_county_max_advertised_upload_mbps = j.county_max_up,
            fcc_tract_provider_count = j.tract_provider_count,
            fcc_tract_fiber_provider_count = j.tract_fiber_provider_count,
            fcc_tract_cable_provider_count = j.tract_cable_provider_count,
            fcc_tract_fixed_wireless_provider_count = j.tract_fixed_wireless_provider_count,
            fcc_tract_100_20_provider_count = j.tract_100_20_provider_count,
            fcc_tract_business_provider_count = j.tract_business_provider_count,
            fcc_tract_business_fiber_provider_count = j.tract_business_fiber_provider_count,
            fcc_tract_business_100_20_provider_count = j.tract_business_100_20_provider_count,
            fcc_tract_max_advertised_download_mbps = j.tract_max_down,
            fcc_tract_max_advertised_upload_mbps = j.tract_max_up,
            fcc_summary_json = jsonb_set(
                coalesce(c.fcc_summary_json, '{{}}'::jsonb),
                '{{location_provider_aggregate}}',
                jsonb_build_object(
                    'source', 'fcc_state_location_coverage',
                    'as_of_date', %s::text,
                    'preferred_geography_type', case
                        when j.tract_geoid is not null then 'Tract'
                        when j.county_geoid is not null then 'County'
                        else null
                    end,
                    'preferred_geoid', coalesce(j.tract_geoid, j.county_geoid),
                    'county_geoid', j.county_geoid,
                    'tract_geoid', j.tract_geoid
                ),
                true
            ),
            fcc_bdc_status = case
                when coalesce(j.tract_geoid, j.county_geoid) is not null then 'fcc_location_provider_joined'
                else c.fcc_bdc_status
            end,
            updated_at = now()
        from joined j
        where c.master_id = j.master_id
          and coalesce(j.tract_geoid, j.county_geoid) is not null
        """
    # The as-of date fills three placeholders: both aggregate joins and the JSON summary.
    query_args = (as_of_date,) * 3
    cur.execute(update_sql, query_args)
    return cur.rowcount
def main() -> int:
    """Run the FCC BDC location-provider build from the command line.

    Steps: parse options, ensure tables and the GEOID crosswalk exist, pick
    the as-of date and target geographies, process every selected coverage
    file (skipping files already marked complete unless ``--no-resume``),
    rebuild the aggregate table, and, unless ``--no-update-connection`` is
    given, copy the aggregates onto the connection table.

    Returns:
        Process exit code ``0`` on success.

    Raises:
        RuntimeError: If ``--as-of-date`` cannot be parsed or no target
            data-center states are found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--as-of-date", help="FCC availability as-of date; defaults to latest catalog date.")
    parser.add_argument("--states", nargs="*", help="Optional state FIPS list, e.g. 11 34 51.")
    parser.add_argument("--technology-codes", nargs="*", default=list(TERRESTRIAL_TECHNOLOGY_CODES))
    parser.add_argument("--limit-files", type=int, help="Process only the first N matching files.")
    parser.add_argument("--chunksize", type=int, default=500_000)
    parser.add_argument("--refresh", action="store_true", help="Delete existing location-provider rows for this as-of date first.")
    parser.add_argument("--no-resume", action="store_true", help="Reprocess files even if marked complete.")
    parser.add_argument("--no-update-connection", action="store_true", help="Build aggregate tables but do not update data_center_broadband_connection.")
    args = parser.parse_args()

    load_zsh_secrets()
    require_env(["PGWEB_HOST", "PGWEB_PORT", "PGWEB_USER", "PGWEB_PASSWORD"])

    as_of_date = parse_date(args.as_of_date) if args.as_of_date else None
    if as_of_date is None and args.as_of_date:
        raise RuntimeError(f"Invalid --as-of-date: {args.as_of_date}")
    technology_codes = normalize_codes(args.technology_codes)
    requested_states = tuple(s.zfill(2) for s in args.states) if args.states else None

    with get_conn() as conn:
        with conn.cursor() as cur:
            create_tables(cur)
            seed_geoid_crosswalk(cur)
            as_of_date = as_of_date or latest_catalog_date(cur)
            states, counties, tracts = target_geographies(cur, requested_states)
            if not states:
                raise RuntimeError("No target data-center states found.")
            if args.refresh:
                cur.execute(f"delete from {DETAIL_TABLE} where as_of_date = %s", (as_of_date,))
                cur.execute(f"delete from {AGG_TABLE} where as_of_date = %s", (as_of_date,))
                cur.execute(f"delete from {PROGRESS_TABLE} where as_of_date = %s", (as_of_date,))
            files = catalog_files(cur, as_of_date, states, technology_codes, args.limit_files)
        conn.commit()

        print(f"FCC as_of_date: {as_of_date}")
        print(f"Target states: {len(states):,} | counties: {len(counties):,} | tracts: {len(tracts):,}")
        print(f"Location coverage files selected: {len(files):,}")

        total_matched_rows = 0
        total_provider_geo_rows = 0
        with tempfile.TemporaryDirectory(prefix="fcc_bdc_location_") as temp:
            temp_dir = Path(temp)
            for idx, file_row in enumerate(files, start=1):
                file_id = file_row["file_id"]
                with conn.cursor() as cur:
                    skip = (not args.no_resume) and progress_done(cur, as_of_date, file_id)
                if skip:
                    print(f"[{idx:,}/{len(files):,}] skip file_id={file_id} already processed")
                    continue
                # The catalog's record_count may be NULL; formatting None with
                # ":," raises TypeError, so fall back to a placeholder.
                record_count = file_row["record_count"]
                records_label = f"{record_count:,}" if record_count is not None else "unknown"
                print(
                    f"[{idx:,}/{len(files):,}] file_id={file_id} state={file_row['state_fips']} "
                    f"tech={file_row['technology_code']} records={records_label}"
                )
                matched_rows, provider_geo_rows = process_file(
                    conn,
                    file_row,
                    as_of_date,
                    counties,
                    tracts,
                    args.chunksize,
                    temp_dir,
                )
                total_matched_rows += matched_rows
                total_provider_geo_rows += provider_geo_rows
                print(
                    f" complete file_id={file_id}: matched row-events={matched_rows:,}, "
                    f"provider-geography rows={provider_geo_rows:,}"
                )

        with conn.cursor() as cur:
            agg_rows = rebuild_aggregate(cur, as_of_date)
            updated_rows = 0
            if not args.no_update_connection:
                updated_rows = update_connection_table(cur, as_of_date)
        conn.commit()

        print(f"New matched row-events this run: {total_matched_rows:,}")
        print(f"New provider-geography detail rows this run: {total_provider_geo_rows:,}")
        print(f"{AGG_TABLE}: rebuilt {agg_rows:,} geography rows")
        if not args.no_update_connection:
            print(f"{CONNECTION_TABLE}: updated {updated_rows:,} rows with location-provider aggregates")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""
Build (or refresh) public.master_data_centers by merging:
- public.us_dc_sample_geocoded (curated, attribute-rich)
- public.osm_data_centers (OpenStreetMap features)
Deduplication rule (curated row wins):
Step 1: for each curated row, find a matching OSM row by
curated.id = osm.osm_id::text OR
curated.nominatim_osm_id = osm.osm_id OR
ST_DWithin(curated.geom, osm.geom, 150 m, geography)
(closest match by sphere distance when multiple).
Step 2: insert every curated row into master, filling NULLs from the
matched OSM row when present. source = 'merged' if matched,
otherwise 'curated'.
Step 3: insert every OSM row whose osm_id was NOT matched in Step 1.
source = 'osm'.
Result: every curated row appears once; OSM-only rows appear once; no row is
emitted twice. The merge logic lives in a SQL function
public.refresh_master_data_centers() so subsequent refreshes are one call.
"""
import argparse
import os
import sys
import psycopg2
# Database that main() connects to.
DB_NAME = "data_centers"
# Output table, rebuilt by public.refresh_master_data_centers().
MASTER_TABLE = "public.master_data_centers"
# Curated input table; its rows win over OSM rows when both match.
CURATED_TABLE = "public.us_dc_sample_geocoded"
# OpenStreetMap input table.
OSM_TABLE = "public.osm_data_centers"
# Default spatial match radius in meters (also the default of --radius-m).
MATCH_RADIUS_M = 150
# DDL for the master table and its indexes. Every statement uses
# "if not exists", so executing it against an existing table is a no-op.
# "geom" is a stored generated column derived from longitude/latitude, which
# is why the refresh function never inserts it directly.
CREATE_TABLE_SQL = f"""
create table if not exists {MASTER_TABLE} (
master_id text primary key,
source text not null check (source in ('curated','osm','merged')),
curated_id text,
osm_id text,
name text,
operator text,
street_address text,
city text,
state text,
postal_code text,
country text,
website text,
phone text,
power_mw numeric,
area_sqft integer,
nearest_airport_miles numeric,
has_bare_metal boolean,
has_iaas boolean,
has_internet_exchange boolean,
has_colocation boolean,
certifications text,
content_summary text,
osm_tags jsonb,
matched_osm_tag_passes text[],
match_method text,
match_distance_m numeric,
longitude double precision not null,
latitude double precision not null,
geom geometry(Point, 4326)
generated always as (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
);
create index if not exists master_data_centers_geom_gix on {MASTER_TABLE} using gist (geom);
create index if not exists master_data_centers_source_idx on {MASTER_TABLE} (source);
create index if not exists master_data_centers_state_idx on {MASTER_TABLE} (state);
create index if not exists master_data_centers_curated_idx on {MASTER_TABLE} (curated_id);
create index if not exists master_data_centers_osm_idx on {MASTER_TABLE} (osm_id);
"""
# Definition of public.refresh_master_data_centers(match_radius_m).
# The function truncates the master table, chooses one best OSM match per
# curated row (id equality, then nominatim id, then nearest within the
# radius), inserts every curated row (filling NULLs from its match), inserts
# the OSM rows no curated row claimed, and returns the per-source row counts.
#
# The scratch table is created with "on commit drop", so it survives until
# the end of the calling transaction. It is therefore dropped explicitly
# before being created; otherwise a second call inside the same transaction
# would fail with "relation already exists". The pg_temp qualifier ensures
# only the session's temporary table can ever be dropped.
REFRESH_FUNCTION_SQL = f"""
create or replace function public.refresh_master_data_centers(match_radius_m double precision default {MATCH_RADIUS_M})
returns table (curated_rows bigint, merged_rows bigint, osm_only_rows bigint, total_rows bigint)
language plpgsql
as $$
begin
    truncate table {MASTER_TABLE};

    -- allow more than one refresh per transaction (see "on commit drop" below)
    drop table if exists pg_temp._curated_to_osm;

    -- pick a single best OSM match for each curated row, prioritizing ID
    -- equality, then nominatim id, then closest within radius
    create temporary table _curated_to_osm on commit drop as
    with ranked as (
        select
            c.id as curated_id,
            o.id as osm_id,
            case
                when c.id = o.osm_id::text then 'id'
                when c.nominatim_osm_id = o.osm_id then 'nominatim_id'
                else 'spatial'
            end as method,
            ST_DistanceSphere(c.geom, o.geom) as dist_m,
            row_number() over (
                partition by c.id
                order by
                    case
                        when c.id = o.osm_id::text then 0
                        when c.nominatim_osm_id = o.osm_id then 1
                        else 2
                    end,
                    ST_DistanceSphere(c.geom, o.geom) asc
            ) as rn
        from {CURATED_TABLE} c
        join {OSM_TABLE} o
            on c.id = o.osm_id::text
            or c.nominatim_osm_id = o.osm_id
            or ST_DWithin(c.geom::geography, o.geom::geography, match_radius_m)
    )
    select curated_id, osm_id, method, dist_m
    from ranked
    where rn = 1;

    -- Step 1+2: insert curated rows (with OSM nulls filled where matched)
    insert into {MASTER_TABLE} (
        master_id, source, curated_id, osm_id,
        name, operator, street_address, city, state, postal_code, country,
        website, phone, power_mw, area_sqft, nearest_airport_miles,
        has_bare_metal, has_iaas, has_internet_exchange, has_colocation,
        certifications, content_summary,
        osm_tags, matched_osm_tag_passes,
        match_method, match_distance_m,
        longitude, latitude
    )
    select
        'curated/' || c.id,
        case when m.osm_id is not null then 'merged' else 'curated' end,
        c.id,
        m.osm_id,
        coalesce(c.facility_name, o.name),
        coalesce(c.provider, o.operator),
        coalesce(c.street_address, o.street_address),
        coalesce(c.city, o.city),
        coalesce(c.state_code, o.state),
        coalesce(c.postal_code, o.postal_code),
        coalesce(c.country, o.country),
        coalesce(c.url, o.website),
        coalesce(c.phone, o.phone),
        c.power_mw,
        c.area_sqft,
        c.nearest_airport_miles,
        c.has_bare_metal,
        c.has_iaas,
        c.has_internet_exchange,
        c.has_colocation,
        c.certifications,
        c.content_summary,
        o.tags,
        o.matched_tags,
        m.method,
        round(m.dist_m::numeric, 2),
        c.longitude,
        c.latitude
    from {CURATED_TABLE} c
    left join _curated_to_osm m on m.curated_id = c.id
    left join {OSM_TABLE} o on o.id = m.osm_id;

    -- Step 3: insert OSM rows that no curated row claimed
    insert into {MASTER_TABLE} (
        master_id, source, curated_id, osm_id,
        name, operator, street_address, city, state, postal_code, country,
        website, phone,
        osm_tags, matched_osm_tag_passes,
        longitude, latitude
    )
    select
        'osm/' || o.id,
        'osm',
        null,
        o.id,
        o.name,
        o.operator,
        o.street_address,
        o.city,
        o.state,
        o.postal_code,
        o.country,
        o.website,
        o.phone,
        o.tags,
        o.matched_tags,
        o.longitude,
        o.latitude
    from {OSM_TABLE} o
    where not exists (
        select 1 from _curated_to_osm m where m.osm_id = o.id
    );

    analyze {MASTER_TABLE};

    return query
    select
        count(*) filter (where source = 'curated'),
        count(*) filter (where source = 'merged'),
        count(*) filter (where source = 'osm'),
        count(*)
    from {MASTER_TABLE};
end;
$$;
"""
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the master-table build script.

    Options:
        --radius-m: Spatial match radius in meters (float, defaults to
            ``MATCH_RADIUS_M``).
        --recreate: Flag that requests dropping and recreating the master table.
    """
    radius_help = f"Spatial match radius in meters (default: {MATCH_RADIUS_M})."
    recreate_help = f"Drop and recreate {MASTER_TABLE} before building."

    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--radius-m", type=float, default=MATCH_RADIUS_M, help=radius_help)
    cli.add_argument("--recreate", action="store_true", help=recreate_help)
    return cli.parse_args()
def main() -> int:
    """Create the master table and refresh function, run a refresh, print the counts.

    All database work happens in one transaction (``with conn``); the
    connection is closed afterwards whether or not the refresh succeeded.

    Returns:
        Process exit code ``0`` on success.
    """
    args = parse_args()
    connection_settings = {
        "host": os.environ["PGWEB_HOST"],
        "port": os.environ["PGWEB_PORT"],
        "user": os.environ["PGWEB_USER"],
        "password": os.environ["PGWEB_PASSWORD"],
        "dbname": DB_NAME,
    }
    conn = psycopg2.connect(**connection_settings)
    try:
        with conn:
            with conn.cursor() as cur:
                setup_statements = ["create extension if not exists postgis"]
                if args.recreate:
                    setup_statements.append(f"drop table if exists {MASTER_TABLE} cascade")
                setup_statements.append(CREATE_TABLE_SQL)
                setup_statements.append(REFRESH_FUNCTION_SQL)
                for statement in setup_statements:
                    cur.execute(statement)

                cur.execute(
                    "select * from public.refresh_master_data_centers(%s)",
                    (args.radius_m,),
                )
                curated, merged, osm_only, total = cur.fetchone()
    finally:
        conn.close()

    summary_lines = (
        f"master_data_centers refreshed (radius={args.radius_m} m):",
        f" curated-only rows: {curated}",
        f" merged rows (curated + OSM): {merged}",
        f" osm-only rows: {osm_only}",
        f" total: {total}",
    )
    for line in summary_lines:
        print(line)
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,184 @@
#!/usr/bin/env python3
import argparse
import os
import subprocess
from pathlib import Path
import psycopg2
# Database used by connect() and by the ogr2ogr import.
DB_NAME = "data_centers"
# Census tract polygons that are intersected with the HUC8 watersheds.
TRACT_TABLE = "public.data_center_census_tracts_2024"
# Table that ogr2ogr loads the raw shapefile into (overwritten on each import).
STAGE_TABLE = "public._watershed_huc8_stage"
# Final watershed table built from STAGE_TABLE, one row per huc8.
HUC8_TABLE = "public.watershed_huc8"
# Tract-to-watershed overlap table built from TRACT_TABLE and HUC8_TABLE.
LINK_TABLE = "public.census_tract_huc8_link"
def connect():
    """Open a psycopg2 connection to ``DB_NAME``.

    Host, port, user and password come from the PGWEB_HOST, PGWEB_PORT,
    PGWEB_USER and PGWEB_PASSWORD environment variables.

    Raises:
        KeyError: if any of those variables is not set.
    """
    settings = {
        option: os.environ[f"PGWEB_{option.upper()}"]
        for option in ("host", "port", "user", "password")
    }
    return psycopg2.connect(dbname=DB_NAME, **settings)
def import_huc8_shapefile(shapefile_path):
    """Load a HUC8 shapefile into ``STAGE_TABLE`` by running ``ogr2ogr``.

    The staging table is overwritten, geometries are promoted to
    MULTIPOLYGON and reprojected to EPSG:4326, and features that fail to
    load are skipped (``-skipfailures``).

    Args:
        shapefile_path: ``pathlib.Path`` of the ``.shp`` file to import.

    Raises:
        KeyError: if a PGWEB_* environment variable is missing.
        subprocess.CalledProcessError: if ``ogr2ogr`` exits non-zero.
    """
    # Pass the credentials to libpq through PG* environment variables rather
    # than inside the connection string: command-line arguments are visible to
    # other local users (e.g. via `ps`), and a value containing a space would
    # corrupt the space-separated "PG:..." string.
    env = os.environ.copy()
    env.update(
        {
            "PGHOST": os.environ["PGWEB_HOST"],
            "PGPORT": os.environ["PGWEB_PORT"],
            "PGUSER": os.environ["PGWEB_USER"],
            "PGPASSWORD": os.environ["PGWEB_PASSWORD"],
            "PGDATABASE": DB_NAME,
        }
    )
    source = str(shapefile_path.resolve())
    cmd = [
        "ogr2ogr",
        "-f",
        "PostgreSQL",
        f"PG:dbname={DB_NAME}",
        source,
        "-nln",
        STAGE_TABLE,
        "-nlt",
        "MULTIPOLYGON",
        "-t_srs",
        "EPSG:4326",
        "-lco",
        "GEOMETRY_NAME=geom",
        "-lco",
        "FID=gid",
        "-lco",
        "PRECISION=NO",
        "-unsetFieldWidth",
        "-skipfailures",
        "-overwrite",
    ]
    subprocess.run(cmd, check=True, env=env)
def build_final_tables(conn):
    """Rebuild the watershed table and the tract-to-watershed link table.

    Everything runs in one transaction on ``conn``:

    1. ``HUC8_TABLE`` is recreated from ``STAGE_TABLE``, keeping one row per
       ``huc8`` (the one with the latest ``loaddate``), then given a primary
       key, a GiST geometry index and an index on ``states``.
    2. ``LINK_TABLE`` is recreated with one row per intersecting
       (tract, watershed) pair, carrying the overlap area in square meters
       and square kilometers and the overlap as a fraction of the tract area.
    3. All three tables are analyzed.

    Args:
        conn: open psycopg2 connection.
    """
    create_huc8_sql = f"""
        create table {HUC8_TABLE} as
        select distinct on (huc8)
            huc8,
            name,
            states,
            areaacres,
            areasqkm,
            loaddate,
            sourceorig as sourceoriginator,
            sourcedata as sourcedatadesc,
            sourcefeat as sourcefeatureid,
            metasource as metasourceid,
            tnmid,
            geom::geometry(MultiPolygon, 4326) as geom
        from {STAGE_TABLE}
        where huc8 is not null
        order by huc8, loaddate desc nulls last
    """
    create_link_sql = f"""
        create table {LINK_TABLE} as
        select
            geoid,
            huc8,
            overlap_sqm,
            overlap_sqm / 1000000.0 as overlap_sqkm,
            overlap_sqm / nullif(tract_sqm, 0.0) as tract_overlap_pct
        from (
            select
                tr.geoid,
                wh.huc8,
                st_area(
                    st_intersection(
                        tr.geom::geography,
                        wh.geom::geography
                    )
                ) as overlap_sqm,
                st_area(tr.geom::geography) as tract_sqm
            from {TRACT_TABLE} tr
            join {HUC8_TABLE} wh
                on st_intersects(tr.geom, wh.geom)
        ) as overlap_rows
        where overlap_sqm > 0
    """
    # Executed strictly in this order; later statements depend on earlier ones.
    statements = (
        f"drop table if exists {HUC8_TABLE}",
        create_huc8_sql,
        f"alter table {HUC8_TABLE} add primary key (huc8)",
        f"create index watershed_huc8_geom_gix on {HUC8_TABLE} using gist (geom)",
        f"create index watershed_huc8_states_idx on {HUC8_TABLE} (states)",
        f"drop table if exists {LINK_TABLE}",
        create_link_sql,
        f"create index census_tract_huc8_link_geoid_idx on {LINK_TABLE} (geoid)",
        f"create index census_tract_huc8_link_huc8_idx on {LINK_TABLE} (huc8)",
        f"analyze {STAGE_TABLE}",
        f"analyze {HUC8_TABLE}",
        f"analyze {LINK_TABLE}",
    )
    with conn, conn.cursor() as cur:
        for statement in statements:
            cur.execute(statement)
def parse_args():
    """Parse the command-line options of the HUC8 watershed builder.

    Returns:
        argparse.Namespace with ``shapefile`` (str path, defaulting to
        ``HUC8_CONUS/HUC8_US.shp``) and ``build_only`` (bool).
    """
    description = (
        "Build watershed HUC8 boundaries and GEOID linkage tables from "
        "a local HUC8 shapefile."
    )
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument(
        "--shapefile",
        default="HUC8_CONUS/HUC8_US.shp",
        help="Path to the HUC8 shapefile to import.",
    )
    cli.add_argument(
        "--build-only",
        action="store_true",
        help="Skip imports and rebuild final/link tables from existing stage data.",
    )
    return cli.parse_args()
def main():
    """Import the HUC8 shapefile (unless ``--build-only``) and rebuild tables.

    After the rebuild, the row counts of ``HUC8_TABLE`` and ``LINK_TABLE``
    are read back and printed as a one-line summary.

    Raises:
        FileNotFoundError: if an import is requested and the shapefile does
            not exist.
    """
    args = parse_args()
    shapefile_path = Path(args.shapefile)

    if not args.build_only:
        if not shapefile_path.exists():
            raise FileNotFoundError(f"shapefile not found: {shapefile_path}")
        print(f"importing HUC8 shapefile from {shapefile_path}")
        import_huc8_shapefile(shapefile_path)

    conn = connect()
    try:
        build_final_tables(conn)
        row_counts = []
        with conn.cursor() as cur:
            for table in (HUC8_TABLE, LINK_TABLE):
                cur.execute(f"select count(*) from {table}")
                row_counts.append(cur.fetchone()[0])
        huc8_rows, link_rows = row_counts
    finally:
        conn.close()

    print(
        f"done: source={shapefile_path}, huc8_rows={huc8_rows}, "
        f"geoid_huc8_links={link_rows}"
    )
# Script entry point: run the import/build only when executed directly.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,731 @@
#!/usr/bin/env python3
import argparse
import csv
import json
import os
import subprocess
import urllib.parse
import urllib.request
from decimal import Decimal
from pathlib import Path
import psycopg2
from psycopg2.extras import execute_values
# Database that every connection opened by this script targets.
DB_NAME = "data_centers"
# Point table whose rows are spatially joined to tracts, and its id column.
POINT_TABLE = "public.master_data_centers"
POINT_ID_COL = "master_id"
# Staging tables: tract polygons loaded by ogr2ogr, and ACS rows from the API.
BOUNDARY_STAGE_TABLE = "public._dc_census_tract_boundaries_2024"
ACS_STAGE_TABLE = "public._dc_census_tract_acs_2024"
# Output table: tracts containing at least one point, joined to the ACS data.
FINAL_TABLE = "public.data_center_census_tracts_2024"
# ACS vintage used in the API URL and stamped on every output row.
ACS_YEAR = 2024
ACS_SOURCE = "ACS 2024 5-year profile"
# Local copy of the tract boundary archive (relative to the working directory)
# and the URL it is downloaded from when missing.
TRACT_ZIP = Path("cb_2024_us_tract_500k.zip")
TRACT_ZIP_URL = (
    "https://www2.census.gov/geo/tiger/GENZ2024/shp/cb_2024_us_tract_500k.zip"
)
# CSV dump of every fetched ACS row, written relative to the working directory.
ACS_AUDIT_CSV = Path("census_tract_acs_2024_selected_states.csv")
STATE_NAME_TO_CODE = {
"Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
"California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
"District of Columbia": "DC", "Florida": "FL", "Georgia": "GA", "Hawaii": "HI",
"Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
"Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME",
"Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN",
"Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE",
"Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM",
"New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH",
"Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI",
"South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX",
"Utah": "UT", "Vermont": "VT", "Virginia": "VA", "Washington": "WA",
"West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
"American Samoa": "AS", "Guam": "GU", "Northern Mariana Islands": "MP",
"Puerto Rico": "PR", "United States Virgin Islands": "VI",
"U.S. Virgin Islands": "VI", "Virgin Islands": "VI",
}
STATE_FIPS = {
"AL": "01",
"AK": "02",
"AZ": "04",
"AR": "05",
"CA": "06",
"CO": "08",
"CT": "09",
"DE": "10",
"DC": "11",
"FL": "12",
"GA": "13",
"HI": "15",
"ID": "16",
"IL": "17",
"IN": "18",
"IA": "19",
"KS": "20",
"KY": "21",
"LA": "22",
"ME": "23",
"MD": "24",
"MA": "25",
"MI": "26",
"MN": "27",
"MS": "28",
"MO": "29",
"MT": "30",
"NE": "31",
"NV": "32",
"NH": "33",
"NJ": "34",
"NM": "35",
"NY": "36",
"NC": "37",
"ND": "38",
"OH": "39",
"OK": "40",
"OR": "41",
"PA": "42",
"RI": "44",
"SC": "45",
"SD": "46",
"TN": "47",
"TX": "48",
"UT": "49",
"VT": "50",
"VA": "51",
"WA": "53",
"WV": "54",
"WI": "55",
"WY": "56",
"AS": "60",
"GU": "66",
"MP": "69",
"PR": "72",
"VI": "78",
}
ACS_VARIABLES = {
"DP05_0001E": "population",
"DP05_0018E": "median_age",
"DP02_0001E": "households",
"DP02_0016E": "avg_household_size",
"DP02_0067PE": "high_school_or_higher_pct",
"DP02_0068PE": "bachelor_or_higher_pct",
"DP02_0154PE": "broadband_subscription_pct",
"DP03_0001E": "population_16_over",
"DP03_0002E": "labor_force",
"DP03_0005E": "unemployed",
"DP03_0009PE": "unemployment_rate",
"DP03_0032E": "industry_total_workers",
"DP03_0033E": "industry_agriculture_mining_workers",
"DP03_0034E": "industry_construction_workers",
"DP03_0035E": "industry_manufacturing_workers",
"DP03_0036E": "industry_wholesale_trade_workers",
"DP03_0037E": "industry_retail_trade_workers",
"DP03_0038E": "industry_transportation_warehousing_utilities_workers",
"DP03_0039E": "industry_information_workers",
"DP03_0040E": "industry_finance_real_estate_workers",
"DP03_0041E": "industry_professional_management_admin_workers",
"DP03_0042E": "industry_education_health_social_workers",
"DP03_0043E": "industry_arts_entertainment_food_workers",
"DP03_0044E": "industry_other_services_workers",
"DP03_0045E": "industry_public_administration_workers",
"DP03_0062E": "median_household_income",
"DP03_0088E": "per_capita_income",
"DP03_0119PE": "family_poverty_rate",
"DP03_0128PE": "poverty_rate",
"DP05_0090E": "hispanic_latino_population",
"DP05_0090PE": "hispanic_latino_pct",
"DP05_0096E": "non_hispanic_white_population",
"DP05_0096PE": "non_hispanic_white_pct",
"DP05_0097E": "non_hispanic_black_population",
"DP05_0097PE": "non_hispanic_black_pct",
"DP05_0099E": "non_hispanic_asian_population",
"DP05_0099PE": "non_hispanic_asian_pct",
}
# Output columns that clean_acs_value() converts to int. Besides head counts
# this also includes the two income columns, which are stored as integers too.
COUNT_COLUMNS = {
    "population",
    "households",
    "population_16_over",
    "labor_force",
    "unemployed",
    "industry_total_workers",
    "industry_agriculture_mining_workers",
    "industry_construction_workers",
    "industry_manufacturing_workers",
    "industry_wholesale_trade_workers",
    "industry_retail_trade_workers",
    "industry_transportation_warehousing_utilities_workers",
    "industry_information_workers",
    "industry_finance_real_estate_workers",
    "industry_professional_management_admin_workers",
    "industry_education_health_social_workers",
    "industry_arts_entertainment_food_workers",
    "industry_other_services_workers",
    "industry_public_administration_workers",
    "median_household_income",
    "per_capita_income",
    "hispanic_latino_population",
    "non_hispanic_white_population",
    "non_hispanic_black_population",
    "non_hispanic_asian_population",
}
# Every other ACS output column; clean_acs_value() keeps these as Decimal.
NUMERIC_COLUMNS = set(ACS_VARIABLES.values()) - COUNT_COLUMNS
INDUSTRY_COLUMNS = {
"industry_agriculture_mining_workers": "Agriculture, forestry, fishing and hunting, and mining",
"industry_construction_workers": "Construction",
"industry_manufacturing_workers": "Manufacturing",
"industry_wholesale_trade_workers": "Wholesale trade",
"industry_retail_trade_workers": "Retail trade",
"industry_transportation_warehousing_utilities_workers": "Transportation and warehousing, and utilities",
"industry_information_workers": "Information",
"industry_finance_real_estate_workers": "Finance and insurance, and real estate and rental and leasing",
"industry_professional_management_admin_workers": "Professional, scientific, management, administrative, and waste management services",
"industry_education_health_social_workers": "Educational services, and health care and social assistance",
"industry_arts_entertainment_food_workers": "Arts, entertainment, recreation, accommodation, and food services",
"industry_other_services_workers": "Other services, except public administration",
"industry_public_administration_workers": "Public administration",
}
# Raw API strings that clean_acs_value() maps to None instead of parsing.
# NOTE(review): these look like the Census Bureau's negative sentinel values for
# unavailable estimates — confirm against the ACS API notes. Each value is
# listed both as an integer string and with a ".0" suffix because the match is
# an exact string comparison.
SPECIAL_VALUES = {
    "-666666666",
    "-888888888",
    "-999999999",
    "-222222222",
    "-333333333",
    "-555555555",
    "-666666666.0",
    "-888888888.0",
    "-999999999.0",
    "-222222222.0",
    "-333333333.0",
    "-555555555.0",
}
def connect():
    """Return a new psycopg2 connection to ``DB_NAME``.

    Connection parameters are read from the PGWEB_HOST, PGWEB_PORT,
    PGWEB_USER and PGWEB_PASSWORD environment variables.

    Raises:
        KeyError: if one of those variables is not set.
    """
    env = os.environ
    host = env["PGWEB_HOST"]
    port = env["PGWEB_PORT"]
    user = env["PGWEB_USER"]
    password = env["PGWEB_PASSWORD"]
    return psycopg2.connect(
        host=host, port=port, user=user, password=password, dbname=DB_NAME
    )
def normalize_state(value):
    """Return the two-letter postal code for a state value, or None.

    Surrounding whitespace is ignored for both codes and full names, and
    two-letter codes are matched case-insensitively, so ``" CA "``, ``"ca"``
    and ``" California "`` all resolve to ``"CA"``.

    Args:
        value: a postal code, a full state/territory name as spelled in
            ``STATE_NAME_TO_CODE``, or None.

    Returns:
        The postal code (a key of ``STATE_FIPS``), or None when ``value`` is
        None, blank, or not recognized.
    """
    if value is None:
        return None
    # Strip before any lookup; previously only the name lookup was stripped,
    # so a padded code such as " CA " was reported as unrecognized.
    text = value.strip()
    if not text:
        return None
    if text in STATE_FIPS:
        return text
    upper = text.upper()
    if upper in STATE_FIPS:
        return upper
    return STATE_NAME_TO_CODE.get(text)
def get_state_fips(conn):
    """Return the sorted state FIPS codes whose tracts must be imported.

    The distinct ``state`` values of ``POINT_TABLE`` are normalized to postal
    codes. If every row has a state, only the FIPS codes of the states that
    occur are returned. If any row has a NULL state, a warning is printed and
    the FIPS codes of every entry in ``STATE_FIPS`` except AS, GU, MP and VI
    are returned.

    Args:
        conn: open psycopg2 connection.

    Raises:
        RuntimeError: if a non-NULL state value cannot be normalized.
    """
    query = f"select state, count(*) from {POINT_TABLE} group by state order by state nulls last"
    with conn.cursor() as cur:
        cur.execute(query)
        state_counts = cur.fetchall()

    counts_by_code = {}
    rows_without_state = 0
    unrecognized = []
    for raw_state, n_rows in state_counts:
        if raw_state is None:
            rows_without_state += n_rows
            continue
        code = normalize_state(raw_state)
        if code is None:
            unrecognized.append((raw_state, n_rows))
        else:
            counts_by_code[code] = counts_by_code.get(code, 0) + n_rows

    if unrecognized:
        details = ", ".join(f"{repr(name)}({n})" for name, n in unrecognized)
        raise RuntimeError(f"Unrecognized state values in {POINT_TABLE}: {details}")

    if not rows_without_state:
        return sorted({STATE_FIPS[code] for code in counts_by_code})

    print(
        f"warning: {rows_without_state} master_data_centers rows have NULL state; "
        f"importing tract boundaries for all 50 states + DC + PR so spatial join can resolve them."
    )
    # Census ACS 5-year DP profile lacks coverage for the small island territories;
    # restrict to the 50 states + DC + PR which the ACS profile reliably serves.
    excluded_territories = {"AS", "GU", "MP", "VI"}
    return sorted(
        {
            fips
            for code, fips in STATE_FIPS.items()
            if code not in excluded_territories
        }
    )
def ensure_final_table_absent(conn):
    """Refuse to continue when ``FINAL_TABLE`` already exists.

    Args:
        conn: open psycopg2 connection.

    Raises:
        RuntimeError: if ``to_regclass`` resolves ``FINAL_TABLE`` to an
            existing relation.
    """
    with conn.cursor() as cur:
        cur.execute("select to_regclass(%s)", (FINAL_TABLE,))
        existing = cur.fetchone()[0]
        if existing is None:
            return
        raise RuntimeError(
            f"Target table {FINAL_TABLE} already exists; refusing to overwrite it."
        )
def drop_final_table_if_exists(conn):
    """Drop ``FINAL_TABLE`` (if present) in its own committed transaction.

    Args:
        conn: open psycopg2 connection.
    """
    drop_sql = f"drop table if exists {FINAL_TABLE}"
    with conn, conn.cursor() as cur:
        cur.execute(drop_sql)
def download_tract_boundaries():
    """Download the tract boundary archive to ``TRACT_ZIP`` unless cached.

    An existing file larger than 50,000,000 bytes is treated as a complete
    download and left alone. Otherwise the archive is streamed in 1 MiB
    chunks to a ``.zip.part`` sibling, which is renamed to ``TRACT_ZIP`` once
    the transfer has finished.
    """
    already_cached = TRACT_ZIP.exists() and TRACT_ZIP.stat().st_size > 50_000_000
    if already_cached:
        return

    chunk_size = 1024 * 1024
    partial = TRACT_ZIP.with_suffix(".zip.part")
    with urllib.request.urlopen(TRACT_ZIP_URL, timeout=120) as response:
        with partial.open("wb") as sink:
            # read() returns b"" at end of stream, which ends the iteration.
            for chunk in iter(lambda: response.read(chunk_size), b""):
                sink.write(chunk)
    partial.rename(TRACT_ZIP)
def import_tract_boundaries(state_fips):
    """Load the tract polygons of the given states into the staging table.

    Runs ``ogr2ogr`` against the shapefile inside ``TRACT_ZIP`` (read through
    GDAL's ``/vsizip/`` prefix), keeping only features whose ``STATEFP`` is in
    ``state_fips``. ``BOUNDARY_STAGE_TABLE`` is overwritten, geometries are
    promoted to MULTIPOLYGON and reprojected to EPSG:4326. Credentials reach
    libpq through PG* environment variables.

    Args:
        state_fips: iterable of two-digit state FIPS code strings.

    Raises:
        ValueError: if ``state_fips`` is empty.
        KeyError: if a PGWEB_* environment variable is missing.
        subprocess.CalledProcessError: if ``ogr2ogr`` exits non-zero.
    """
    states = sorted(state_fips)
    if not states:
        # An empty list would produce the invalid filter "STATEFP IN ()" and
        # an opaque ogr2ogr failure; fail early with a clear message instead.
        raise ValueError("state_fips is empty; no tract boundaries to import")
    where = "STATEFP IN ({})".format(",".join(f"'{state}'" for state in states))

    env = os.environ.copy()
    env.update(
        {
            "PGHOST": os.environ["PGWEB_HOST"],
            "PGPORT": os.environ["PGWEB_PORT"],
            "PGUSER": os.environ["PGWEB_USER"],
            "PGPASSWORD": os.environ["PGWEB_PASSWORD"],
            "PGDATABASE": DB_NAME,
        }
    )
    cmd = [
        "ogr2ogr",
        "-f",
        "PostgreSQL",
        # Use DB_NAME here as well so the target database is defined in one place.
        f"PG:dbname={DB_NAME}",
        # The shapefile inside the archive shares the archive's base name.
        f"/vsizip/{TRACT_ZIP.resolve()}/{TRACT_ZIP.stem}.shp",
        "-nln",
        BOUNDARY_STAGE_TABLE,
        "-overwrite",
        "-nlt",
        "MULTIPOLYGON",
        "-t_srs",
        "EPSG:4326",
        "-lco",
        "GEOMETRY_NAME=geom",
        "-lco",
        "FID=gid",
        "-where",
        where,
    ]
    subprocess.run(cmd, check=True, env=env)
def fetch_acs_for_state(state_fips):
    """Fetch the ACS profile variables for every tract of one state.

    Requests ``NAME`` plus all ``ACS_VARIABLES`` keys from the Census ACS
    5-year profile endpoint for ``ACS_YEAR``. The ``CENSUS_API_KEY``
    environment variable is sent as the API key when it is set.

    Args:
        state_fips: two-digit state FIPS code string.

    Returns:
        A list with one dict per tract containing ``geoid``, ``acs_name``,
        ``statefp``, ``countyfp``, ``tractce``, one cleaned value per
        ``ACS_VARIABLES`` column and the derived ``primary_industry*`` fields.

    Raises:
        RuntimeError: if the request fails with an HTTP error, the response
            is not JSON, or the payload has no header row.
    """
    # Imported explicitly: the module only imports urllib.parse and
    # urllib.request, and relying on urllib.request to pull in urllib.error
    # as a side effect is fragile.
    from urllib.error import HTTPError

    variables = ["NAME", *ACS_VARIABLES.keys()]
    params = {
        "get": ",".join(variables),
        "for": "tract:*",
        "in": f"state:{state_fips} county:*",
    }
    api_key = os.environ.get("CENSUS_API_KEY")
    if api_key:
        params["key"] = api_key
    url = (
        f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5/profile?"
        + urllib.parse.urlencode(params)
    )
    try:
        with urllib.request.urlopen(url, timeout=120) as response:
            body = response.read().decode("utf-8")
    except HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace")
        # Separate the status code from the body excerpt; they used to be fused.
        raise RuntimeError(
            f"Census ACS request failed for state {state_fips}: HTTP {exc.code}: {body[:300]}"
        ) from exc
    try:
        data = json.loads(body)
    except json.JSONDecodeError as exc:
        raise RuntimeError(
            f"Census ACS returned non-JSON for state {state_fips}: {body[:300]}"
        ) from exc
    if not isinstance(data, list) or not data:
        # Without a header row there is nothing to zip the value rows against.
        raise RuntimeError(
            f"Census ACS returned no header row for state {state_fips}: {body[:300]}"
        )

    header, *records = data
    rows = []
    for values in records:
        raw = dict(zip(header, values))
        row = {
            "geoid": raw["state"] + raw["county"] + raw["tract"],
            "acs_name": raw["NAME"],
            "statefp": raw["state"],
            "countyfp": raw["county"],
            "tractce": raw["tract"],
        }
        for acs_var, column in ACS_VARIABLES.items():
            row[column] = clean_acs_value(raw.get(acs_var), column)
        add_primary_industry(row)
        rows.append(row)
    return rows
def clean_acs_value(value, column):
    """Convert one raw ACS API value to the Python type of its column.

    Args:
        value: raw value from the API response (may be None).
        column: output column name the value belongs to.

    Returns:
        None for None, ``""``, ``"null"`` or any ``SPECIAL_VALUES`` entry;
        an ``int`` for ``COUNT_COLUMNS``; a ``Decimal`` for
        ``NUMERIC_COLUMNS``; otherwise the value unchanged.
    """
    is_missing = value in (None, "", "null") or value in SPECIAL_VALUES
    if is_missing:
        return None
    if column in COUNT_COLUMNS:
        # Go through Decimal so strings such as "123.0" parse before int().
        return int(Decimal(value))
    return Decimal(value) if column in NUMERIC_COLUMNS else value
def add_primary_industry(row):
    """Add the largest industry of a tract to ``row`` in place.

    Sets three keys on ``row``:

    * ``primary_industry``: label of the ``INDUSTRY_COLUMNS`` entry with the
      highest non-None worker count (the first one wins a tie), or None.
    * ``primary_industry_workers``: that worker count, or None.
    * ``primary_industry_pct``: the count as a percentage of
      ``industry_total_workers``, or None when the total is missing or zero
      or no industry has a value.

    Args:
        row: dict of cleaned ACS values for one tract.
    """
    industry_total = row.get("industry_total_workers")

    reported = []
    for column in INDUSTRY_COLUMNS:
        workers = row.get(column)
        if workers is not None:
            reported.append((column, workers))

    if reported:
        # max() keeps the first of several equal maxima, matching a strict ">" scan.
        best_column, best_value = max(reported, key=lambda pair: pair[1])
    else:
        best_column, best_value = None, None

    row["primary_industry"] = INDUSTRY_COLUMNS.get(best_column)
    row["primary_industry_workers"] = best_value
    row["primary_industry_pct"] = (
        Decimal(best_value * 100) / Decimal(industry_total)
        if industry_total and best_value is not None
        else None
    )
def fetch_acs(state_fips):
    """Fetch ACS rows for several states and write them to the audit CSV.

    Args:
        state_fips: iterable of two-digit state FIPS code strings.

    Returns:
        ``(rows, fieldnames)``: the combined tract dicts in request order, and
        the ordered column names used for the CSV header.
    """
    rows = [
        tract_row
        for state in state_fips
        for tract_row in fetch_acs_for_state(state)
    ]
    identifier_columns = ["geoid", "acs_name", "statefp", "countyfp", "tractce"]
    derived_columns = [
        "primary_industry",
        "primary_industry_workers",
        "primary_industry_pct",
    ]
    fieldnames = [*identifier_columns, *ACS_VARIABLES.values(), *derived_columns]

    with ACS_AUDIT_CSV.open("w", newline="", encoding="utf-8") as csv_file:
        audit = csv.DictWriter(csv_file, fieldnames=fieldnames)
        audit.writeheader()
        audit.writerows(rows)
    return rows, fieldnames
def load_acs_stage(conn, rows, fieldnames):
    """Recreate ``ACS_STAGE_TABLE`` and bulk-insert the fetched ACS rows.

    The drop, create, insert and analyze all run in a single transaction.

    Args:
        conn: open psycopg2 connection.
        rows: list of tract dicts.
        fieldnames: ordered column names; used both for the insert column
            list and to pull the values out of each row dict.
    """
    create_sql = f"""
        create table {ACS_STAGE_TABLE} (
            geoid text primary key,
            acs_name text,
            statefp text,
            countyfp text,
            tractce text,
            population integer,
            median_age numeric,
            households integer,
            avg_household_size numeric,
            high_school_or_higher_pct numeric,
            bachelor_or_higher_pct numeric,
            broadband_subscription_pct numeric,
            population_16_over integer,
            labor_force integer,
            unemployed integer,
            unemployment_rate numeric,
            industry_total_workers integer,
            industry_agriculture_mining_workers integer,
            industry_construction_workers integer,
            industry_manufacturing_workers integer,
            industry_wholesale_trade_workers integer,
            industry_retail_trade_workers integer,
            industry_transportation_warehousing_utilities_workers integer,
            industry_information_workers integer,
            industry_finance_real_estate_workers integer,
            industry_professional_management_admin_workers integer,
            industry_education_health_social_workers integer,
            industry_arts_entertainment_food_workers integer,
            industry_other_services_workers integer,
            industry_public_administration_workers integer,
            median_household_income integer,
            per_capita_income integer,
            family_poverty_rate numeric,
            poverty_rate numeric,
            hispanic_latino_population integer,
            hispanic_latino_pct numeric,
            non_hispanic_white_population integer,
            non_hispanic_white_pct numeric,
            non_hispanic_black_population integer,
            non_hispanic_black_pct numeric,
            non_hispanic_asian_population integer,
            non_hispanic_asian_pct numeric,
            primary_industry text,
            primary_industry_workers integer,
            primary_industry_pct numeric
        )
    """
    with conn, conn.cursor() as cur:
        cur.execute(f"drop table if exists {ACS_STAGE_TABLE}")
        cur.execute(create_sql)

        # Keys missing from a row become NULL because dict.get() returns None.
        values = []
        for row in rows:
            values.append(tuple(row.get(column) for column in fieldnames))
        insert_sql = f"insert into {ACS_STAGE_TABLE} ({', '.join(fieldnames)}) values %s"
        execute_values(cur, insert_sql, values, page_size=1000)

        cur.execute(f"analyze {ACS_STAGE_TABLE}")
def create_final_table(conn):
    """Create ``FINAL_TABLE``: tracts that contain data centers, plus ACS data.

    In one transaction this:

    1. (Re)creates a GiST index on the boundary staging table and analyzes it.
    2. Builds ``FINAL_TABLE`` from every staged tract that covers at least one
       ``POINT_TABLE`` row, with per-source point counts, the ids and distinct
       operators of those points, and the ACS columns (left-joined, so they
       are NULL when no ACS row matches the tract).
    3. Adds the primary key, three indexes and a table comment, then analyzes.

    Args:
        conn: open psycopg2 connection.
    """
    create_sql = f"""
        create table {FINAL_TABLE} as
        with dc_tracts as (
            select
                t.geoid,
                count(*)::integer as data_center_count,
                count(*) filter (where dc.source = 'curated')::integer
                    as curated_only_data_center_count,
                count(*) filter (where dc.source = 'merged')::integer
                    as merged_data_center_count,
                count(*) filter (where dc.source = 'osm')::integer
                    as osm_only_data_center_count,
                array_agg(dc.{POINT_ID_COL} order by dc.{POINT_ID_COL}) as data_center_ids,
                array_agg(distinct dc.operator) filter (where dc.operator is not null)
                    as operators
            from {BOUNDARY_STAGE_TABLE} t
            join {POINT_TABLE} dc
                on t.geom && dc.geom
                and ST_Covers(t.geom, dc.geom)
            group by t.geoid
        )
        select
            t.geoid,
            t.statefp,
            t.countyfp,
            t.tractce,
            t.name as tract_name,
            t.namelsad,
            t.aland::bigint as land_area_sqm,
            t.awater::bigint as water_area_sqm,
            {ACS_YEAR}::integer as acs_year,
            '{ACS_SOURCE}'::text as acs_source,
            a.acs_name,
            d.data_center_count,
            d.curated_only_data_center_count,
            d.merged_data_center_count,
            d.osm_only_data_center_count,
            d.data_center_ids,
            d.operators,
            a.population,
            a.median_age,
            a.households,
            a.avg_household_size,
            a.high_school_or_higher_pct,
            a.bachelor_or_higher_pct,
            a.broadband_subscription_pct,
            a.population_16_over,
            a.labor_force,
            a.unemployed,
            a.unemployment_rate,
            a.median_household_income,
            a.per_capita_income,
            a.family_poverty_rate,
            a.poverty_rate,
            a.hispanic_latino_population,
            a.hispanic_latino_pct,
            a.non_hispanic_white_population,
            a.non_hispanic_white_pct,
            a.non_hispanic_black_population,
            a.non_hispanic_black_pct,
            a.non_hispanic_asian_population,
            a.non_hispanic_asian_pct,
            a.industry_total_workers,
            a.industry_agriculture_mining_workers,
            a.industry_construction_workers,
            a.industry_manufacturing_workers,
            a.industry_wholesale_trade_workers,
            a.industry_retail_trade_workers,
            a.industry_transportation_warehousing_utilities_workers,
            a.industry_information_workers,
            a.industry_finance_real_estate_workers,
            a.industry_professional_management_admin_workers,
            a.industry_education_health_social_workers,
            a.industry_arts_entertainment_food_workers,
            a.industry_other_services_workers,
            a.industry_public_administration_workers,
            a.primary_industry,
            a.primary_industry_workers,
            a.primary_industry_pct,
            t.geom::geometry(MultiPolygon, 4326) as geom
        from {BOUNDARY_STAGE_TABLE} t
        join dc_tracts d on d.geoid = t.geoid
        left join {ACS_STAGE_TABLE} a on a.geoid = t.geoid
    """
    comment_sql = f"""
        comment on table {FINAL_TABLE} is
        'Census tracts containing records from public.master_data_centers (curated + OSM merged), enriched with ACS 2024 5-year profile demographics and derived primary industry fields.'
    """
    # Executed strictly in this order inside a single transaction.
    statements = (
        "drop index if exists _dc_census_tract_boundaries_2024_geom_gix",
        f"create index _dc_census_tract_boundaries_2024_geom_gix on {BOUNDARY_STAGE_TABLE} using gist (geom)",
        f"analyze {BOUNDARY_STAGE_TABLE}",
        create_sql,
        f"alter table {FINAL_TABLE} add primary key (geoid)",
        f"create index data_center_census_tracts_2024_geom_gix on {FINAL_TABLE} using gist (geom)",
        f"create index data_center_census_tracts_2024_state_county_idx on {FINAL_TABLE} (statefp, countyfp)",
        f"create index data_center_census_tracts_2024_dc_count_idx on {FINAL_TABLE} (data_center_count desc)",
        comment_sql,
        f"analyze {FINAL_TABLE}",
    )
    with conn, conn.cursor() as cur:
        for statement in statements:
            cur.execute(statement)
def assign_point_geoids(conn):
    """Write the covering tract's geoid onto every ``POINT_TABLE`` row.

    Adds a ``geoid`` text column when missing, then sets it on every row to
    the geoid of a staged tract that covers the point (the lowest geoid when
    several do, NULL when none does). Finally the column is indexed and the
    table analyzed. All statements share one transaction.

    Args:
        conn: open psycopg2 connection.
    """
    update_sql = f"""
        update {POINT_TABLE} dc
        set geoid = matched.geoid
        from (
            select
                dc_inner.{POINT_ID_COL} as point_id,
                (
                    select t.geoid
                    from {BOUNDARY_STAGE_TABLE} t
                    where t.geom && dc_inner.geom
                        and st_covers(t.geom, dc_inner.geom)
                    order by t.geoid
                    limit 1
                ) as geoid
            from {POINT_TABLE} dc_inner
        ) matched
        where dc.{POINT_ID_COL} = matched.point_id
    """
    statements = (
        f"alter table {POINT_TABLE} add column if not exists geoid text",
        update_sql,
        f"create index if not exists master_data_centers_geoid_idx on {POINT_TABLE} (geoid)",
        f"analyze {POINT_TABLE}",
    )
    with conn, conn.cursor() as cur:
        for statement in statements:
            cur.execute(statement)
def validate(conn):
    """Collect sanity-check figures about the freshly built tables.

    Args:
        conn: open psycopg2 connection.

    Returns:
        A 5-tuple ``(summary, total_points, point_source_breakdown,
        unassigned_points, missing_acs)`` where ``summary`` is the row
        ``(tract_rows, assigned_data_centers, geom_rows)`` from
        ``FINAL_TABLE``, ``total_points`` counts ``POINT_TABLE`` rows,
        ``point_source_breakdown`` is a list of ``(source, count)`` rows,
        ``unassigned_points`` counts points with a NULL geoid and
        ``missing_acs`` counts final tracts with a NULL population.
    """

    def fetch_scalar(cur, sql):
        # Run a query that yields exactly one row and return its first column.
        cur.execute(sql)
        return cur.fetchone()[0]

    summary_sql = f"""
        select
            count(*)::integer as tract_rows,
            coalesce(sum(data_center_count), 0)::integer as assigned_data_centers,
            count(*) filter (where geom is not null)::integer as geom_rows
        from {FINAL_TABLE}
    """
    breakdown_sql = f"""
        select source, count(*)::integer
        from {POINT_TABLE}
        group by source
        order by source
    """
    unassigned_sql = f"""
        select count(*)::integer
        from {POINT_TABLE}
        where geoid is null
    """
    missing_acs_sql = f"""
        select count(*)::integer
        from {FINAL_TABLE}
        where population is null
    """
    with conn.cursor() as cur:
        cur.execute(summary_sql)
        summary = cur.fetchone()
        total_points = fetch_scalar(cur, f"select count(*)::integer from {POINT_TABLE}")
        cur.execute(breakdown_sql)
        point_source_breakdown = cur.fetchall()
        unassigned_points = fetch_scalar(cur, unassigned_sql)
        missing_acs = fetch_scalar(cur, missing_acs_sql)
    return summary, total_points, point_source_breakdown, unassigned_points, missing_acs
def main():
    """Build the census-tract enrichment table end to end.

    Steps: determine the states to cover, check (or drop, with
    ``--replace-final``) the final table, download and import the tract
    boundaries, fetch the ACS data, then on a fresh connection repeat the
    final-table check, load the ACS stage, create the final table, assign
    geoids to the points and print validation figures.
    """
    parser = argparse.ArgumentParser(
        description="Build census-tract enrichment table for data-center points."
    )
    parser.add_argument(
        "--replace-final",
        action="store_true",
        help="Drop and rebuild the final tract table if it already exists.",
    )
    args = parser.parse_args()

    def prepare_final_table(connection):
        # Either clear the way for a rebuild or refuse to overwrite.
        if args.replace_final:
            drop_final_table_if_exists(connection)
        else:
            ensure_final_table_absent(connection)

    # First connection: cheap checks before the slow download/import steps.
    conn = connect()
    try:
        state_fips = get_state_fips(conn)
        prepare_final_table(conn)
    finally:
        conn.close()

    download_tract_boundaries()
    import_tract_boundaries(state_fips)
    acs_rows, acs_fieldnames = fetch_acs(state_fips)

    # Second connection: the check is repeated before the actual build.
    conn = connect()
    try:
        prepare_final_table(conn)
        load_acs_stage(conn, acs_rows, acs_fieldnames)
        create_final_table(conn)
        assign_point_geoids(conn)
        results = validate(conn)
    finally:
        conn.close()
    summary, total_points, point_source_breakdown, unassigned_points, missing_acs = results

    print(f"loaded {len(acs_rows)} ACS tract rows into {ACS_STAGE_TABLE}")
    print(f"created {FINAL_TABLE}")
    print(
        "tract_rows={0} assigned_data_centers={1} geom_rows={2} source_points={3}".format(
            summary[0], summary[1], summary[2], total_points
        )
    )
    print("point_source=" + ", ".join(f"{k}:{v}" for k, v in point_source_breakdown))
    print(f"points_unassigned_to_tract={unassigned_points}")
    print(f"tracts_missing_acs_population={missing_acs}")
# Script entry point: run the build only when executed directly.
if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

686
scripts/ingest_legiscan.py Normal file
View File

@@ -0,0 +1,686 @@
#!/usr/bin/env python3
"""
Ingest LegiScan legislative datasets for all US states (2016-2026) into PostgreSQL.
Fetches all state session datasets from the LegiScan API, parses bill JSONs from
each ZIP archive, and loads them into the data_centers PostgreSQL database. Bills are
tagged with relevance categories (data_center, large_load, ratepayer_protection, etc.).
Usage:
python ingest_legiscan.py [--all | --setup-db | --fetch | --load | --tag]
[--state XX] [--year-start YYYY] [--dry-run] [--verbose]
Options:
--all Run all phases in sequence
--setup-db Create/update database tables and indexes
--fetch Download dataset ZIPs for all states (uses hash caching)
--load Parse cached ZIPs and insert/update bills in DB
--tag (Re)apply relevance tagging to all loaded bills
--state XX Restrict to one state (e.g., CA)
--year-start N Earliest session year to include (default: 2016)
--dry-run Print what would be done; no API calls or DB writes
--verbose Extra progress output
Environment:
LEGISCAN_API_KEY Required
PGWEB_HOST, PGWEB_PORT,
PGWEB_USER, PGWEB_PASSWORD PostgreSQL connection (DB: data_centers)
"""
import argparse
import base64
import io
import json
import logging
import os
import sys
import time
import zipfile
from datetime import datetime
from pathlib import Path
from typing import Optional
import psycopg2
import psycopg2.extras
import requests
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Database that get_db_connection() targets.
DB_NAME = "data_centers"
# Read once at import time; None when LEGISCAN_API_KEY is not set.
API_KEY = os.environ.get("LEGISCAN_API_KEY")
API_BASE = "https://api.legiscan.com/"
# Downloaded dataset ZIPs are cached here, relative to the working directory.
CACHE_DIR = Path("data/legiscan_cache")
# Default earliest session start year.
MIN_YEAR_DEFAULT = 2016
RATE_LIMIT_DELAY = 0.5  # seconds between API calls
# Keyword categories for relevance tagging.
# Keys become the tag values stored in legiscan_bills.relevance_tags[].
RELEVANCE_KEYWORDS: dict[str, list[str]] = {
"data_center": [
"data center", "data centre", "hyperscale", "colocation", "colo facility",
"server farm", "cloud computing facility", "internet exchange",
"carrier hotel", "artificial intelligence facility", "ai campus",
"ai data center", "gpu cluster", "compute facility",
"high performance computing", "hpc facility", "data hall",
"network access point", "data warehousing facility",
],
"large_load": [
"large load", "large power consumer", "large electricity consumer",
"high electricity consumption", "high power consumption",
"megawatt load", "gigawatt load", "cryptocurrency mining",
"bitcoin mining", "blockchain mining", "crypto mining",
"digital asset mining", "proof of work", "electric arc furnace",
"large industrial customer", "high-density load", "new large load",
"load growth", "extraordinary load",
],
"ratepayer_protection": [
"ratepayer", "rate payer", "cost shift", "cost shifting",
"cost allocation", "cross-subsidy", "cross subsidy",
"rate design", "rate structure", "electricity rate",
"electric rate", "utility rate", "rate increase", "rate burden",
"rate base", "stranded cost", "rate class", "customer protection",
"consumer protection", "electric customer", "residential customer",
"demand charge", "transmission cost", "grid upgrade cost",
"interconnection cost", "cost recovery", "rate relief",
"affordability", "energy burden",
],
"grid_impact": [
"grid reliability", "grid stability", "grid congestion",
"grid modernization", "grid infrastructure", "electric grid",
"power grid", "electricity grid", "transmission upgrade",
"transmission expansion", "interconnection queue",
"interconnection study", "demand response", "curtailment",
"grid capacity", "system reliability", "capacity expansion",
"electric system", "power system reliability", "grid resilience",
"grid planning", "integrated resource plan",
],
"water_use": [
"water consumption", "cooling water", "water efficiency",
"water use effectiveness", "evaporative cooling",
"water withdrawal", "water discharge", "water impact",
"water footprint", "cooling tower", "water-cooled",
"once-through cooling", "recycled water", "water stress",
"water scarcity",
],
"tax_incentive": [
"tax credit", "tax exemption", "tax abatement", "tax incentive",
"sales tax exemption", "property tax exemption", "tax break",
"tax relief", "enterprise zone", "economic incentive",
"business incentive", "investment credit", "job creation credit",
"economic development incentive", "opportunity zone",
"tax subsidy",
],
"energy_policy": [
"renewable energy", "clean energy", "energy efficiency",
"power purchase agreement", " ppa ", "green tariff",
"clean power", "carbon neutral", "net zero", "decarbonization",
"energy procurement", "24/7 clean energy", "carbon-free",
"clean electricity", "energy storage", "virtual power plant",
"net metering", "green power",
],
"siting_permitting": [
"conditional use permit", "special use permit", "land use permit",
"zoning", "facility siting", "environmental review",
"environmental impact", "noise ordinance", "setback requirement",
"building permit", "construction permit", "site approval",
"local approval", "permit requirement", "permitting process",
"local control", "preemption",
],
}
# Status code labels (LegiScan)
STATUS_LABELS = {
0: "N/A", 1: "Introduced", 2: "Engrossed", 3: "Enrolled",
4: "Passed", 5: "Vetoed", 6: "Failed", 7: "Override",
8: "Chaptered", 9: "Referred", 10: "Report Pass",
11: "Report DNP", 12: "Draft",
}
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
datefmt="%H:%M:%S",
)
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Database
# ---------------------------------------------------------------------------
def get_db_connection():
    """Return a new psycopg2 connection to ``DB_NAME``.

    Host, port, user and password come from the PGWEB_HOST, PGWEB_PORT,
    PGWEB_USER and PGWEB_PASSWORD environment variables.

    Raises:
        KeyError: if one of those variables is not set.
    """
    variable_names = ("PGWEB_HOST", "PGWEB_PORT", "PGWEB_USER", "PGWEB_PASSWORD")
    host, port, user, password = [os.environ[name] for name in variable_names]
    return psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=DB_NAME,
    )
DDL = """
CREATE TABLE IF NOT EXISTS legiscan_sessions (
session_id INTEGER PRIMARY KEY,
state_id INTEGER NOT NULL,
state_abbr VARCHAR(2) NOT NULL,
year_start INTEGER NOT NULL,
year_end INTEGER NOT NULL,
session_title TEXT,
session_tag TEXT,
is_special BOOLEAN DEFAULT FALSE,
is_prior BOOLEAN DEFAULT FALSE,
dataset_hash VARCHAR(32),
dataset_date DATE,
dataset_size_mb FLOAT,
bill_count INTEGER DEFAULT 0,
imported_at TIMESTAMPTZ
);
CREATE TABLE IF NOT EXISTS legiscan_bills (
bill_id INTEGER PRIMARY KEY,
session_id INTEGER REFERENCES legiscan_sessions(session_id),
state VARCHAR(2) NOT NULL,
bill_number VARCHAR(50),
bill_type VARCHAR(10),
title TEXT,
description TEXT,
status INTEGER,
status_date DATE,
completed INTEGER DEFAULT 0,
body VARCHAR(10),
url TEXT,
state_link TEXT,
change_hash VARCHAR(32),
subjects TEXT[],
sponsor_count INTEGER DEFAULT 0,
vote_count INTEGER DEFAULT 0,
text_count INTEGER DEFAULT 0,
is_relevant BOOLEAN DEFAULT FALSE,
relevance_tags TEXT[],
imported_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_ls_bills_state ON legiscan_bills(state);
CREATE INDEX IF NOT EXISTS idx_ls_bills_session ON legiscan_bills(session_id);
CREATE INDEX IF NOT EXISTS idx_ls_bills_status ON legiscan_bills(status);
CREATE INDEX IF NOT EXISTS idx_ls_bills_relevant ON legiscan_bills(is_relevant) WHERE is_relevant;
CREATE INDEX IF NOT EXISTS idx_ls_bills_subjects ON legiscan_bills USING gin(subjects);
CREATE INDEX IF NOT EXISTS idx_ls_bills_rtags ON legiscan_bills USING gin(relevance_tags);
CREATE INDEX IF NOT EXISTS idx_ls_bills_fts ON legiscan_bills
USING gin(to_tsvector('english',
COALESCE(title, '') || ' ' || COALESCE(description, '')));
"""
def setup_db(conn):
    """Create the LegiScan tables and indexes if they do not exist yet.

    Executes the module-level ``DDL`` script, commits, and logs completion.

    Args:
        conn: open psycopg2 connection.
    """
    cursor = conn.cursor()
    with cursor:
        cursor.execute(DDL)
    conn.commit()
    log.info("Database tables and indexes ready.")
# ---------------------------------------------------------------------------
# LegiScan API helpers
# ---------------------------------------------------------------------------
def _api_get(params: dict, timeout: int = 120) -> dict:
    """Make one LegiScan API call and return the parsed JSON.

    Args:
        params: query parameters for the call (for example
            ``{"op": "getDatasetList"}``). The dict is not modified.
        timeout: request timeout in seconds.

    Returns:
        The decoded JSON response, whose ``status`` field is ``"OK"``.

    Raises:
        RuntimeError: if ``LEGISCAN_API_KEY`` is not configured, or if the
            API answers with a status other than ``"OK"``.
        requests.HTTPError: if the HTTP response has an error status code.
    """
    if not API_KEY:
        # Fail before touching the network; without a key the call cannot succeed.
        raise RuntimeError("LEGISCAN_API_KEY is not set; cannot call the LegiScan API.")
    # Build a new dict so the API key never leaks into the caller's params.
    query = {**params, "key": API_KEY}
    resp = requests.get(API_BASE, params=query, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()
    if data.get("status") != "OK":
        raise RuntimeError(f"LegiScan API error: {data}")
    return data
def get_all_dataset_metadata(year_start: int, state_filter: Optional[str] = None) -> list[dict]:
    """Return dataset-list entries whose ``year_start`` is at least *year_start*.

    Fetches the full dataset list with one API call. When *state_filter* is
    given, a second per-state call supplies the set of session ids belonging
    to that state, and only those sessions are kept.

    Args:
        year_start: Minimum ``year_start`` a session must have to be kept.
        state_filter: Optional state value forwarded to the per-state call.

    Returns:
        The filtered list of session metadata dicts.
    """
    log.info("Fetching dataset list from LegiScan…")
    listing = _api_get({"op": "getDatasetList"})
    everything = listing["datasetlist"]
    log.info(f" Total sessions across all states: {len(everything)}")

    selected = [entry for entry in everything if entry["year_start"] >= year_start]

    if state_filter:
        # The per-state listing gives us the session ids that belong to the state.
        log.info(f" Filtering to state {state_filter}")
        per_state = _api_get({"op": "getDatasetList", "state": state_filter})
        allowed_ids = {entry["session_id"] for entry in per_state["datasetlist"]}
        selected = [entry for entry in selected if entry["session_id"] in allowed_ids]

    log.info(f" Sessions matching filters: {len(selected)}")
    return selected
def download_dataset_zip(session: dict, dry_run: bool = False) -> tuple[Optional[bytes], bool]:
    """Download a dataset ZIP via the API, caching it on disk.

    The cache file is named ``{session_id}_{dataset_hash}.zip`` inside
    ``CACHE_DIR``, so a changed hash results in a fresh download.

    Args:
        session: Dataset-list entry; must contain ``session_id``,
            ``dataset_hash`` and ``access_key`` (and ``dataset_size`` when a
            download or dry-run message is needed).
        dry_run: When true, report what would be downloaded instead of
            calling the API.

    Returns:
        ``(zip_bytes, api_call_made)``. ``zip_bytes`` is ``None`` only for a
        dry run with no cached file. ``api_call_made`` is true only when the
        network was actually hit, so the caller can rate-limit appropriately.
    """
    session_id = session["session_id"]
    dataset_hash = session["dataset_hash"]
    access_key = session["access_key"]

    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    cache_path = CACHE_DIR / f"{session_id}_{dataset_hash}.zip"

    if cache_path.exists():
        log.debug(f" Cache hit: {cache_path.name}")
        return cache_path.read_bytes(), False

    if dry_run:
        log.info(f" [dry-run] Would download session {session_id} ({session['dataset_size'] / 1e6:.1f} MB)")
        return None, False

    log.info(f" Downloading session {session_id} ({session['dataset_size'] / 1e6:.1f} MB)…")
    data = _api_get({"op": "getDataset", "access_key": access_key, "id": session_id})
    # The API delivers the archive base64-encoded inside the JSON payload.
    zip_bytes = base64.b64decode(data["dataset"]["zip"])
    cache_path.write_bytes(zip_bytes)
    log.info(f" Cached → {cache_path.name}")
    return zip_bytes, True
# ---------------------------------------------------------------------------
# Relevance tagging
# ---------------------------------------------------------------------------
def score_relevance(title: str, description: str, subjects: list[str]) -> tuple[bool, list[str]]:
    """Return ``(is_relevant, matched_tags)`` for one bill.

    The title, description and subject names are lower-cased and joined into
    a single search text. A tag from ``RELEVANCE_KEYWORDS`` matches when any
    of its keywords occurs as a substring of that text.

    Args:
        title: Bill title; falsy values are treated as empty.
        description: Bill description; falsy values are treated as empty.
        subjects: Subject names attached to the bill.

    Returns:
        A tuple of the relevance flag (true when at least one tag matched)
        and the matched tags in ``RELEVANCE_KEYWORDS`` iteration order.
    """
    subject_text = " ".join(subject.lower() for subject in subjects)
    haystack = " ".join([(title or "").lower(), (description or "").lower(), subject_text])

    matched = [
        tag
        for tag, keywords in RELEVANCE_KEYWORDS.items()
        if any(keyword in haystack for keyword in keywords)
    ]
    return bool(matched), matched
# ---------------------------------------------------------------------------
# ZIP processing and DB loading
# ---------------------------------------------------------------------------
def _state_abbr_from_zip(zf: zipfile.ZipFile) -> str:
    """Extract the state abbreviation from the ZIP's path structure.

    Scans the archive member names and returns the first top-level directory
    component that is exactly two alphabetic characters. A bare top-level
    file whose name happens to be two characters long is ignored, because it
    is not a directory component.

    Args:
        zf: Open archive to inspect.

    Returns:
        The two-letter directory name, or ``"??"`` when none is found.
    """
    for name in zf.namelist():
        top, sep, _rest = name.partition("/")
        # ``sep`` is empty for members that live at the archive root.
        if sep and len(top) == 2 and top.isalpha():
            return top
    return "??"
def process_dataset(
    session: dict,
    zip_bytes: bytes,
    conn,
    state_abbr: Optional[str] = None,
    dry_run: bool = False,
    verbose: bool = False,
) -> int:
    """Parse all bill JSONs from a ZIP and upsert them into legiscan_bills.

    Every archive member whose path contains ``/bill/`` and ends in ``.json``
    is parsed, scored with ``score_relevance`` and turned into one row.
    Files that cannot be parsed, or that carry no ``bill_id``, are skipped
    with a warning so a single bad file does not abort the whole session.

    Existing rows are only rewritten when their ``change_hash`` differs. In
    that case the title and description are refreshed together with the
    relevance fields, so the stored text always matches the text the
    relevance tags were computed from.

    Args:
        session: Dataset-list entry; ``session_id`` is required.
        zip_bytes: Raw bytes of the dataset archive.
        conn: Open database connection (unused when *dry_run* is true).
        state_abbr: State code to use when a bill has no ``state`` field;
            derived from the archive layout when omitted.
        dry_run: When true, only report how many bills would be written.
        verbose: When true, log a per-session summary after loading.

    Returns:
        The number of bill rows parsed from the archive.
    """
    page_size = 500
    session_id = session["session_id"]
    rows = []

    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        if not state_abbr:
            state_abbr = _state_abbr_from_zip(zf)

        bill_files = [n for n in zf.namelist() if "/bill/" in n and n.endswith(".json")]
        if not bill_files:
            log.warning(f" Session {session_id}: no bill files found in ZIP.")
            return 0

        for fname in bill_files:
            try:
                raw = json.loads(zf.read(fname))
                b = raw.get("bill", raw)
            except Exception as e:
                log.warning(f" Could not parse {fname}: {e}")
                continue

            bill_id = b.get("bill_id")
            if bill_id is None:
                # bill_id is the primary key; without it the row cannot be stored.
                log.warning(f" Skipping {fname}: no bill_id in payload.")
                continue

            subjects = [s["subject_name"] for s in (b.get("subjects") or []) if s.get("subject_name")]
            is_rel, tags = score_relevance(
                b.get("title", ""),
                b.get("description", ""),
                subjects,
            )
            status_date = b.get("status_date") or None

            rows.append((
                bill_id,
                session_id,
                b.get("state", state_abbr),
                b.get("bill_number"),
                b.get("bill_type"),
                b.get("title"),
                b.get("description"),
                b.get("status"),
                status_date,
                b.get("completed", 0),
                b.get("body"),
                b.get("url"),
                b.get("state_link"),
                b.get("change_hash"),
                subjects or None,
                len(b.get("sponsors") or []),
                len(b.get("votes") or []),
                len(b.get("texts") or []),
                is_rel,
                tags or None,
            ))

    if dry_run:
        log.info(f" [dry-run] Session {session_id} ({state_abbr}): would insert/update {len(rows)} bills")
        return len(rows)

    UPSERT = """
INSERT INTO legiscan_bills (
bill_id, session_id, state, bill_number, bill_type,
title, description, status, status_date, completed,
body, url, state_link, change_hash, subjects,
sponsor_count, vote_count, text_count,
is_relevant, relevance_tags, imported_at
) VALUES %s
ON CONFLICT (bill_id) DO UPDATE SET
change_hash = EXCLUDED.change_hash,
title = EXCLUDED.title,
description = EXCLUDED.description,
status = EXCLUDED.status,
status_date = EXCLUDED.status_date,
completed = EXCLUDED.completed,
subjects = EXCLUDED.subjects,
sponsor_count = EXCLUDED.sponsor_count,
vote_count = EXCLUDED.vote_count,
text_count = EXCLUDED.text_count,
is_relevant = EXCLUDED.is_relevant,
relevance_tags = EXCLUDED.relevance_tags,
imported_at = NOW()
WHERE legiscan_bills.change_hash IS DISTINCT FROM EXCLUDED.change_hash
"""
    template = "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,NOW())"

    # execute_values runs one statement per page and cursor.rowcount only
    # reflects the last statement, so feed it one page at a time and sum.
    count = 0
    with conn.cursor() as cur:
        for start in range(0, len(rows), page_size):
            page = rows[start:start + page_size]
            psycopg2.extras.execute_values(cur, UPSERT, page, template=template, page_size=page_size)
            count += cur.rowcount

    # Update session bill_count
    with conn.cursor() as cur:
        cur.execute(
            "UPDATE legiscan_sessions SET bill_count = %s, imported_at = NOW() WHERE session_id = %s",
            (len(rows), session_id),
        )
    conn.commit()

    if verbose:
        relevant = sum(1 for r in rows if r[18])  # index 18 is is_relevant
        log.info(f" Session {session_id} ({state_abbr}): {len(rows)} bills, {relevant} relevant, {count} upserted")
    return len(rows)
def upsert_session(session: dict, state_abbr: str, conn, dry_run: bool = False):
    """Insert a session record, or refresh its dataset fields if it exists.

    On a ``session_id`` conflict only the dataset hash, date, size and the
    session title are overwritten. The change is committed immediately.

    Args:
        session: Dataset-list entry; ``session_id``, ``state_id``,
            ``year_start`` and ``year_end`` are required keys.
        state_abbr: State abbreviation stored with the session.
        conn: Open database connection.
        dry_run: When true, do nothing.
    """
    if dry_run:
        return

    statement = """
INSERT INTO legiscan_sessions
(session_id, state_id, state_abbr, year_start, year_end,
session_title, session_tag, is_special, is_prior,
dataset_hash, dataset_date, dataset_size_mb)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
ON CONFLICT (session_id) DO UPDATE SET
dataset_hash = EXCLUDED.dataset_hash,
dataset_date = EXCLUDED.dataset_date,
dataset_size_mb = EXCLUDED.dataset_size_mb,
session_title = EXCLUDED.session_title
"""
    # dataset_size is divided by 1e6 to fill the *_mb column.
    size_mb = session.get("dataset_size", 0) / 1e6
    values = (
        session["session_id"],
        session["state_id"],
        state_abbr,
        session["year_start"],
        session["year_end"],
        session.get("session_title"),
        session.get("session_tag"),
        bool(session.get("special")),
        bool(session.get("prior")),
        session.get("dataset_hash"),
        session.get("dataset_date"),
        size_mb,
    )

    with conn.cursor() as cur:
        cur.execute(statement, values)
    conn.commit()
def needs_import(session: dict, conn) -> bool:
    """Tell whether a session must be (re)imported.

    Args:
        session: Dataset-list entry with ``session_id`` and ``dataset_hash``.
        conn: Open database connection.

    Returns:
        True when the session has no row in ``legiscan_sessions`` or when the
        stored ``dataset_hash`` differs from the one in *session*.
    """
    lookup = "SELECT dataset_hash FROM legiscan_sessions WHERE session_id = %s"
    with conn.cursor() as cur:
        cur.execute(lookup, (session["session_id"],))
        stored = cur.fetchone()

    return stored is None or stored[0] != session["dataset_hash"]
# ---------------------------------------------------------------------------
# Retag phase
# ---------------------------------------------------------------------------
def retag_all_bills(conn, dry_run: bool = False, verbose: bool = False):
    """Re-score relevance for every bill already stored in the database.

    Reads the title, description and subjects of every row in
    ``legiscan_bills``, recomputes the relevance flag and tags with
    ``score_relevance`` and writes them back in bulk.

    Args:
        conn: Open database connection.
        dry_run: When true, only log how many bills would be tagged.
        verbose: Accepted for interface symmetry; not used by this function.
    """
    log.info("Re-tagging all bills…")
    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute("SELECT bill_id, title, description, subjects FROM legiscan_bills")
        bills = cur.fetchall()
    log.info(f" Scoring {len(bills)} bills…")

    updates = []
    for bill in bills:
        flag, matched = score_relevance(
            bill["title"] or "",
            bill["description"] or "",
            bill["subjects"] or [],
        )
        updates.append((flag, matched or None, bill["bill_id"]))

    relevant = sum(1 for entry in updates if entry[0])

    if dry_run:
        log.info(f" [dry-run] Would tag {relevant}/{len(updates)} bills as relevant")
        return

    bulk_update = (
        "UPDATE legiscan_bills SET is_relevant = data.is_rel, relevance_tags = data.tags "
        "FROM (VALUES %s) AS data(is_rel, tags, bill_id) "
        "WHERE legiscan_bills.bill_id = data.bill_id::integer"
    )
    with conn.cursor() as cur:
        psycopg2.extras.execute_values(
            cur,
            bulk_update,
            updates,
            template="(%s, %s::text[], %s)",
        )
    conn.commit()
    log.info(f" Tagged {relevant}/{len(updates)} bills as relevant.")
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
def print_summary(conn):
    """Print headline counts, per-tag counts and the top states to stdout.

    Args:
        conn: Open database connection used for the read-only queries.
    """
    headline_queries = (
        ("Total sessions", "SELECT COUNT(*) FROM legiscan_sessions"),
        ("Total bills", "SELECT COUNT(*) FROM legiscan_bills"),
        ("Relevant bills", "SELECT COUNT(*) FROM legiscan_bills WHERE is_relevant"),
        ("States covered", "SELECT COUNT(DISTINCT state) FROM legiscan_bills"),
    )

    print("\n--- LegiScan ingestion summary ---")
    with conn.cursor() as cur:
        for label, sql in headline_queries:
            cur.execute(sql)
            (value,) = cur.fetchone()
            print(f" {label}: {value:,}")

    # Top relevance tags
    with conn.cursor() as cur:
        cur.execute("""
SELECT tag, COUNT(*) AS n
FROM legiscan_bills, unnest(relevance_tags) AS tag
GROUP BY tag ORDER BY n DESC
""")
        tag_counts = cur.fetchall()
    if tag_counts:
        print("\n Relevant bills by tag:")
        for tag, n in tag_counts:
            print(f" {tag:<30} {n:>6,}")

    # Top states for relevant bills
    with conn.cursor() as cur:
        cur.execute("""
SELECT state, COUNT(*) AS n
FROM legiscan_bills WHERE is_relevant
GROUP BY state ORDER BY n DESC LIMIT 15
""")
        state_counts = cur.fetchall()
    if state_counts:
        print("\n Top states by relevant bill count:")
        for state, n in state_counts:
            print(f" {state} {n:>5,}")

    print()
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def parse_args():
    """Parse the command line for the LegiScan ingestion script.

    Returns:
        The ``argparse.Namespace`` holding the phase switches (``all``,
        ``setup_db``, ``fetch``, ``load``, ``tag``), the optional ``state``
        filter, ``year_start``, ``dry_run`` and ``verbose``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    phase_switches = (
        ("--all", "Run setup-db + fetch + load + tag"),
        ("--setup-db", "Create/update DB tables"),
        ("--fetch", "Download dataset ZIPs"),
        ("--load", "Load cached ZIPs into DB"),
        ("--tag", "Retag all bills for relevance"),
    )
    for flag, help_text in phase_switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    parser.add_argument("--state", default=None, metavar="XX", help="Limit to one state")
    parser.add_argument("--year-start", type=int, default=MIN_YEAR_DEFAULT, dest="year_start")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--verbose", action="store_true")
    return parser.parse_args()
def main():
    """Run the requested ingestion phases and print a summary.

    Phases are selected on the command line: setup (create tables), fetch
    (download dataset ZIPs), load (upsert bills) and tag (re-score stored
    bills). Fetch and load are interleaved per session so only one archive
    is held in memory at a time. The tag phase only runs on its own, because
    bills are already scored while they are loaded.

    In dry-run mode no persistent connection is opened and nothing is
    written. The tag preview still needs to read the stored bills, so it
    opens a short-lived connection of its own.

    Exits with status 1 when the API key is missing or no phase is selected.
    """
    args = parse_args()
    if args.verbose:
        log.setLevel(logging.DEBUG)

    if not API_KEY:
        log.error("LEGISCAN_API_KEY is not set.")
        sys.exit(1)

    do_setup = args.all or args.setup_db
    do_fetch = args.all or args.fetch
    do_load = args.all or args.load
    do_tag = args.all or args.tag

    if not any([do_setup, do_fetch, do_load, do_tag]):
        log.error("Specify at least one phase: --all, --setup-db, --fetch, --load, --tag")
        sys.exit(1)

    conn = None if args.dry_run else get_db_connection()

    # ── Setup ──────────────────────────────────────────────────────────────
    if do_setup:
        if args.dry_run:
            log.info("[dry-run] Would create legiscan_sessions and legiscan_bills tables.")
        else:
            setup_db(conn)

    # ── Fetch + Load (interleaved per session for memory efficiency) ────────
    if do_fetch or do_load:
        sessions = get_all_dataset_metadata(args.year_start, state_filter=args.state)
        total = len(sessions)
        log.info(f"Processing {total} sessions (year_start ≥ {args.year_start})…")

        total_bills = 0
        skipped = 0

        for i, session in enumerate(sessions, 1):
            session_id = session["session_id"]
            title = session.get("session_title", "")

            # Check if import needed
            if do_load and not args.dry_run and conn and not needs_import(session, conn):
                log.debug(f" [{i}/{total}] Session {session_id} ({title}) — hash unchanged, skipping.")
                skipped += 1
                continue

            log.info(f"[{i}/{total}] Session {session_id}: {title}")

            # Download
            zip_bytes = None
            if do_fetch:
                try:
                    zip_bytes, api_called = download_dataset_zip(session, dry_run=args.dry_run)
                    if api_called:
                        time.sleep(RATE_LIMIT_DELAY)
                except Exception as e:
                    log.error(f" Download failed for session {session_id}: {e}")
                    continue
            elif do_load:
                # Load from cache only
                cache_path = CACHE_DIR / f"{session_id}_{session['dataset_hash']}.zip"
                if not cache_path.exists():
                    log.warning(f" Cache miss for session {session_id} — run --fetch first.")
                    continue
                zip_bytes = cache_path.read_bytes()

            # Derive state abbreviation from ZIP structure
            state_abbr = args.state
            if zip_bytes and not state_abbr:
                try:
                    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
                        state_abbr = _state_abbr_from_zip(zf)
                except Exception:
                    state_abbr = "??"

            # Upsert session record
            if do_load and not args.dry_run and conn and state_abbr:
                upsert_session(session, state_abbr, conn, dry_run=args.dry_run)

            # Load bills
            if do_load and zip_bytes:
                try:
                    n = process_dataset(
                        session, zip_bytes, conn,
                        state_abbr=state_abbr,
                        dry_run=args.dry_run,
                        verbose=args.verbose,
                    )
                    total_bills += n
                except Exception as e:
                    log.error(f" Load failed for session {session_id}: {e}")
                    if conn:
                        conn.rollback()
                        # upsert_session has already committed the new
                        # dataset_hash. Clear it so needs_import() reports
                        # this session as stale and the next run retries it.
                        try:
                            with conn.cursor() as cur:
                                cur.execute(
                                    "UPDATE legiscan_sessions SET dataset_hash = NULL WHERE session_id = %s",
                                    (session_id,),
                                )
                            conn.commit()
                        except Exception as reset_err:
                            log.error(f" Could not reset hash for session {session_id}: {reset_err}")
                            conn.rollback()

        log.info(f"Fetch/load complete. Bills processed: {total_bills:,}. Skipped (up-to-date): {skipped}.")

    # ── Tag ────────────────────────────────────────────────────────────────
    if do_tag and not (do_fetch or do_load):
        if conn is not None:
            retag_all_bills(conn, dry_run=args.dry_run, verbose=args.verbose)
        elif args.dry_run:
            # The preview reads the stored bills, so it needs a connection
            # even though nothing is written.
            preview_conn = get_db_connection()
            try:
                retag_all_bills(preview_conn, dry_run=True, verbose=args.verbose)
            finally:
                preview_conn.close()

    # ── Summary ────────────────────────────────────────────────────────────
    if conn and not args.dry_run:
        print_summary(conn)
        conn.close()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,293 @@
#!/usr/bin/env python3
import argparse
import csv
import os
from decimal import Decimal
import psycopg2
from psycopg2.extras import execute_values
# Default input files, one per --source value. Both are relative paths, so
# they are resolved against the current working directory when opened.
CSV_PATH = "US_DC_Sample_geocoded.csv"
IM3_CSV_PATH = "new/IM3_Existing_DataCenters.csv"
# Schema-qualified target table and the database it lives in.
TABLE = "public.us_dc_sample_geocoded"
DB_NAME = "data_centers"
# Column order used both for the INSERT column list and for building each
# value tuple, so the two always line up.
ALL_COLS = [
    "id",
    "provider",
    "facility_name",
    "url",
    "provider_url",
    "country",
    "state",
    "state_code",
    "city",
    "postal_code",
    "street_address",
    "address",
    "source_address",
    "phone",
    "area_sqft",
    "power_mw",
    "nearest_airport_miles",
    "has_bare_metal",
    "has_iaas",
    "has_internet_exchange",
    "has_colocation",
    "certifications",
    "content_summary",
    "path",
    "longitude",
    "latitude",
    "geocode_source",
    "geocode_precision",
    "geocode_status",
    "geocode_match_address",
    "census_status",
    "census_match_type",
    "census_input_address",
    "census_tiger_line_id",
    "census_side",
    "nominatim_display_name",
    "nominatim_osm_type",
    "nominatim_osm_id",
]
# Columns converted with to_int / to_decimal / to_bool respectively; every
# other column is passed through as text (empty string becomes NULL).
INT_COLS = {"area_sqft", "census_tiger_line_id", "nominatim_osm_id"}
NUM_COLS = {"power_mw", "nearest_airport_miles", "longitude", "latitude"}
BOOL_COLS = {
    "has_bare_metal",
    "has_iaas",
    "has_internet_exchange",
    "has_colocation",
}
def to_int(value):
    """Convert a CSV cell to ``int``, mapping blank cells to ``None``.

    The value is parsed through ``Decimal`` first, so decimal and exponent
    notations such as ``"3.0"`` or ``"1e3"`` are accepted; any fractional
    part is truncated toward zero.

    Args:
        value: Cell content, or ``None``.

    Returns:
        The integer value, or ``None`` for ``None`` / empty string.
    """
    is_blank = value is None or value == ""
    return None if is_blank else int(Decimal(value))
def to_decimal(value):
    """Convert a CSV cell to ``Decimal``, mapping blank cells to ``None``.

    Args:
        value: Cell content, or ``None``.

    Returns:
        ``Decimal(value)``, or ``None`` for ``None`` / empty string.
    """
    if value in (None, ""):
        return None
    return Decimal(value)
def to_bool(value):
    """Convert a CSV cell to ``bool``, mapping blank cells to ``None``.

    Integer-like cells keep their original meaning (zero is false, anything
    else is true). In addition, the common textual spellings
    ``true/false``, ``t/f``, ``yes/no`` and ``y/n`` are accepted in any
    letter case, so files that do not use 0/1 flags can be loaded too.

    Args:
        value: Cell content, or ``None``.

    Returns:
        The boolean value, or ``None`` for ``None`` / empty string.

    Raises:
        ValueError: If the cell is neither integer-like nor a recognised
            textual boolean.
    """
    if value in (None, ""):
        return None
    try:
        return bool(int(value))
    except ValueError:
        token = str(value).strip().lower()
        if token in ("true", "t", "yes", "y"):
            return True
        if token in ("false", "f", "no", "n"):
            return False
        # Not a known spelling: surface the original conversion error.
        raise
def convert(row, column):
    """Return ``row[column]`` converted to the type its column calls for.

    Columns listed in ``INT_COLS``, ``NUM_COLS`` and ``BOOL_COLS`` go through
    ``to_int``, ``to_decimal`` and ``to_bool``. Every other column is passed
    through unchanged, except that an empty string becomes ``None``.

    Args:
        row: Mapping of column name to raw cell value.
        column: Name of the column to convert.

    Returns:
        The converted value, or ``None`` for a blank cell.
    """
    raw = row.get(column)
    typed_columns = (
        (INT_COLS, to_int),
        (NUM_COLS, to_decimal),
        (BOOL_COLS, to_bool),
    )
    for members, caster in typed_columns:
        if column in members:
            return caster(raw)
    return None if raw == "" else raw
def normalize_geocoded_row(row):
    """Project a geocoded CSV row onto ``ALL_COLS``.

    Args:
        row: Mapping of column name to cell value, as read from the CSV.

    Returns:
        A new dict holding exactly the ``ALL_COLS`` keys; columns missing
        from *row* are filled with an empty string.
    """
    normalized = {}
    for column in ALL_COLS:
        normalized[column] = row.get(column, "")
    return normalized
def normalize_im3_row(row):
    """Map one IM3 CSV row onto the target table's column layout.

    Only a handful of IM3 fields have a counterpart (operator, name, state,
    state_abb, sqft, lon, lat, type). The remaining columns are set to an
    empty string, and the provenance columns are filled with fixed IM3
    markers.

    Args:
        row: Mapping of IM3 column name to cell value.

    Returns:
        A dict keyed by the target column names.
    """
    field = row.get
    pairs = (
        ("id", field("id", "")),
        ("provider", field("operator", "")),
        ("facility_name", field("name", "")),
        ("url", ""),
        ("provider_url", ""),
        ("country", "United States"),
        ("state", field("state", "")),
        ("state_code", field("state_abb", "")),
        ("city", ""),
        ("postal_code", ""),
        ("street_address", ""),
        ("address", ""),
        ("source_address", ""),
        ("phone", ""),
        ("area_sqft", field("sqft", "")),
        ("power_mw", ""),
        ("nearest_airport_miles", ""),
        ("has_bare_metal", ""),
        ("has_iaas", ""),
        ("has_internet_exchange", ""),
        ("has_colocation", ""),
        ("certifications", ""),
        ("content_summary", ""),
        ("path", "IM3_Existing_DataCenters.csv"),
        ("longitude", field("lon", "")),
        ("latitude", field("lat", "")),
        ("geocode_source", "IM3_Existing_DataCenters"),
        # Fall back to a generic marker when the IM3 "type" cell is blank.
        ("geocode_precision", field("type", "") or "im3"),
        ("geocode_status", "im3_imported"),
        ("geocode_match_address", ""),
        ("census_status", ""),
        ("census_match_type", ""),
        ("census_input_address", ""),
        ("census_tiger_line_id", ""),
        ("census_side", ""),
        ("nominatim_display_name", ""),
        ("nominatim_osm_type", ""),
        ("nominatim_osm_id", ""),
    )
    return dict(pairs)
def read_and_normalize_rows(csv_path, source):
    """Read a CSV file and turn it into value tuples ready for insertion.

    Rows are normalized according to *source*, rows with a blank id are
    dropped, and rows sharing an id (compared after stripping whitespace)
    are collapsed so that the last occurrence wins.

    Args:
        csv_path: Path of the UTF-8 CSV file to read.
        source: ``"im3"`` for the IM3 layout; anything else selects the
            geocoded layout.

    Returns:
        ``(rows, values)`` where ``rows`` is the list of raw CSV dicts and
        ``values`` is a list of tuples ordered like ``ALL_COLS``.
    """
    with open(csv_path, newline="", encoding="utf-8") as csv_file:
        rows = list(csv.DictReader(csv_file))

    normalizer = normalize_im3_row if source == "im3" else normalize_geocoded_row

    by_id = {}
    for record in map(normalizer, rows):
        key = (record.get("id") or "").strip()
        if key:
            by_id[key] = record

    values = [
        tuple(convert(record, column) for column in ALL_COLS)
        for record in by_id.values()
    ]
    return rows, values
def create_table(cur):
    """Create the target table named by ``TABLE``.

    The table has one column per ``ALL_COLS`` entry plus a stored generated
    ``geom`` point column built from longitude and latitude (SRID 4326).

    Args:
        cur: Open database cursor used to run the statement.
    """
    ddl = f"""
create table {TABLE} (
id text primary key,
provider text,
facility_name text,
url text,
provider_url text,
country text,
state text,
state_code text,
city text,
postal_code text,
street_address text,
address text,
source_address text,
phone text,
area_sqft integer,
power_mw numeric,
nearest_airport_miles numeric,
has_bare_metal boolean,
has_iaas boolean,
has_internet_exchange boolean,
has_colocation boolean,
certifications text,
content_summary text,
path text,
longitude double precision not null,
latitude double precision not null,
geocode_source text,
geocode_precision text,
geocode_status text,
geocode_match_address text,
census_status text,
census_match_type text,
census_input_address text,
census_tiger_line_id bigint,
census_side text,
nominatim_display_name text,
nominatim_osm_type text,
nominatim_osm_id bigint,
geom geometry(Point, 4326) generated always as
(ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
)
"""
    cur.execute(ddl)
def insert_values(cur, values, upsert):
    """Bulk-insert value tuples into ``TABLE``.

    Args:
        cur: Open database cursor.
        values: Sequence of tuples ordered like ``ALL_COLS``.
        upsert: When true, an id conflict overwrites every non-id column of
            the existing row; when false, the plain INSERT is used.
    """
    column_list = ", ".join(ALL_COLS)
    statement = f"insert into {TABLE} ({column_list}) values %s"

    if upsert:
        assignments = ", ".join(
            f"{col} = excluded.{col}" for col in ALL_COLS if col != "id"
        )
        statement = statement + f" on conflict (id) do update set {assignments}"

    execute_values(cur, statement, values, page_size=100)
def parse_args():
    """Parse the command line for the data-center CSV loader.

    Returns:
        The ``argparse.Namespace`` with ``source`` (``"geocoded"`` or
        ``"im3"``), ``csv_path`` (``None`` unless overridden) and the
        ``append`` / ``upsert`` switches.
    """
    parser = argparse.ArgumentParser(
        description="Load data-center CSV data into public.us_dc_sample_geocoded."
    )

    option_specs = (
        (
            "--source",
            {
                "choices": ["geocoded", "im3"],
                "default": "geocoded",
                "help": "Input schema type. Use 'im3' for new/IM3_Existing_DataCenters.csv.",
            },
        ),
        (
            "--csv-path",
            {"help": "Override input CSV path. If omitted, uses a source-specific default."},
        ),
        (
            "--append",
            {
                "action": "store_true",
                "help": "Append/upsert into an existing target table instead of creating a new one.",
            },
        ),
        (
            "--upsert",
            {
                "action": "store_true",
                "help": "On id conflicts, update the existing row. Recommended with --append.",
            },
        ),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    return parser.parse_args()
def main():
    """Load the selected CSV into ``TABLE`` and print a one-line report.

    The input is read and normalized before the database is touched. The
    table and its two indexes are created when the table does not exist. If
    it already exists, ``--append`` is required, otherwise a ``RuntimeError``
    is raised. All statements run in one transaction.
    """
    args = parse_args()

    if args.csv_path:
        csv_path = args.csv_path
    elif args.source == "im3":
        csv_path = IM3_CSV_PATH
    else:
        csv_path = CSV_PATH

    _raw_rows, values = read_and_normalize_rows(csv_path, args.source)

    conn = psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname=DB_NAME,
    )
    try:
        # "with conn" wraps the work in a transaction.
        with conn, conn.cursor() as cur:
            cur.execute("create extension if not exists postgis")
            cur.execute("select to_regclass(%s)", (TABLE,))
            (regclass,) = cur.fetchone()

            if regclass is None:
                create_table(cur)
                cur.execute(
                    f"create index us_dc_sample_geocoded_geom_gix on {TABLE} using gist (geom)"
                )
                cur.execute(
                    f"create index us_dc_sample_geocoded_state_city_idx on {TABLE} (state_code, city)"
                )
            elif not args.append:
                raise RuntimeError(
                    f"Target table {TABLE} already exists; use --append to add data."
                )

            insert_values(cur, values, upsert=args.upsert)
            cur.execute(f"analyze {TABLE}")
    finally:
        conn.close()

    if args.source == "im3":
        source_label = "IM3-adapted"
    else:
        source_label = "geocoded"
    mode = "append" if args.append else "create"
    conflict_mode = "upsert" if args.upsert else "insert"
    print(
        f"loaded {len(values)} {source_label} rows into {TABLE} "
        f"(mode={mode}, conflict={conflict_mode}, csv={csv_path})"
    )


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,428 @@
#!/usr/bin/env python3
"""Load internet_cables/*.json into PostGIS.
Reads:
- internet_cables/all_cables.json -> public.internet_cables (+ landing points)
- internet_cables/city_dominance_2026.json -> public.internet_city_dominance
- internet_cables/year-summaries.json -> public.internet_cable_year_summaries
- internet_cables/meta.json -> public.internet_cable_meta
Requires env vars: PGWEB_HOST, PGWEB_PORT, PGWEB_USER, PGWEB_PASSWORD.
"""
import argparse
import json
import os
import re
from decimal import Decimal
import psycopg2
from psycopg2.extras import Json, execute_values
# Default directory holding the JSON inputs (relative to the working directory).
DATA_DIR = "internet_cables"
DB_NAME = "data_centers"
# Schema-qualified names of the tables this script creates and fills.
CABLES_TABLE = "public.internet_cables"
LANDINGS_TABLE = "public.internet_cable_landing_points"
CITY_TABLE = "public.internet_city_dominance"
YEAR_TABLE = "public.internet_cable_year_summaries"
META_TABLE = "public.internet_cable_meta"
# Captures the run of digits, commas and dots that precedes "km"
# (case-insensitive), e.g. the "1,234.5" in "1,234.5 km".
LENGTH_KM_RE = re.compile(r"([\d,\.]+)\s*km", re.IGNORECASE)
def parse_length_km(raw):
    """Extract a kilometre figure from a free-text length string.

    Args:
        raw: Text such as ``"1,234 km"``; may be ``None`` or empty.

    Returns:
        The number preceding ``km`` as a ``Decimal`` (thousands separators
        removed), or ``None`` when *raw* is blank, contains no match, or the
        matched text is not a valid number.
    """
    found = LENGTH_KM_RE.search(raw) if raw else None
    if found is None:
        return None

    digits = found.group(1).replace(",", "")
    try:
        return Decimal(digits)
    except Exception:
        return None
def to_int(value):
    """Convert *value* to ``int``, returning ``None`` when that is impossible.

    Args:
        value: Any value; ``None`` and the empty string count as missing.

    Returns:
        ``int(value)``, or ``None`` for missing or non-convertible input.
    """
    if value is None or value == "":
        return None
    try:
        converted = int(value)
    except (TypeError, ValueError):
        return None
    return converted
def to_bool(value):
    """Return the truthiness of *value*, keeping ``None`` as ``None``.

    Args:
        value: Any value, or ``None``.

    Returns:
        ``None`` when *value* is ``None``, otherwise ``bool(value)``.
    """
    return None if value is None else bool(value)
def linestring_to_wkt(coords):
    """Format a sequence of positions as a parenthesised WKT coordinate list.

    Only the first two ordinates (longitude, latitude) of each position are
    written. GeoJSON positions may carry an optional third element
    (altitude); it is ignored rather than causing an unpacking error, since
    the result is meant for a two-dimensional geometry.

    Args:
        coords: Sequence of positions, each indexable with at least two
            elements.

    Returns:
        Text of the form ``"(lon lat, lon lat, ...)"``.
    """
    return "(" + ", ".join(f"{position[0]} {position[1]}" for position in coords) + ")"
def feature_to_multilinestring_wkt(geometry):
    """Render a GeoJSON line geometry as ``MULTILINESTRING`` WKT.

    A ``LineString`` is treated as a multi-line with a single part. Parts
    with fewer than two positions are dropped.

    Args:
        geometry: GeoJSON geometry mapping with ``type`` and ``coordinates``.

    Returns:
        The WKT text, or ``None`` when the geometry type is neither
        ``LineString`` nor ``MultiLineString`` or no usable part remains.
    """
    kind = geometry.get("type")
    coordinates = geometry.get("coordinates") or []

    if kind == "LineString":
        candidate_lines = [coordinates]
    elif kind == "MultiLineString":
        candidate_lines = coordinates
    else:
        return None

    segments = [linestring_to_wkt(line) for line in candidate_lines if len(line) >= 2]
    if not segments:
        return None
    return "MULTILINESTRING(" + ", ".join(segments) + ")"
def create_cable_tables(cur):
    """Create the cables table, the landing-points table and their indexes.

    The landing-points table references the cables table with
    ``on delete cascade``, so the cables table is created first.

    Args:
        cur: Open database cursor used to run the statements.
    """
    statements = (
        f"""
create table {CABLES_TABLE} (
feature_id text primary key,
cable_id text,
name text,
color text,
owners text,
rfs_year integer,
decommission_year integer,
length_raw text,
length_km numeric,
cable_type text,
url text,
extra_urls jsonb,
properties jsonb,
geom geometry(MultiLineString, 4326)
)
""",
        f"create index internet_cables_geom_gix on {CABLES_TABLE} using gist (geom)",
        f"create index internet_cables_cable_id_idx on {CABLES_TABLE} (cable_id)",
        f"create index internet_cables_rfs_year_idx on {CABLES_TABLE} (rfs_year)",
        f"""
create table {LANDINGS_TABLE} (
feature_id text references {CABLES_TABLE}(feature_id) on delete cascade,
ordinal integer,
landing_id text,
name text,
country text,
is_tbd boolean,
primary key (feature_id, ordinal)
)
""",
        f"create index internet_cable_landings_landing_id_idx on {LANDINGS_TABLE} (landing_id)",
        f"create index internet_cable_landings_country_idx on {LANDINGS_TABLE} (country)",
    )
    for statement in statements:
        cur.execute(statement)
def create_city_table(cur):
    """Create the city-dominance table and its two indexes.

    The table carries a stored generated ``geom`` point column built from
    longitude and latitude (SRID 4326).

    Args:
        cur: Open database cursor used to run the statements.
    """
    table_ddl = f"""
create table {CITY_TABLE} (
id text primary key,
city text,
country text,
country_name text,
region text,
status text,
physical_capacity_tbps numeric,
added_physical_capacity_tbps numeric,
logical_dominance_ips bigint,
top_asns jsonb,
longitude double precision,
latitude double precision,
geom geometry(Point, 4326) generated always as
(ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
)
"""
    geom_index = f"create index internet_city_dominance_geom_gix on {CITY_TABLE} using gist (geom)"
    country_index = f"create index internet_city_dominance_country_idx on {CITY_TABLE} (country)"

    for statement in (table_ddl, geom_index, country_index):
        cur.execute(statement)
def create_year_table(cur):
    """Create the year-summaries table (one description per year).

    Args:
        cur: Open database cursor used to run the statement.
    """
    ddl = f"""
create table {YEAR_TABLE} (
year integer primary key,
description text
)
"""
    cur.execute(ddl)
def create_meta_table(cur):
    """Create the key/value metadata table.

    Args:
        cur: Open database cursor used to run the statement.
    """
    ddl = f"""
create table {META_TABLE} (
key text primary key,
value text
)
"""
    cur.execute(ddl)
def load_cables(cur, path):
    """Load cable features and their landing points from a JSON file.

    Each feature becomes one row in the cables table, keyed by a feature id
    that is made unique within the file. Its ``landing_points`` become rows
    in the landing-points table, numbered from zero in file order.

    Args:
        cur: Open database cursor.
        path: Path of the UTF-8 JSON file containing the features.

    Returns:
        ``(cable_count, landing_count)``: the numbers of rows prepared for
        the two tables.
    """
    with open(path, encoding="utf-8") as fh:
        features = json.load(fh)

    cables = []
    landings = []
    taken_ids = set()

    for position, feature in enumerate(features):
        props = feature.get("properties") or {}

        # Prefer an explicit feature id, then the cable id, then a positional one.
        base_id = props.get("feature_id") or props.get("id") or f"legacy-{position}"

        # Disambiguate any residual collisions
        feature_id = base_id
        attempt = 1
        while feature_id in taken_ids:
            feature_id = f"{base_id}-{attempt}"
            attempt += 1
        taken_ids.add(feature_id)

        # length may also live in a top-level lengthKm field on legacy entries
        length_raw = props.get("length")
        length_km = parse_length_km(length_raw)
        legacy_length = feature.get("lengthKm")
        if length_km is None and legacy_length is not None:
            try:
                length_km = Decimal(str(legacy_length))
            except Exception:
                pass

        geometry_wkt = feature_to_multilinestring_wkt(feature.get("geometry") or {})

        cables.append(
            (
                feature_id,
                props.get("id"),
                props.get("name"),
                props.get("color"),
                props.get("owners"),
                to_int(props.get("rfs_year")),
                to_int(props.get("decommission_year")),
                length_raw,
                length_km,
                props.get("type"),
                props.get("url"),
                Json(props.get("extraUrls") or []),
                Json(props),
                geometry_wkt,
            )
        )

        landings.extend(
            (
                feature_id,
                ordinal,
                point.get("id") or None,
                point.get("name"),
                point.get("country"),
                to_bool(point.get("is_tbd")),
            )
            for ordinal, point in enumerate(props.get("landing_points") or [])
        )

    cable_insert = f"""
insert into {CABLES_TABLE} (
feature_id, cable_id, name, color, owners, rfs_year, decommission_year,
length_raw, length_km, cable_type, url, extra_urls, properties, geom
) values %s
"""
    # The last placeholder is the WKT text, converted to a geometry server-side.
    cable_template = (
        "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
        "ST_GeomFromText(%s, 4326))"
    )
    execute_values(cur, cable_insert, cables, template=cable_template, page_size=200)

    landing_insert = f"""
insert into {LANDINGS_TABLE} (feature_id, ordinal, landing_id, name, country, is_tbd)
values %s
"""
    execute_values(cur, landing_insert, landings, page_size=500)

    return len(cables), len(landings)
def load_city_dominance(cur, path):
    """Load city-dominance points from a JSON file.

    Items without an id are skipped. When an id occurs more than once, only
    the first item is kept.

    Args:
        cur: Open database cursor.
        path: Path of the UTF-8 JSON file containing the items.

    Returns:
        The number of rows prepared for insertion.
    """
    with open(path, encoding="utf-8") as fh:
        items = json.load(fh)

    scalar_fields = (
        "city",
        "country",
        "country_name",
        "region",
        "status",
        "physical_capacity_tbps",
        "added_physical_capacity_tbps",
        "logical_dominance_ips",
    )

    rows = []
    seen_ids = set()
    for item in items:
        item_id = item.get("id")
        if item_id and item_id not in seen_ids:
            seen_ids.add(item_id)

            # Pad with None so short or missing coordinate lists still unpack.
            point = item.get("coordinates") or [None, None]
            lon, lat = (point + [None, None])[:2]

            rows.append(
                (
                    item_id,
                    *(item.get(name) for name in scalar_fields),
                    Json(item.get("top_asns") or []),
                    lon,
                    lat,
                )
            )

    insert_sql = f"""
insert into {CITY_TABLE} (
id, city, country, country_name, region, status,
physical_capacity_tbps, added_physical_capacity_tbps,
logical_dominance_ips, top_asns, longitude, latitude
) values %s
"""
    execute_values(cur, insert_sql, rows, page_size=500)
    return len(rows)
def load_year_summaries(cur, path):
    """Load per-year descriptions from a JSON object keyed by year.

    Keys that cannot be converted to an integer are skipped. A value that is
    a mapping contributes its ``description`` entry; any other value is
    stored as its string form.

    Args:
        cur: Open database cursor.
        path: Path of the UTF-8 JSON file.

    Returns:
        The number of rows prepared for insertion.
    """
    with open(path, encoding="utf-8") as fh:
        summaries = json.load(fh)

    rows = []
    for raw_year, payload in summaries.items():
        year = to_int(raw_year)
        if year is not None:
            if isinstance(payload, dict):
                description = payload.get("description")
            else:
                description = str(payload)
            rows.append((year, description))

    insert_sql = f"insert into {YEAR_TABLE} (year, description) values %s"
    execute_values(cur, insert_sql, rows, page_size=200)
    return len(rows)
def load_meta(cur, path):
    """Load the metadata JSON object as stringified key/value rows.

    Args:
        cur: Open database cursor.
        path: Path of the UTF-8 JSON file.

    Returns:
        The number of rows prepared for insertion.
    """
    with open(path, encoding="utf-8") as fh:
        meta = json.load(fh)

    rows = []
    for key, value in meta.items():
        rows.append((str(key), str(value)))

    insert_sql = f"insert into {META_TABLE} (key, value) values %s"
    execute_values(cur, insert_sql, rows)
    return len(rows)
def parse_args():
    """Parse the command line for the internet-cables loader.

    Returns:
        The ``argparse.Namespace`` with ``data_dir`` (defaults to
        ``DATA_DIR``) and the ``replace`` switch.
    """
    parser = argparse.ArgumentParser(
        description="Load internet_cables/*.json into PostGIS."
    )
    data_dir_help = f"Directory containing the JSON files (default: {DATA_DIR})"
    parser.add_argument("--data-dir", default=DATA_DIR, help=data_dir_help)
    parser.add_argument(
        "--replace",
        action="store_true",
        help="Drop existing target tables before loading.",
    )
    return parser.parse_args()
def main():
    """Create the target tables, load all four JSON files and report counts.

    All four input files must exist, otherwise ``FileNotFoundError`` is
    raised before the database is contacted. Without ``--replace`` an
    existing target table aborts the run with a ``RuntimeError``. All
    statements run in one transaction.
    """
    args = parse_args()

    cables_path = os.path.join(args.data_dir, "all_cables.json")
    city_path = os.path.join(args.data_dir, "city_dominance_2026.json")
    year_path = os.path.join(args.data_dir, "year-summaries.json")
    meta_path = os.path.join(args.data_dir, "meta.json")

    for required in (cables_path, city_path, year_path, meta_path):
        if not os.path.exists(required):
            raise FileNotFoundError(required)

    conn = psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname=DB_NAME,
    )

    # The landing-points table is created together with the cables table,
    # so it has no entry of its own here.
    creators = (
        (CABLES_TABLE, create_cable_tables),
        (CITY_TABLE, create_city_table),
        (YEAR_TABLE, create_year_table),
        (META_TABLE, create_meta_table),
    )

    try:
        with conn, conn.cursor() as cur:
            cur.execute("create extension if not exists postgis")

            if args.replace:
                cur.execute(
                    f"drop table if exists {LANDINGS_TABLE}, {CABLES_TABLE}, "
                    f"{CITY_TABLE}, {YEAR_TABLE}, {META_TABLE} cascade"
                )

            for table, create in creators:
                cur.execute("select to_regclass(%s)", (table,))
                (regclass,) = cur.fetchone()
                if regclass is not None:
                    raise RuntimeError(
                        f"Target table {table} already exists; rerun with --replace to overwrite."
                    )
                create(cur)

            cable_count, landing_count = load_cables(cur, cables_path)
            city_count = load_city_dominance(cur, city_path)
            year_count = load_year_summaries(cur, year_path)
            meta_count = load_meta(cur, meta_path)

            for table in (CABLES_TABLE, LANDINGS_TABLE, CITY_TABLE, YEAR_TABLE, META_TABLE):
                cur.execute(f"analyze {table}")
    finally:
        conn.close()

    print(
        f"loaded {cable_count} cables, {landing_count} landing points, "
        f"{city_count} city-dominance points, {year_count} year summaries, "
        f"{meta_count} meta rows."
    )


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,376 @@
#!/usr/bin/env python3
"""
Fetch US data centers from OpenStreetMap (Overpass API) and load them into
public.osm_data_centers in the data_centers database. Also (re)creates a
unioned view public.data_centers_union combining OSM + curated rows from
public.us_dc_sample_geocoded.
Two Overpass passes are made because tagging is inconsistent:
1) telecom=data_center
2) building=data_center
Results are deduplicated by (osm_type, osm_id); the matched tag-pass is recorded
in matched_tags so we can see which query found each feature.
"""
import argparse
import json
import os
import sys
import time
from typing import Dict, List, Optional, Tuple
import psycopg2
import requests
from psycopg2.extras import Json, execute_values
# Overpass API endpoint that receives the queries.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
# Schema-qualified database objects this script works with.
TABLE = "public.osm_data_centers"
VIEW = "public.data_centers_union"
CURATED_TABLE = "public.us_dc_sample_geocoded"
DB_NAME = "data_centers"
# Tag passes: (key, value)
# One Overpass query is issued per pair, because tagging is inconsistent.
TAG_PASSES = [
    ("telecom", "data_center"),
    ("building", "data_center"),
]
def overpass_query(tag_key: str, tag_value: str, timeout: int = 180) -> str:
    """Build the Overpass QL query for one tag pass.

    The query selects nodes, ways and relations carrying
    ``tag_key=tag_value`` inside the area tagged ``ISO3166-1=US`` at admin
    level 2, and requests centre points plus tags in JSON output.

    Args:
        tag_key: OSM tag key to match.
        tag_value: OSM tag value to match.
        timeout: Value written into the query's ``[timeout:...]`` setting.

    Returns:
        The query text without leading or trailing whitespace.
    """
    selector = f'["{tag_key}"="{tag_value}"](area.us);'
    lines = [
        f"[out:json][timeout:{timeout}];",
        'area["ISO3166-1"="US"][admin_level=2]->.us;',
        "(",
        f"node{selector}",
        f"way{selector}",
        f"relation{selector}",
        ");",
        "out center tags;",
    ]
    return "\n".join(lines)
def fetch_pass(tag_key: str, tag_value: str, cache_path: Optional[str]) -> List[dict]:
    """Return the Overpass elements for one tag pass, using a file cache.

    When *cache_path* names an existing file, its JSON content is used and
    no request is made. Otherwise the query is posted to ``OVERPASS_URL``
    and, if *cache_path* is set, the response is written there.

    Args:
        tag_key: OSM tag key of the pass.
        tag_value: OSM tag value of the pass.
        cache_path: File used to cache the raw response, or ``None``/empty
            to disable caching.

    Returns:
        The ``elements`` list of the response payload (empty when absent).

    Raises:
        requests.HTTPError: If the response status is an HTTP error.
    """
    if not cache_path or not os.path.exists(cache_path):
        query = overpass_query(tag_key, tag_value)
        print(f" querying Overpass for {tag_key}={tag_value} ...")
        request_headers = {
            "User-Agent": "us-data-centers-inventory/1.0 (research; contact david@dadams.io)",
            "Accept": "application/json",
        }
        resp = requests.post(
            OVERPASS_URL,
            data={"data": query},
            headers=request_headers,
            timeout=240,
        )
        if resp.status_code != 200:
            # Show the start of the body before raising, to aid diagnosis.
            print(f" Overpass returned {resp.status_code}: {resp.text[:500]}")
            resp.raise_for_status()
        payload = resp.json()
        if cache_path:
            with open(cache_path, "w", encoding="utf-8") as fh:
                json.dump(payload, fh)
            print(f" cached to {cache_path}")
    else:
        print(f" using cached response: {cache_path}")
        with open(cache_path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)

    elements = payload.get("elements", [])
    print(f" pass returned {len(elements)} elements")
    return elements
def element_coords(elem: dict) -> Tuple[Optional[float], Optional[float]]:
    """Return ``(lon, lat)`` for an Overpass element.

    Elements of type ``"node"`` are read directly; every other element is
    read from its ``"center"`` member. A missing coordinate (or a missing or
    empty ``"center"``) yields ``None`` in that position.
    """
    if elem.get("type") == "node":
        holder = elem
    else:
        holder = elem.get("center") or {}
    return holder.get("lon"), holder.get("lat")
def normalize_element(elem: dict, matched_tag: str) -> Optional[dict]:
    """Flatten one Overpass element into a row dict for the OSM table.

    Args:
        elem: Raw element from the Overpass ``elements`` list.
        matched_tag: ``"key=value"`` label of the pass that returned it; it
            becomes the single entry of ``matched_tags``.

    Returns:
        A dict holding an ``"<type>/<id>"`` identifier, selected tag values,
        the full tag mapping and float coordinates; or ``None`` when the
        element has no usable coordinates, type or id.
    """
    lon, lat = element_coords(elem)
    if lon is None or lat is None:
        return None

    osm_type, osm_id = elem.get("type"), elem.get("id")
    if osm_type is None or osm_id is None:
        return None

    tags = elem.get("tags") or {}
    tag = tags.get

    # House number and street are joined when present; nothing at all -> None.
    address_parts = [
        piece for piece in (tag("addr:housenumber"), tag("addr:street")) if piece
    ]
    street_address = " ".join(address_parts) or None

    record = {
        "id": f"{osm_type}/{osm_id}",
        "osm_type": osm_type,
        "osm_id": int(osm_id),
    }
    record.update(
        {
            "name": tag("name"),
            "operator": tag("operator"),
            "operator_type": tag("operator:type"),
            "telecom": tag("telecom"),
            "building": tag("building"),
            "power": tag("power"),
            # The plain tag wins; the contact:* variant is the fallback.
            "website": tag("website") or tag("contact:website"),
            "phone": tag("phone") or tag("contact:phone"),
            "street_address": street_address,
            "city": tag("addr:city"),
            "state": tag("addr:state"),
            "postal_code": tag("addr:postcode"),
            "country": tag("addr:country") or "US",
            "matched_tags": [matched_tag],
            "tags": tags,
            "longitude": float(lon),
            "latitude": float(lat),
        }
    )
    return record
def merge_records(existing: Dict[str, dict], new_rows: List[dict]) -> None:
    """Fold ``new_rows`` into ``existing`` (keyed by row ``"id"``) in place.

    A row with an unseen id is stored as-is. For an id already present, the
    ``matched_tags`` lists are concatenated with duplicates dropped (first
    occurrence order kept), and every other field is copied over only when
    the stored value is ``None``/``""`` and the incoming one is not.
    """
    blank = (None, "")
    for incoming in new_rows:
        record_id = incoming["id"]
        stored = existing.get(record_id)
        if stored is not None:
            # dict.fromkeys de-duplicates while preserving order.
            combined = stored["matched_tags"] + incoming["matched_tags"]
            stored["matched_tags"] = list(dict.fromkeys(combined))
            for field, value in incoming.items():
                if field == "matched_tags":
                    continue
                # Existing non-blank values always win; only gaps are filled.
                if stored.get(field) in blank and value not in blank:
                    stored[field] = value
        else:
            existing[record_id] = incoming
# Column list used to build the INSERT statement in insert_values().
# row_to_tuple() must emit its values in exactly this order.
COLUMNS = [
"id",
"osm_type",
"osm_id",
"name",
"operator",
"operator_type",
"telecom",
"building",
"power",
"website",
"phone",
"street_address",
"city",
"state",
"postal_code",
"country",
"matched_tags",
"tags",
"longitude",
"latitude",
]
def row_to_tuple(row: dict) -> tuple:
    """Convert a normalized row dict into the value tuple for one INSERT row.

    ``id``, ``osm_type``, ``osm_id``, ``longitude`` and ``latitude`` are
    required (``KeyError`` if absent). The remaining fields default to
    ``None``, except ``matched_tags`` (empty list) and ``tags`` (empty dict,
    wrapped in ``Json``).
    """
    identity = (row["id"], row["osm_type"], row["osm_id"])
    optional_fields = (
        "name",
        "operator",
        "operator_type",
        "telecom",
        "building",
        "power",
        "website",
        "phone",
        "street_address",
        "city",
        "state",
        "postal_code",
        "country",
    )
    attributes = tuple(row.get(field) for field in optional_fields)
    trailer = (
        row.get("matched_tags", []),
        Json(row.get("tags", {})),
        row["longitude"],
        row["latitude"],
    )
    return identity + attributes + trailer
def create_table(cur) -> None:
    """Create ``TABLE`` and its three indexes using the given cursor.

    The table stores one row per OSM feature. ``geom`` is a stored generated
    column built from ``longitude``/``latitude`` with SRID 4326. Indexes are
    added on ``geom`` (GiST), ``state`` and ``tags`` (GIN).
    """
    # Doubled braces are literal braces inside the f-string.
    ddl = f"""
create table {TABLE} (
id text primary key,
osm_type text not null,
osm_id bigint not null,
name text,
operator text,
operator_type text,
telecom text,
building text,
power text,
website text,
phone text,
street_address text,
city text,
state text,
postal_code text,
country text,
matched_tags text[] not null default '{{}}',
tags jsonb not null default '{{}}'::jsonb,
longitude double precision not null,
latitude double precision not null,
ingested_at timestamptz not null default now(),
geom geometry(Point, 4326) generated always as
(ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
)
"""
    cur.execute(ddl)
    index_statements = (
        f"create index osm_data_centers_geom_gix on {TABLE} using gist (geom)",
        f"create index osm_data_centers_state_idx on {TABLE} (state)",
        f"create index osm_data_centers_tags_gin on {TABLE} using gin (tags)",
    )
    for statement in index_statements:
        cur.execute(statement)
def insert_values(cur, rows: List[dict], upsert: bool) -> None:
    """Bulk-insert ``rows`` into ``TABLE`` via ``execute_values``.

    Args:
        cur: Open database cursor.
        rows: Normalized row dicts; each is converted with ``row_to_tuple``.
        upsert: When true, an ``on conflict (id) do update`` clause is added
            that overwrites every non-id column and refreshes ``ingested_at``.
    """
    column_list = ", ".join(COLUMNS)
    statement = f"insert into {TABLE} ({column_list}) values %s"
    if upsert:
        set_clause = ", ".join(
            f"{name} = excluded.{name}" for name in COLUMNS if name != "id"
        )
        statement = (
            f"{statement} on conflict (id) do update set {set_clause}, "
            f"ingested_at = now()"
        )
    values = [row_to_tuple(record) for record in rows]
    execute_values(cur, statement, values, page_size=200)
def create_or_replace_view(cur) -> None:
    """(Re)create ``VIEW`` as the ``union all`` of the curated and OSM tables.

    Both halves expose the same column list. Curated ids are prefixed with
    ``curated/`` and each row carries a ``source`` of ``'curated'`` or
    ``'osm'``.
    """
    view_sql = f"""
create or replace view {VIEW} as
select
'curated/' || id as id,
'curated'::text as source,
facility_name as name,
provider as operator,
street_address,
city,
state_code as state,
postal_code,
country,
url as website,
phone,
longitude,
latitude,
geom
from {CURATED_TABLE}
union all
select
id,
'osm'::text as source,
name,
operator,
street_address,
city,
state,
postal_code,
country,
website,
phone,
longitude,
latitude,
geom
from {TABLE}
"""
    cur.execute(view_sql)
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the OSM ingest script.

    Options:
        --cache-dir:  directory for raw Overpass responses (default ``output``).
        --no-cache:   neither read nor write cache files.
        --recreate:   drop and recreate the OSM table before loading.
        --upsert / --no-upsert: update existing rows on id conflict (default)
            or use a plain insert.
        --skip-view:  do not create/replace the unioned view.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--cache-dir",
        default="output",
        help="Directory to cache raw Overpass responses (default: output/).",
    )
    parser.add_argument(
        "--no-cache",
        action="store_true",
        help="Do not read or write Overpass cache files; always hit the API.",
    )
    parser.add_argument(
        "--recreate",
        action="store_true",
        help=f"Drop and recreate {TABLE} before loading.",
    )
    # --upsert is kept for backward compatibility; because it defaults to on,
    # it needs the --no-upsert counterpart below to ever be switched off.
    parser.add_argument(
        "--upsert",
        dest="upsert",
        action="store_true",
        default=True,
        help="On id conflicts, update the existing row (default: on).",
    )
    parser.add_argument(
        "--no-upsert",
        dest="upsert",
        action="store_false",
        help="Use a plain insert; a conflicting id makes the load fail.",
    )
    parser.add_argument(
        "--skip-view",
        action="store_true",
        help=f"Do not create/replace the unioned view {VIEW}.",
    )
    return parser.parse_args()
def main() -> int:
    """Fetch OSM data centers from Overpass and load them into PostGIS.

    Steps: run one Overpass pass per entry in ``TAG_PASSES``, normalize and
    de-duplicate the elements, then (inside a single transaction) ensure the
    table exists, insert the rows, analyze the table and optionally rebuild
    the unioned view.

    Returns:
        ``0`` on success, ``1`` when no rows were fetched (the database is
        not touched in that case).

    Raises:
        KeyError: if one of the ``PGWEB_*`` environment variables is unset.
    """
    args = parse_args()
    # Only create the cache directory when caching is actually in use.
    if not args.no_cache:
        os.makedirs(args.cache_dir, exist_ok=True)
    merged: Dict[str, dict] = {}
    last_pass = len(TAG_PASSES) - 1
    for pass_index, (tag_key, tag_value) in enumerate(TAG_PASSES):
        cache_path = (
            None
            if args.no_cache
            else os.path.join(args.cache_dir, f"overpass_{tag_key}_{tag_value}.json")
        )
        print(f"Pass: {tag_key}={tag_value}")
        elements = fetch_pass(tag_key, tag_value, cache_path)
        normalized = [
            row for row in (normalize_element(e, f"{tag_key}={tag_value}") for e in elements)
            if row is not None
        ]
        print(f" normalized {len(normalized)} rows with coords")
        merge_records(merged, normalized)
        if pass_index < last_pass:
            # be polite to Overpass between passes (no need after the last one)
            time.sleep(2)
    rows = list(merged.values())
    print(f"Total deduped OSM data-center features: {len(rows)}")
    if not rows:
        print("No rows fetched; aborting DB load.", file=sys.stderr)
        return 1
    conn = psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname=DB_NAME,
    )
    try:
        # "with conn" wraps everything below in one transaction.
        with conn:
            with conn.cursor() as cur:
                cur.execute("create extension if not exists postgis")
                if args.recreate:
                    cur.execute(f"drop table if exists {TABLE} cascade")
                # to_regclass() yields NULL when the relation does not exist.
                cur.execute("select to_regclass(%s)", (TABLE,))
                if cur.fetchone()[0] is None:
                    create_table(cur)
                insert_values(cur, rows, upsert=args.upsert)
                cur.execute(f"analyze {TABLE}")
                if not args.skip_view:
                    # The view selects from the curated table, so skip it if absent.
                    cur.execute("select to_regclass(%s)", (CURATED_TABLE,))
                    if cur.fetchone()[0] is not None:
                        create_or_replace_view(cur)
                        print(f"View {VIEW} (re)created.")
                    else:
                        print(
                            f"Skipping view: {CURATED_TABLE} does not exist.",
                            file=sys.stderr,
                        )
                cur.execute(f"select count(*) from {TABLE}")
                total = cur.fetchone()[0]
    finally:
        conn.close()
    print(f"Loaded {len(rows)} rows into {TABLE}; table now has {total} rows total.")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,245 @@
#!/usr/bin/env python3
import argparse
import json
import os
from collections import Counter
import psycopg2
# Database name passed to psycopg2.connect() in connect().
DB_NAME = "data_centers"
# Table that load_points() reads the map points from.
POINT_TABLE = "public.master_data_centers"
def connect():
    """Open a psycopg2 connection to ``DB_NAME``.

    Host, port, user and password come from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables; a missing one raises ``KeyError``.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": DB_NAME,
    }
    return psycopg2.connect(**settings)
def load_points(conn):
    """Load every row of ``POINT_TABLE`` that has both coordinates.

    NULL text columns are turned into ``''`` by the query itself.

    Returns:
        A list of dicts with keys ``id``, ``source``, ``operator``, ``name``,
        ``city``, ``state``, ``lon``, ``lat``, ``curated_id``, ``osm_id``,
        ``match_method`` and ``geoid``; ``lon``/``lat`` are floats.
    """
    query = f"""
select
master_id,
source,
coalesce(operator, '') as operator,
coalesce(name, '') as name,
coalesce(city, '') as city,
coalesce(state, '') as state,
longitude,
latitude,
coalesce(curated_id, '') as curated_id,
coalesce(osm_id, '') as osm_id,
coalesce(match_method, '') as match_method,
coalesce(geoid, '') as geoid
from {POINT_TABLE}
where longitude is not null and latitude is not null
"""
    with conn.cursor() as cur:
        cur.execute(query)
        records = cur.fetchall()
    return [
        {
            "id": rec[0],
            "source": rec[1],
            "operator": rec[2],
            "name": rec[3],
            "city": rec[4],
            "state": rec[5],
            "lon": float(rec[6]),
            "lat": float(rec[7]),
            "curated_id": rec[8],
            "osm_id": rec[9],
            "match_method": rec[10],
            "geoid": rec[11],
        }
        for rec in records
    ]
def compute_center(points):
    """Return ``(lat, lon)`` as the arithmetic mean of the points.

    With no points, the fixed fallback ``(39.5, -98.35)`` is returned.
    """
    if points:
        count = len(points)
        mean_lat = sum(point["lat"] for point in points) / count
        mean_lon = sum(point["lon"] for point in points) / count
        return mean_lat, mean_lon
    return 39.5, -98.35
def build_stats(points):
    """Summarize the points for the side panel.

    Returns:
        ``{"total": <count>, "by_source": {...}, "by_match_method": {...}}``
        where both breakdowns map a label to its count, ordered by label.
        Falsy sources are counted as ``"(blank)"`` and falsy match methods as
        ``"(none)"``.
    """
    source_counts = Counter(point["source"] or "(blank)" for point in points)
    method_counts = Counter(point["match_method"] or "(none)" for point in points)

    def ordered(counts):
        # Plain dict with keys in sorted order, so the JSON output is stable.
        return {label: counts[label] for label in sorted(counts)}

    summary = {"total": len(points)}
    summary["by_source"] = ordered(source_counts)
    summary["by_match_method"] = ordered(method_counts)
    return summary
def _script_safe_json(value):
    """Serialize ``value`` to JSON that can be inlined inside a <script> element."""
    # "<", ">" and "&" only ever occur inside JSON string values, so replacing
    # them with \uXXXX escapes keeps the data identical while making it
    # impossible for a value such as "</script>" to terminate the script block.
    return (
        json.dumps(value)
        .replace("<", "\\u003c")
        .replace(">", "\\u003e")
        .replace("&", "\\u0026")
    )


def render_html(points, center_lat, center_lon, output_path):
    """Write a self-contained Leaflet map of ``points`` to ``output_path``.

    Args:
        points: Point dicts as produced by ``load_points``; they are inlined
            into the page as a JavaScript array.
        center_lat: Initial map centre latitude.
        center_lon: Initial map centre longitude.
        output_path: File to write (UTF-8); an existing file is overwritten.

    The page also receives the summary from ``build_stats``. Both payloads go
    through ``_script_safe_json`` because point fields (names, operators, ...)
    are free text and must not be able to break out of the <script> element.
    """
    stats = build_stats(points)
    points_json = _script_safe_json(points)
    stats_json = _script_safe_json(stats)
    # Doubled braces below are literal braces in the f-string (CSS / JS).
    html = f"""<!doctype html>
<html lang=\"en\">
<head>
<meta charset=\"utf-8\" />
<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
<title>US Data Centers Master Map</title>
<link rel=\"stylesheet\" href=\"https://unpkg.com/leaflet@1.9.4/dist/leaflet.css\" />
<style>
html, body {{ height: 100%; margin: 0; font-family: system-ui, -apple-system, Segoe UI, sans-serif; }}
#layout {{ display: grid; grid-template-columns: 320px 1fr; height: 100%; }}
#panel {{ padding: 14px; border-right: 1px solid #ddd; overflow: auto; background: #f8fafb; }}
#map {{ height: 100%; width: 100%; }}
h1 {{ margin: 0 0 8px; font-size: 18px; }}
h2 {{ margin: 16px 0 8px; font-size: 14px; }}
.stat-row {{ display: flex; justify-content: space-between; padding: 2px 0; font-size: 13px; }}
.dot {{ width: 10px; height: 10px; border-radius: 50%; display: inline-block; margin-right: 8px; }}
@media (max-width: 900px) {{
#layout {{ grid-template-columns: 1fr; grid-template-rows: 220px 1fr; }}
#panel {{ border-right: 0; border-bottom: 1px solid #ddd; }}
}}
</style>
</head>
<body>
<div id=\"layout\">
<div id=\"panel\">
<h1>US Data Centers (Master)</h1>
<div class=\"stat-row\"><span>Total points</span><strong id=\"total\"></strong></div>
<h2>Source</h2>
<div id=\"sourceStats\"></div>
<h2>Match Method (merged rows)</h2>
<div id=\"matchStats\"></div>
<h2>Source Colors</h2>
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#2ca02c\"></span>merged (curated + OSM)</span></div>
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#1f77b4\"></span>curated only</span></div>
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#ff7f0e\"></span>osm only</span></div>
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#7f7f7f\"></span>other</span></div>
</div>
<div id=\"map\"></div>
</div>
<script src=\"https://unpkg.com/leaflet@1.9.4/dist/leaflet.js\"></script>
<script>
const points = {points_json};
const stats = {stats_json};
function colorForSource(source) {{
if (source === 'merged') return '#2ca02c';
if (source === 'curated') return '#1f77b4';
if (source === 'osm') return '#ff7f0e';
return '#7f7f7f';
}}
function escapeHtml(value) {{
return String(value || '')
.replaceAll('&', '&amp;')
.replaceAll('<', '&lt;')
.replaceAll('>', '&gt;')
.replaceAll('"', '&quot;')
.replaceAll("'", '&#39;');
}}
const map = L.map('map', {{ preferCanvas: true }}).setView([{center_lat}, {center_lon}], 5);
L.tileLayer('https://tile.openstreetmap.org/{{z}}/{{x}}/{{y}}.png', {{
maxZoom: 19,
attribution: '&copy; OpenStreetMap contributors'
}}).addTo(map);
const bounds = [];
for (const p of points) {{
const marker = L.circleMarker([p.lat, p.lon], {{
radius: 4,
color: colorForSource(p.source),
fillColor: colorForSource(p.source),
fillOpacity: 0.7,
weight: 1
}});
const title = p.name || p.id;
const operator = p.operator || '(unknown operator)';
const cityState = [p.city, p.state].filter(Boolean).join(', ');
const provenance = [
p.curated_id ? 'curated_id=' + escapeHtml(p.curated_id) : null,
p.osm_id ? 'osm_id=' + escapeHtml(p.osm_id) : null,
p.match_method ? 'match=' + escapeHtml(p.match_method) : null,
].filter(Boolean).join('<br>');
marker.bindPopup(`
<strong>${{escapeHtml(title)}}</strong><br>
Operator: ${{escapeHtml(operator)}}<br>
Location: ${{escapeHtml(cityState)}}<br>
Source: ${{escapeHtml(p.source)}}<br>
${{provenance ? provenance + '<br>' : ''}}
GEOID: ${{escapeHtml(p.geoid)}}
`);
marker.addTo(map);
bounds.push([p.lat, p.lon]);
}}
if (bounds.length > 0) {{
map.fitBounds(bounds, {{ padding: [20, 20] }});
}}
document.getElementById('total').textContent = stats.total;
const sourceStats = document.getElementById('sourceStats');
for (const [k, v] of Object.entries(stats.by_source)) {{
const div = document.createElement('div');
div.className = 'stat-row';
div.innerHTML = `<span>${{escapeHtml(k)}}</span><strong>${{v}}</strong>`;
sourceStats.appendChild(div);
}}
const matchStats = document.getElementById('matchStats');
for (const [k, v] of Object.entries(stats.by_match_method)) {{
const div = document.createElement('div');
div.className = 'stat-row';
div.innerHTML = `<span>${{escapeHtml(k)}}</span><strong>${{v}}</strong>`;
matchStats.appendChild(div);
}}
</script>
</body>
</html>
"""
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html)
def parse_args():
    """Parse the command line; the only option is ``--output`` (HTML path)."""
    cli = argparse.ArgumentParser(
        description="Generate an interactive HTML map from the PostGIS point table."
    )
    output_option = {
        "default": "data_center_map.html",
        "help": "Output HTML path (default: data_center_map.html)",
    }
    cli.add_argument("--output", **output_option)
    return cli.parse_args()
def main():
    """Load the points from the database and write the HTML map.

    The connection is closed as soon as the points are loaded, before any
    rendering happens.
    """
    options = parse_args()
    connection = connect()
    try:
        points = load_points(connection)
    finally:
        connection.close()
    center = compute_center(points)
    target = options.output
    render_html(points, center[0], center[1], target)
    print(f"wrote {len(points)} points to {target}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,338 @@
#!/usr/bin/env python3
"""Render a Leaflet HTML map combining US data centers, submarine cables,
and city-level network-dominance points from PostGIS.
"""
import argparse
import json
import os
import psycopg2
# Database name passed to psycopg2.connect() in connect().
DB_NAME = "data_centers"
# Source table for the data-center point layer (load_data_centers).
DC_TABLE = "public.master_data_centers"
# Source table for the cable line layer (load_cables).
CABLES_TABLE = "public.internet_cables"
# Source table for the city-dominance point layer (load_cities).
CITY_TABLE = "public.internet_city_dominance"
def connect():
    """Open a psycopg2 connection to ``DB_NAME``.

    Connection details are read from the ``PGWEB_HOST``, ``PGWEB_PORT``,
    ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment variables; an unset
    variable raises ``KeyError``.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": DB_NAME,
    }
    return psycopg2.connect(**settings)
def load_data_centers(conn):
    """Load data-center points that have both coordinates from ``DC_TABLE``.

    Returns:
        A list of dicts with keys ``id``, ``source``, ``operator``, ``name``,
        ``city``, ``state``, ``lon`` and ``lat``. NULL text columns arrive as
        ``''`` (coalesced in SQL); ``lon``/``lat`` are floats.
    """
    query = f"""
select
master_id,
source,
coalesce(operator, ''),
coalesce(name, ''),
coalesce(city, ''),
coalesce(state, ''),
longitude,
latitude
from {DC_TABLE}
where longitude is not null and latitude is not null
"""
    with conn.cursor() as cur:
        cur.execute(query)
        points = []
        for rec in cur.fetchall():
            points.append(
                {
                    "id": rec[0],
                    "source": rec[1],
                    "operator": rec[2],
                    "name": rec[3],
                    "city": rec[4],
                    "state": rec[5],
                    "lon": float(rec[6]),
                    "lat": float(rec[7]),
                }
            )
        return points
def load_cables(conn):
    """Load every cable with a geometry from ``CABLES_TABLE`` as GeoJSON.

    Returns:
        A GeoJSON ``FeatureCollection`` dict. Each feature's geometry is the
        parsed ``ST_AsGeoJSON(geom)`` value and its properties are
        ``feature_id``, ``cable_id``, ``name``, ``color`` (``'#888888'`` when
        NULL), ``owners``, ``rfs_year``, ``decommission_year``, ``length_km``
        (float or ``None``) and ``url``.
    """
    query = f"""
select
feature_id,
coalesce(cable_id, ''),
coalesce(name, ''),
coalesce(color, '#888888'),
coalesce(owners, ''),
rfs_year,
decommission_year,
length_km,
coalesce(url, ''),
ST_AsGeoJSON(geom)
from {CABLES_TABLE}
where geom is not null
"""

    def to_feature(rec):
        # Column positions follow the select list above.
        length = rec[7]
        return {
            "type": "Feature",
            "geometry": json.loads(rec[9]),
            "properties": {
                "feature_id": rec[0],
                "cable_id": rec[1],
                "name": rec[2],
                "color": rec[3],
                "owners": rec[4],
                "rfs_year": rec[5],
                "decommission_year": rec[6],
                "length_km": None if length is None else float(length),
                "url": rec[8],
            },
        }

    with conn.cursor() as cur:
        cur.execute(query)
        features = [to_feature(rec) for rec in cur.fetchall()]
    return {"type": "FeatureCollection", "features": features}
def load_cities(conn, us_only=False):
    """Load city-dominance points that have a geometry from ``CITY_TABLE``.

    Args:
        conn: Open database connection.
        us_only: When true, only rows with ``country = 'US'`` are returned.

    Returns:
        A list of dicts with keys ``id``, ``city``, ``country``,
        ``country_name``, ``region``, ``tbps`` (float or ``None``), ``ips``
        (int or ``None``), ``lon`` and ``lat`` (floats).
    """
    conditions = ["geom is not null"]
    if us_only:
        conditions.append("country = 'US'")
    where = "where " + " and ".join(conditions)
    query = f"""
select
id,
coalesce(city, ''),
coalesce(country, ''),
coalesce(country_name, ''),
coalesce(region, ''),
physical_capacity_tbps,
logical_dominance_ips,
longitude,
latitude
from {CITY_TABLE}
{where}
"""
    with conn.cursor() as cur:
        cur.execute(query)
        cities = []
        for rec in cur.fetchall():
            capacity, ip_count = rec[5], rec[6]
            cities.append(
                {
                    "id": rec[0],
                    "city": rec[1],
                    "country": rec[2],
                    "country_name": rec[3],
                    "region": rec[4],
                    "tbps": None if capacity is None else float(capacity),
                    "ips": None if ip_count is None else int(ip_count),
                    "lon": float(rec[7]),
                    "lat": float(rec[8]),
                }
            )
        return cities
def _script_safe_json(value):
    """Serialize ``value`` to JSON that can be inlined inside a <script> element."""
    # "<", ">" and "&" only ever occur inside JSON string values, so replacing
    # them with \uXXXX escapes keeps the data identical while making it
    # impossible for a value such as "</script>" to terminate the script block.
    return (
        json.dumps(value)
        .replace("<", "\\u003c")
        .replace(">", "\\u003e")
        .replace("&", "\\u0026")
    )


def render_html(data_centers, cables_geojson, cities, output_path):
    """Write a self-contained Leaflet map with three toggleable layers.

    Args:
        data_centers: Point dicts as produced by ``load_data_centers``.
        cables_geojson: GeoJSON FeatureCollection as produced by ``load_cables``.
        cities: Point dicts as produced by ``load_cities``.
        output_path: File to write (UTF-8); an existing file is overwritten.

    The three inputs are serialized into one JSON object that replaces the
    ``__PAYLOAD__`` marker in the page template. The JSON is made script-safe
    first, because names, owners and URLs are free text and must not be able
    to break out of the <script> element.
    """
    payload = _script_safe_json(
        {
            "data_centers": data_centers,
            "cables": cables_geojson,
            "cities": cities,
        }
    )
    # Plain (non-f) string: the braces belong to CSS/JS, and the data is
    # substituted for the __PAYLOAD__ marker afterwards.
    html = """<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>US Data Centers + Submarine Cables</title>
<link rel="stylesheet" href="https://unpkg.com/leaflet@1.9.4/dist/leaflet.css" />
<style>
html, body { height: 100%; margin: 0; font-family: system-ui, -apple-system, Segoe UI, sans-serif; }
#layout { display: grid; grid-template-columns: 300px 1fr; height: 100%; }
#panel { padding: 14px; border-right: 1px solid #ddd; overflow: auto; background: #f8fafb; font-size: 13px; }
#map { height: 100%; width: 100%; }
h1 { margin: 0 0 8px; font-size: 18px; }
h2 { margin: 14px 0 6px; font-size: 13px; text-transform: uppercase; color: #555; letter-spacing: 0.04em; }
.row { display: flex; justify-content: space-between; padding: 2px 0; }
.swatch { width: 12px; height: 12px; display: inline-block; margin-right: 8px; vertical-align: middle; border: 1px solid #ccc; }
label.toggle { display: block; padding: 3px 0; cursor: pointer; }
@media (max-width: 900px) {
#layout { grid-template-columns: 1fr; grid-template-rows: 220px 1fr; }
#panel { border-right: 0; border-bottom: 1px solid #ddd; }
}
</style>
</head>
<body>
<div id="layout">
<div id="panel">
<h1>Data Centers + Cables</h1>
<div class="row"><span>Data centers</span><strong id="dcCount"></strong></div>
<div class="row"><span>Submarine cables</span><strong id="cableCount"></strong></div>
<div class="row"><span>City dominance pts</span><strong id="cityCount"></strong></div>
<h2>Layers</h2>
<label class="toggle"><input type="checkbox" id="tDc" checked> Data centers</label>
<label class="toggle"><input type="checkbox" id="tCables" checked> Submarine cables</label>
<label class="toggle"><input type="checkbox" id="tCities" checked> City dominance</label>
<h2>Data center source</h2>
<div class="row"><span><span class="swatch" style="background:#2ca02c"></span>merged (curated + OSM)</span></div>
<div class="row"><span><span class="swatch" style="background:#1f77b4"></span>curated only</span></div>
<div class="row"><span><span class="swatch" style="background:#ff7f0e"></span>osm only</span></div>
<div class="row"><span><span class="swatch" style="background:#7f7f7f"></span>other</span></div>
<h2>City dominance</h2>
<div class="row"><span><span class="swatch" style="background:#9b59b6;border-radius:50%"></span>Sized by physical Tbps</span></div>
</div>
<div id="map"></div>
</div>
<script src="https://unpkg.com/leaflet@1.9.4/dist/leaflet.js"></script>
<script>
const DATA = __PAYLOAD__;
function colorForSource(source) {
if (source === 'merged') return '#2ca02c';
if (source === 'curated') return '#1f77b4';
if (source === 'osm') return '#ff7f0e';
return '#7f7f7f';
}
function esc(v) {
return String(v == null ? '' : v)
.replaceAll('&','&amp;').replaceAll('<','&lt;').replaceAll('>','&gt;')
.replaceAll('"','&quot;').replaceAll("'", '&#39;');
}
const map = L.map('map', { preferCanvas: true, worldCopyJump: true }).setView([20, -40], 3);
L.tileLayer('https://tile.openstreetmap.org/{z}/{x}/{y}.png', {
maxZoom: 19,
attribution: '&copy; OpenStreetMap contributors'
}).addTo(map);
const cableLayer = L.geoJSON(DATA.cables, {
style: f => ({
color: f.properties.color || '#888',
weight: 1.4,
opacity: 0.75,
}),
onEachFeature: (feature, layer) => {
const p = feature.properties;
const yrs = [p.rfs_year, p.decommission_year].filter(Boolean).join(' ');
layer.bindPopup(`
<strong>${esc(p.name)}</strong><br>
${p.url ? `<a href="${esc(p.url)}" target="_blank" rel="noopener">${esc(p.url)}</a><br>` : ''}
Owners: ${esc(p.owners)}<br>
${yrs ? `Years: ${esc(yrs)}<br>` : ''}
${p.length_km ? `Length: ${esc(p.length_km.toLocaleString())} km<br>` : ''}
ID: ${esc(p.cable_id || p.feature_id)}
`);
},
}).addTo(map);
const cityLayer = L.layerGroup();
for (const c of DATA.cities) {
const tbps = c.tbps || 0;
const radius = Math.max(2, Math.min(18, Math.sqrt(tbps) * 1.6));
const m = L.circleMarker([c.lat, c.lon], {
radius,
color: '#6c2a86',
fillColor: '#9b59b6',
fillOpacity: 0.45,
weight: 0.8,
});
m.bindPopup(`
<strong>${esc(c.city)}</strong> (${esc(c.country)})<br>
Region: ${esc(c.region)}<br>
Physical capacity: ${esc(tbps.toFixed ? tbps.toFixed(2) : tbps)} Tbps<br>
Logical dominance IPs: ${esc(c.ips ? c.ips.toLocaleString() : '')}
`);
cityLayer.addLayer(m);
}
cityLayer.addTo(map);
const dcLayer = L.layerGroup();
const dcBounds = [];
for (const p of DATA.data_centers) {
const m = L.circleMarker([p.lat, p.lon], {
radius: 3,
color: colorForSource(p.source),
fillColor: colorForSource(p.source),
fillOpacity: 0.85,
weight: 0.8,
});
const title = p.name || p.id;
const operator = p.operator || '(unknown operator)';
const cityState = [p.city, p.state].filter(Boolean).join(', ');
m.bindPopup(`
<strong>${esc(title)}</strong><br>
Operator: ${esc(operator)}<br>
Location: ${esc(cityState)}<br>
Source: ${esc(p.source)}
`);
dcLayer.addLayer(m);
dcBounds.push([p.lat, p.lon]);
}
dcLayer.addTo(map);
if (dcBounds.length) map.fitBounds(dcBounds, { padding: [30, 30], maxZoom: 5 });
function toggle(layer, on) {
if (on) { if (!map.hasLayer(layer)) layer.addTo(map); }
else { if (map.hasLayer(layer)) map.removeLayer(layer); }
}
document.getElementById('tDc').addEventListener('change', e => toggle(dcLayer, e.target.checked));
document.getElementById('tCables').addEventListener('change', e => toggle(cableLayer, e.target.checked));
document.getElementById('tCities').addEventListener('change', e => toggle(cityLayer, e.target.checked));
document.getElementById('dcCount').textContent = DATA.data_centers.length.toLocaleString();
document.getElementById('cableCount').textContent = DATA.cables.features.length.toLocaleString();
document.getElementById('cityCount').textContent = DATA.cities.length.toLocaleString();
</script>
</body>
</html>
"""
    html = html.replace("__PAYLOAD__", payload)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html)
def parse_args():
    """Parse the command line.

    Options are ``--output`` (HTML path, default
    ``data_centers_cables_map.html``) and the ``--us-cities-only`` flag.
    """
    cli = argparse.ArgumentParser(
        description="Render a Leaflet map combining data centers, submarine cables, and city dominance."
    )
    cli.add_argument("--output", default="data_centers_cables_map.html")
    us_only_option = {
        "action": "store_true",
        "help": "Restrict the city-dominance layer to country='US'.",
    }
    cli.add_argument("--us-cities-only", **us_only_option)
    return cli.parse_args()
def main():
    """Load the three layers from the database and write the combined map.

    All queries run on a single connection, which is closed before the page
    is rendered. A one-line summary of the written counts is printed last.
    """
    options = parse_args()
    connection = connect()
    try:
        data_centers = load_data_centers(connection)
        cable_collection = load_cables(connection)
        city_points = load_cities(connection, us_only=options.us_cities_only)
    finally:
        connection.close()
    render_html(data_centers, cable_collection, city_points, options.output)
    summary = (
        f"wrote {len(data_centers)} data centers, "
        f"{len(cable_collection['features'])} cables, "
        f"{len(city_points)} city points -> {options.output}"
    )
    print(summary)


if __name__ == "__main__":
    main()