Reorganize project into scripts/, docs/, data/, output/ directories
Move all Python scripts to scripts/, documentation to docs/, raw input data to data/, and generated HTML/CSV outputs to output/. Update path references in 8 scripts to use Path(__file__).parent.parent as project root so they work correctly from the new location. Update README links and quick-start commands accordingly. Notebooks remain at root. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
236
scripts/analyze_cables_concentration.py
Normal file
236
scripts/analyze_cables_concentration.py
Normal file
@@ -0,0 +1,236 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Quick statistical analysis: are US data centers spatially tied to submarine
|
||||
cables, and does the resulting pattern look like concentrated costs / dispersed
|
||||
benefits?
|
||||
"""
|
||||
import math
|
||||
import os
|
||||
import statistics
|
||||
from collections import Counter
|
||||
|
||||
import psycopg2
|
||||
|
||||
|
||||
def connect():
    """Open a psycopg2 connection to the ``data_centers`` database.

    Host, port, user and password are read from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables; a missing variable raises ``KeyError``.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": "data_centers",
    }
    return psycopg2.connect(**settings)
|
||||
|
||||
|
||||
def quantiles(xs, qs=(0.1, 0.25, 0.5, 0.75, 0.9)):
    """Return ``{q: value}`` for each requested quantile of ``xs``.

    Values are linearly interpolated between the two closest order
    statistics of the sorted data. Every quantile maps to ``None`` when
    ``xs`` is empty.
    """
    ordered = sorted(xs)
    count = len(ordered)
    if count == 0:
        return {q: None for q in qs}

    result = {}
    for q in qs:
        # Fractional index of the quantile inside the sorted sample.
        pos = (count - 1) * q
        below = math.floor(pos)
        above = math.ceil(pos)
        if below == above:
            result[q] = ordered[int(pos)]
        else:
            result[q] = ordered[below] + (ordered[above] - ordered[below]) * (pos - below)
    return result
|
||||
|
||||
|
||||
def gini(values):
    """Standard Gini coefficient for non-negative values.

    ``None`` and negative entries are discarded first. Returns ``None`` when
    nothing is left or when the remaining values sum to zero.
    """
    kept = sorted(x for x in values if x is not None and x >= 0)
    count = len(kept)
    if count == 0:
        return None
    total = sum(kept)
    if total == 0:
        return None

    # Rank-weighted sum over the ascending values (ranks start at 1).
    weighted = 0.0
    for rank, value in enumerate(kept, 1):
        weighted += rank * value
    return (2 * weighted) / (count * total) - (count + 1) / count
|
||||
|
||||
|
||||
def hhi(shares):
    """Herfindahl-Hirschman Index: the sum of squared shares.

    For shares that sum to ~1 the result lies in [0, 1].
    """
    squared = (share * share for share in shares)
    return sum(squared)
|
||||
|
||||
|
||||
def mann_whitney_u_z(xs, ys):
    """Approximate Mann-Whitney U test (normal approximation, large-n).

    Ties receive average ranks, and the variance of U is corrected for ties:

        var(U) = n1*n2/12 * ((N + 1) - sum(t**3 - t) / (N * (N - 1)))

    where ``t`` runs over the sizes of the tie groups and ``N = n1 + n2``.
    Without ties this reduces to the usual ``n1*n2*(N + 1)/12``. No
    continuity correction is applied.

    Returns ``(U, z, p_two_sided)`` where ``U`` is the statistic of ``xs``.
    When the variance is zero (an empty sample, or every value tied) ``z`` is
    0.0 and ``p`` is 1.0.
    """
    combined = [(v, 0) for v in xs] + [(v, 1) for v in ys]
    combined.sort(key=lambda t: t[0])
    n = len(combined)
    ranks = [0.0] * n
    tie_term = 0  # sum of t**3 - t over tie groups (0 when nothing is tied)

    i = 0
    while i < n:
        # Extend j to the last element sharing the value at position i.
        j = i
        while j + 1 < n and combined[j + 1][0] == combined[i][0]:
            j += 1
        avg_rank = (i + j) / 2 + 1
        for k in range(i, j + 1):
            ranks[k] = avg_rank
        group = j - i + 1
        tie_term += group ** 3 - group
        i = j + 1

    r1 = sum(r for r, (_, g) in zip(ranks, combined) if g == 0)
    n1, n2 = len(xs), len(ys)
    U1 = r1 - n1 * (n1 + 1) / 2
    mu = n1 * n2 / 2

    if n > 1:
        variance = n1 * n2 / 12 * ((n + 1) - tie_term / (n * (n - 1)))
    else:
        variance = 0.0
    # max() guards against a tiny negative value from floating-point rounding.
    sigma = math.sqrt(max(variance, 0.0))
    z = (U1 - mu) / sigma if sigma else 0.0

    # Two-sided p-value from the complementary error function.
    p = math.erfc(abs(z) / math.sqrt(2))
    return U1, z, p
|
||||
|
||||
|
||||
def main():
    """Query the database and print the cable-proximity / concentration report.

    Steps:
      1. distance (km) from each US data center to the nearest submarine cable;
      2. the same distance for US city-dominance points;
      3. data-center counts per state (cost concentration);
      4. IP counts per US city (benefit-dispersion proxy).

    The cursor and connection are closed even when a query fails, and metrics
    that are undefined for an empty result set are printed as "n/a" instead
    of raising.
    """

    def fmt(value, spec):
        """Format ``value`` with ``spec``; an undefined metric (None) -> 'n/a'."""
        return "n/a" if value is None else format(value, spec)

    conn = connect()
    try:
        cur = conn.cursor()
        try:
            # --- 1. Distance from each US data center to nearest submarine cable ---
            cur.execute(
                """
                with cables_union as (
                    select ST_Union(geom)::geography as g from public.internet_cables
                )
                select ST_Distance(
                    ST_SetSRID(ST_MakePoint(dc.longitude, dc.latitude), 4326)::geography,
                    cu.g
                ) / 1000.0 -- meters -> km
                from public.us_dc_sample_geocoded dc, cables_union cu
                where dc.longitude is not null and dc.latitude is not null
                and (dc.country = 'United States' or dc.country is null)
                """
            )
            # A NULL distance (e.g. no cable geometry) is skipped rather than
            # crashing float().
            dc_km = [float(r[0]) for r in cur.fetchall() if r[0] is not None]

            # --- 2. Distance from US city-dominance points to nearest cable ---
            cur.execute(
                """
                with cables_union as (
                    select ST_Union(geom)::geography as g from public.internet_cables
                )
                select ST_Distance(
                    ST_SetSRID(ST_MakePoint(c.longitude, c.latitude), 4326)::geography,
                    cu.g
                ) / 1000.0
                from public.internet_city_dominance c, cables_union cu
                where c.country = 'US' and c.geom is not null
                """
            )
            city_km = [float(r[0]) for r in cur.fetchall() if r[0] is not None]

            # --- 3. DC distribution by state (cost concentration) ---
            # NOTE(review): unlike query 1 this one has no country filter --
            # confirm whether non-US rows can appear in this table.
            cur.execute(
                """
                select coalesce(nullif(state_code, ''), 'UNK') as st, count(*)
                from public.us_dc_sample_geocoded
                where longitude is not null and latitude is not null
                group by 1
                """
            )
            state_counts = dict(cur.fetchall())

            # --- 4. IP distribution across US cities (benefit dispersion proxy) ---
            cur.execute(
                """
                select city, coalesce(logical_dominance_ips, 0)
                from public.internet_city_dominance
                where country = 'US' and logical_dominance_ips is not null
                """
            )
            city_ips = [(r[0], int(r[1])) for r in cur.fetchall()]
        finally:
            cur.close()
    finally:
        conn.close()

    total_dc = sum(state_counts.values())
    state_shares = (
        {k: v / total_dc for k, v in state_counts.items()} if total_dc else {}
    )
    total_ips = sum(v for _, v in city_ips)
    ip_shares = [v / total_ips for _, v in city_ips] if total_ips else []

    # --- 5. Where do the people-with-IPs LIVE relative to the DCs? ---
    # Top-N US dominance cities, share of national IPs each captures.
    top_ip_cities = sorted(city_ips, key=lambda t: -t[1])[:10]

    # ======= report =======
    print("=" * 70)
    print("ARE US DATA CENTERS TIED TO SUBMARINE CABLE LANDINGS?")
    print("=" * 70)
    print(f"\nN data centers analyzed: {len(dc_km):,}")
    print(f"N US city-dominance pts: {len(city_km):,}")

    def fmt_q(label, xs):
        """Print mean, quantiles and within-threshold shares of distances."""
        print(f"\n{label}:")
        if not xs:
            # mean/quantiles are undefined for an empty sample.
            print(" (no rows returned)")
            return
        q = quantiles(xs)
        print(f" mean = {statistics.mean(xs):,.1f} km")
        print(f" median (p50) = {q[0.5]:,.1f} km")
        print(f" p10 / p25 / p75 / p90 = "
              f"{q[0.1]:,.1f} / {q[0.25]:,.1f} / {q[0.75]:,.1f} / {q[0.9]:,.1f} km")
        for thr in (10, 50, 100, 250):
            frac = sum(1 for x in xs if x <= thr) / len(xs)
            print(f" share within {thr:>3} km of a cable: {frac*100:5.1f}%")

    fmt_q("Distance: US DATA CENTERS -> nearest submarine cable", dc_km)
    fmt_q("Distance: US POPULATION CITIES -> nearest submarine cable", city_km)

    if dc_km and city_km:
        U, z, p = mann_whitney_u_z(dc_km, city_km)
        print(f"\nMann-Whitney U (DCs vs. cities, two-sided): U={U:,.0f}, z={z:.2f}, "
              f"p≈{p:.2e}")
        dc_median = statistics.median(dc_km)
        city_median = statistics.median(city_km)
        if dc_median < city_median:
            diff = city_median - dc_median
            print(f" -> DC median is {diff:,.1f} km CLOSER to cables than city median.")
        else:
            print(" -> DCs are not closer to cables than cities.")
    else:
        print("\nMann-Whitney U (DCs vs. cities, two-sided): n/a (a sample is empty)")

    print("\n" + "=" * 70)
    print("CONCENTRATION OF COSTS (data centers by state)")
    print("=" * 70)
    g_dc = gini(list(state_counts.values()))
    h_dc = hhi(list(state_shares.values())) if state_shares else None
    print(f"States covered: {len(state_counts)}")
    print(f"Gini of DC counts across states: {fmt(g_dc, '.3f')} (0=even, 1=one state takes all)")
    print(f"HHI of state shares: {fmt(h_dc, '.3f')} (0.18+ = highly concentrated)")
    top_states = sorted(state_shares.items(), key=lambda t: -t[1])[:8]
    cum = 0.0
    print("\nTop states by share of US data centers:")
    for st, s in top_states:
        cum += s
        print(f" {st}: {s*100:5.1f}% ({state_counts[st]:>4} DCs) cum={cum*100:5.1f}%")

    print("\n" + "=" * 70)
    print("DISPERSION OF BENEFITS (US IPs across cities)")
    print("=" * 70)
    g_ip = gini([v for _, v in city_ips])
    h_ip = hhi(ip_shares) if ip_shares else None
    print(f"US cities with IP data: {len(city_ips):,}")
    print(f"Gini of IPs across cities: {fmt(g_ip, '.3f')}")
    print(f"HHI of IP shares: {fmt(h_ip, '.3f')}")
    cum = 0.0
    print("\nTop US cities by share of national IPs:")
    for city, ips in top_ip_cities:
        # total_ips is 0 when every listed city has 0 IPs.
        s = ips / total_ips if total_ips else 0.0
        cum += s
        print(f" {city:<30} {s*100:5.2f}% ({ips:>11,} IPs) cum={cum*100:5.2f}%")

    print("\n" + "=" * 70)
    print("INTERPRETATION")
    print("=" * 70)
    # A lower Gini means a more even spread, so dispersed benefits show up as
    # an IP-share Gini well BELOW the DC-state Gini.
    print(f"""
Cost concentration (DCs across states): Gini={fmt(g_dc, '.3f')} HHI={fmt(h_dc, '.3f')}
Benefit dispersion (IPs across cities): Gini={fmt(g_ip, '.3f')} HHI={fmt(h_ip, '.3f')}

A "concentrated costs / dispersed benefits" pattern requires:
(1) DCs cluster in a few places (high state-level Gini/HHI).
(2) Users they serve span many places (low city-level Gini/HHI, ideally).
(3) That clustering is plausibly tied to fixed infrastructure (cables).

Check signs above:
- DC location vs cable proximity: see Mann-Whitney result.
- Cost concentration: compare DC HHI to a "competitive" benchmark (~0.10).
- Benefit dispersion: IP-share Gini << DC-state Gini would corroborate
the asymmetry (benefits more evenly distributed than costs).
""")


if __name__ == "__main__":
    main()
|
||||
320
scripts/analyze_dc_tract_concentration.py
Normal file
320
scripts/analyze_dc_tract_concentration.py
Normal file
@@ -0,0 +1,320 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tract-level analysis of the 'concentrated costs / dispersed benefits' frame
|
||||
for US data-center siting.
|
||||
|
||||
Cost-bearing universe = tracts that host at least one DC
|
||||
(public.data_center_census_tracts_2024)
|
||||
Comparison universe = ACS 2024 5-yr tracts in the selected states
|
||||
(census_tract_acs_2024_selected_states.csv)
|
||||
"""
|
||||
import csv
|
||||
import math
|
||||
import os
|
||||
import statistics
|
||||
from collections import Counter
|
||||
|
||||
import psycopg2
|
||||
|
||||
|
||||
# Name of the ACS comparison-universe CSV.
_CSV_NAME = "census_tract_acs_2024_selected_states.csv"
# This script lives in <project root>/scripts/, so the project root is two
# directory levels above this file.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
_DATA_CSV = os.path.join(_PROJECT_ROOT, "data", _CSV_NAME)
# Prefer <project root>/data/ so the script works from any working directory;
# fall back to the original cwd-relative name when that file is not there.
CSV_PATH = _DATA_CSV if os.path.exists(_DATA_CSV) else _CSV_NAME
|
||||
|
||||
|
||||
def connect():
    """Return a new psycopg2 connection to the ``data_centers`` database.

    Credentials come from the ``PGWEB_HOST``, ``PGWEB_PORT``, ``PGWEB_USER``
    and ``PGWEB_PASSWORD`` environment variables; a missing one raises
    ``KeyError``.
    """
    host = os.environ["PGWEB_HOST"]
    port = os.environ["PGWEB_PORT"]
    user = os.environ["PGWEB_USER"]
    password = os.environ["PGWEB_PASSWORD"]
    return psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname="data_centers",
    )
|
||||
|
||||
|
||||
def gini(values):
    """Gini coefficient of the non-negative entries of ``values``.

    ``None`` and negative entries are ignored. Returns ``None`` when no entry
    remains or the remaining entries sum to zero.
    """
    kept = sorted(x for x in values if x is not None and x >= 0)
    count = len(kept)
    if count == 0:
        return None
    total = sum(kept)
    if total == 0:
        return None
    # Rank-weighted sum over the ascending values (ranks start at 1).
    weighted = sum(rank * x for rank, x in zip(range(1, count + 1), kept))
    return (2 * weighted) / (count * total) - (count + 1) / count
|
||||
|
||||
|
||||
def hhi(shares):
    """Return the Herfindahl-Hirschman Index: the sum of squared shares."""
    squared = (share * share for share in shares)
    return sum(squared)
|
||||
|
||||
|
||||
def median(xs):
    """Median of the non-``None`` entries of ``xs``; ``None`` if there are none."""
    present = [x for x in xs if x is not None]
    if not present:
        return None
    return statistics.median(present)
|
||||
|
||||
|
||||
def mean(xs):
    """Arithmetic mean of the non-``None`` entries of ``xs``; ``None`` if empty."""
    present = [x for x in xs if x is not None]
    if not present:
        return None
    return statistics.mean(present)
|
||||
|
||||
|
||||
def wmean(xs, ws):
    """Weighted mean of ``xs`` using weights ``ws``.

    A pair is skipped when its value or weight is ``None`` or when the weight
    is not strictly positive. Returns ``None`` when no pair qualifies.
    """
    usable = []
    for value, weight in zip(xs, ws):
        if value is None or weight is None or not weight > 0:
            continue
        usable.append((value, weight))
    if not usable:
        return None
    weight_sum = sum(weight for _, weight in usable)
    return sum(value * weight for value, weight in usable) / weight_sum
|
||||
|
||||
|
||||
def to_float(s):
    """Convert ``s`` to ``float``; return ``None`` when it cannot be parsed."""
    try:
        parsed = float(s)
    except (TypeError, ValueError):
        return None
    return parsed
|
||||
|
||||
|
||||
def to_int(s):
    """Convert ``s`` to ``int`` via ``float``, truncating toward zero.

    Returns ``None`` when ``s`` is ``None``, is not numeric, or is not finite
    ("nan" raises ``ValueError``, "inf" raises ``OverflowError`` inside
    ``int()``), so a stray non-finite cell cannot abort a whole CSV load.
    """
    try:
        return int(float(s))
    except (TypeError, ValueError, OverflowError):
        return None
|
||||
|
||||
|
||||
def main():
    """Print the tract-level concentrated-costs / dispersed-benefits report.

    Loads DC-hosting tracts and their distance to the nearest cable from the
    database, loads the ACS comparison universe from ``CSV_PATH``, then prints
    four sections (cost concentration, cost-bearer profile, cable-adjacent vs.
    inland tracts, benefit dispersion) and a summary.

    The cursor and connection are closed even when a query fails. Metrics that
    are undefined for the data at hand (no DCs, no population, no valid
    medians) are printed as "n/a" instead of raising.
    """

    def fmt(value, spec):
        """Format ``value`` with ``spec``; an undefined metric (None) -> 'n/a'."""
        return "n/a" if value is None else format(value, spec)

    def opt_float(value):
        """Convert a possibly-NULL database numeric to float."""
        return float(value) if value is not None else None

    conn = connect()
    try:
        cur = conn.cursor()
        try:
            # DC-hosting tracts (the cost-bearing universe) ----------------------
            cur.execute(
                """
                select
                    geoid,
                    statefp,
                    data_center_count,
                    population,
                    households,
                    broadband_subscription_pct,
                    median_household_income,
                    per_capita_income,
                    poverty_rate,
                    non_hispanic_white_pct,
                    non_hispanic_black_pct,
                    hispanic_latino_pct,
                    non_hispanic_asian_pct,
                    primary_industry,
                    land_area_sqm,
                    industry_information_workers,
                    industry_total_workers
                from public.data_center_census_tracts_2024
                """
            )
            dc_tracts = [
                {
                    "geoid": r[0],
                    "statefp": r[1],
                    "dc_count": r[2] or 0,
                    "pop": r[3],
                    "hh": r[4],
                    "broadband_pct": opt_float(r[5]),
                    "mhi": r[6],
                    "pci": r[7],
                    "poverty": opt_float(r[8]),
                    "white_pct": opt_float(r[9]),
                    "black_pct": opt_float(r[10]),
                    "hisp_pct": opt_float(r[11]),
                    "asian_pct": opt_float(r[12]),
                    "primary_industry": r[13],
                    "land_sqm": r[14],
                    "info_workers": r[15],
                    "total_workers": r[16],
                }
                for r in cur.fetchall()
            ]

            # Distance from each DC tract to nearest cable (km) ----------------
            cur.execute(
                """
                with cables as (select ST_Union(geom)::geography g from public.internet_cables)
                select t.geoid,
                       ST_Distance(ST_Centroid(t.geom)::geography, c.g) / 1000.0
                from public.data_center_census_tracts_2024 t, cables c
                """
            )
            # A NULL distance is skipped; that tract then gets dist_km = None.
            dist_by_geoid = {
                r[0]: float(r[1]) for r in cur.fetchall() if r[1] is not None
            }
        finally:
            cur.close()
    finally:
        conn.close()

    for t in dc_tracts:
        t["dist_km"] = dist_by_geoid.get(t["geoid"])

    # Comparison universe from the wider ACS CSV ------------------------
    universe = []
    with open(CSV_PATH, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            universe.append(
                {
                    "geoid": row["geoid"],
                    "statefp": row["statefp"],
                    "pop": to_int(row["population"]),
                    "broadband_pct": to_float(row["broadband_subscription_pct"]),
                    "mhi": to_int(row["median_household_income"]),
                    "pci": to_int(row["per_capita_income"]),
                    "poverty": to_float(row["poverty_rate"]),
                    "white_pct": to_float(row["non_hispanic_white_pct"]),
                    "black_pct": to_float(row["non_hispanic_black_pct"]),
                    "hisp_pct": to_float(row["hispanic_latino_pct"]),
                    "asian_pct": to_float(row["non_hispanic_asian_pct"]),
                }
            )

    dc_geoids = {t["geoid"] for t in dc_tracts}
    non_dc = [u for u in universe if u["geoid"] not in dc_geoids]

    # Restrict comparison to states actually represented in the DC sample
    dc_states = {t["statefp"] for t in dc_tracts}
    universe_in_dc_states = [u for u in universe if u["statefp"] in dc_states]
    non_dc_in_dc_states = [u for u in non_dc if u["statefp"] in dc_states]

    # ============== report ==============
    print("=" * 72)
    print("TRACT-LEVEL CONCENTRATED COSTS / DISPERSED BENEFITS ANALYSIS")
    print("=" * 72)

    total_dc = sum(t["dc_count"] for t in dc_tracts)

    def pct_of_dcs(n):
        """Share of all DCs (in percent) represented by ``n``; 0.0 with no DCs."""
        return n / total_dc * 100 if total_dc else 0.0

    print(f"\nDC-hosting tracts: {len(dc_tracts):,}")
    print(f"Data centers in those tracts: {total_dc:,}")
    print(f"ACS universe (selected states): {len(universe):,} tracts")
    print(f"States represented in DC sample: {len(dc_states)}")
    print(f"Universe restricted to DC states: {len(universe_in_dc_states):,} tracts")

    # --- Cost concentration at the tract level ---
    print("\n" + "-" * 72)
    print("1. COST CONCENTRATION (DCs across tracts)")
    print("-" * 72)
    counts = [t["dc_count"] for t in dc_tracts]
    shares = [c / total_dc for c in counts] if total_dc else []
    g_dc = gini(counts)
    h_dc = hhi(shares) if shares else None
    print(f"Gini of DC counts across DC-hosting tracts: {fmt(g_dc, '.3f')}")
    print(f"HHI of DC shares across DC-hosting tracts: {fmt(h_dc, '.4f')}")
    # Top 1% / 5% / 20% of tracts share; every group holds at least one tract.
    top1 = max(1, len(counts) // 100)
    top5 = max(1, len(counts) // 20)
    top20 = max(1, len(counts) // 5)
    s = sorted(counts, reverse=True)
    top1_pct = pct_of_dcs(sum(s[:top1]))
    top5_pct = pct_of_dcs(sum(s[:top5]))
    top20_pct = pct_of_dcs(sum(s[:top20]))
    print(f"Top 1% of DC-hosting tracts ({top1:>3} tracts) hold "
          f"{top1_pct:5.1f}% of all DCs")
    print(f"Top 5% of DC-hosting tracts ({top5:>3} tracts) hold "
          f"{top5_pct:5.1f}% of all DCs")
    print(f"Top 20% of DC-hosting tracts ({top20:>3} tracts) hold "
          f"{top20_pct:5.1f}% of all DCs")

    # How small a fraction of population lives in a DC tract?
    pop_dc = sum(t["pop"] or 0 for t in dc_tracts)
    pop_universe = sum(u["pop"] or 0 for u in universe_in_dc_states)
    pop_share = pop_dc / pop_universe * 100 if pop_universe else None
    print(f"\nPopulation living in a DC-hosting tract: {pop_dc:>11,}")
    print(f"Total population (DC-states ACS universe): {pop_universe:>11,}")
    if pop_share is not None:
        print(f" -> {pop_share:.2f}% of people in DC-hosting states "
              f"live in a DC-hosting tract")
    # Per-capita DC density
    if pop_dc and total_dc:
        print(f" -> 1 DC per {pop_dc/total_dc:,.0f} residents in DC-hosting tracts")
    if pop_universe and total_dc:
        print(f" vs. 1 DC per {pop_universe/total_dc:,.0f} residents "
              f"averaged across DC-state population")

    # --- Profile of cost-bearing communities ---
    print("\n" + "-" * 72)
    print("2. WHO BEARS THE COSTS? (ACS profile of DC tracts vs. peer tracts)")
    print("-" * 72)
    fields = [
        ("Median household income ($)", "mhi", "{:>10,.0f}"),
        ("Per-capita income ($)", "pci", "{:>10,.0f}"),
        ("Broadband subscription (%)", "broadband_pct", "{:>10,.1f}"),
        ("Poverty rate (%)", "poverty", "{:>10,.1f}"),
        ("Non-Hispanic White (%)", "white_pct", "{:>10,.1f}"),
        ("Non-Hispanic Black (%)", "black_pct", "{:>10,.1f}"),
        ("Hispanic/Latino (%)", "hisp_pct", "{:>10,.1f}"),
        ("Non-Hispanic Asian (%)", "asian_pct", "{:>10,.1f}"),
    ]
    label_w = max(len(lbl) for lbl, *_ in fields)
    print(f"{'Field':<{label_w}} {'DC tracts':>12} {'Non-DC peers':>14} "
          f"{'Δ (DC − peer)':>15}")
    for label, key, cell_fmt in fields:
        dc_med = median([t[key] for t in dc_tracts])
        peer_med = median([u[key] for u in non_dc_in_dc_states])
        if dc_med is None or peer_med is None:
            continue
        delta = dc_med - peer_med
        cell_dc = cell_fmt.format(dc_med)
        cell_pe = cell_fmt.format(peer_med)
        cell_dl = cell_fmt.format(delta)
        print(f"{label:<{label_w}} {cell_dc} {cell_pe} {cell_dl}")

    print("\nPopulation-weighted means (DC tracts):")
    pops = [t["pop"] for t in dc_tracts]
    for label, key, _ in fields:
        wm = wmean([t[key] for t in dc_tracts], pops)
        if wm is not None:
            print(f" {label:<{label_w}} {wm:>12,.1f}")

    print("\nPrimary-industry mix of DC-hosting tracts (count of tracts):")
    for industry, n in Counter(t["primary_industry"] for t in dc_tracts).most_common(10):
        print(f" {n:>4} {industry}")

    # --- Cable vs. inland subgroups ---
    print("\n" + "-" * 72)
    print("3. CABLE-ADJACENT vs. INLAND DC TRACTS")
    print("-" * 72)
    near = [t for t in dc_tracts if t.get("dist_km") is not None and t["dist_km"] <= 100]
    far = [t for t in dc_tracts if t.get("dist_km") is not None and t["dist_km"] > 100]
    print(f"≤100 km from a submarine cable: {len(near):>3} tracts, "
          f"{sum(t['dc_count'] for t in near):>4} DCs")
    print(f">100 km from a submarine cable: {len(far):>3} tracts, "
          f"{sum(t['dc_count'] for t in far):>4} DCs")
    if near and far:
        for label, key, spec in (
            (" Median MHI", "mhi", ">10,.0f"),
            (" Median broadband %", "broadband_pct", ">10,.1f"),
            (" Median DC count", "dc_count", ">10,.0f"),
        ):
            # A subgroup whose values are all NULL has no median -> "n/a".
            near_med = median([t[key] for t in near])
            far_med = median([t[key] for t in far])
            print(f"{label:<28} near={fmt(near_med, spec)} "
                  f"far={fmt(far_med, spec)}")

    # --- Benefit-side proxy ---
    print("\n" + "-" * 72)
    print("4. BENEFIT DISPERSION (broadband subscribers across all tracts)")
    print("-" * 72)
    # Total broadband subscribers approx = households * broadband_pct.
    # Households are not in the CSV, so estimate them as population / 2.5.
    subs = []
    for u in universe_in_dc_states:
        if u["pop"] and u["broadband_pct"] is not None:
            est_hh = u["pop"] / 2.5
            subs.append(est_hh * u["broadband_pct"] / 100.0)
    total_subs = sum(subs)
    sg = gini(subs)
    sh = hhi([x / total_subs for x in subs]) if total_subs else None
    # How many times more concentrated DCs are than subscribers.
    ratio = h_dc / sh if (h_dc is not None and sh) else None
    print(f"Estimated total broadband subscribers (DC states): {total_subs:>14,.0f}")
    print(f"Gini of subscribers across {len(subs):,} tracts: {fmt(sg, '.3f')}")
    print(f"HHI of subscribers across tracts: {fmt(sh, '.5f')}")
    # Compare to DC HHI
    print("\nSide-by-side concentration (lower = more dispersed):")
    print(f" HHI of DCs across DC-hosting tracts: {fmt(h_dc, '.4f')}")
    print(f" HHI of broadband subs across DC-state tracts: {fmt(sh, '.5f')} "
          f"({fmt(ratio, '.0f')}x more concentrated for DCs)")

    print("\n" + "=" * 72)
    print("BOTTOM LINE")
    print("=" * 72)

    def compare(key, higher, lower):
        """Word describing DC tracts vs. peers on ``key``; 'n/a' if undefined."""
        dc_med = median([t[key] for t in dc_tracts])
        peer_med = median([u[key] for u in non_dc_in_dc_states])
        if dc_med is None or peer_med is None:
            return "n/a"
        return higher if dc_med > peer_med else lower

    wealth_word = compare("mhi", "wealthier", "poorer")
    broadband_word = compare("broadband_pct", "higher", "lower")
    print(f"""
- DCs are extremely concentrated at the tract level: top 1% of host tracts
hold {top1_pct:.0f}% of all DCs; top 5% hold {top5_pct:.0f}%.
- Only {fmt(pop_share, '.2f')}% of residents of the DC-hosting states actually
live in a DC-hosting tract — costs (land use, power draw, water, traffic,
noise) fall on a tiny minority of communities.
- DC-hosting tracts skew {wealth_word} and {broadband_word} broadband than peer
tracts. See deltas above for the demographic profile.
- Broadband subscribers (proxy for who consumes cloud services) are far more
evenly distributed than DCs — HHI roughly {fmt(ratio, '.0f')}× lower.
That asymmetry IS the classic concentrated-cost / dispersed-benefit shape.
""")


if __name__ == "__main__":
    main()
|
||||
397
scripts/build_fcc_bdc_broadband_connection_table.py
Normal file
397
scripts/build_fcc_bdc_broadband_connection_table.py
Normal file
@@ -0,0 +1,397 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build data-center broadband connection tables.
|
||||
|
||||
Creates a per-data-center broadband connection table and, when FCC BDC API
|
||||
credentials are available, stores the FCC BDC public download catalog.
|
||||
|
||||
Required DB env vars:
|
||||
PGWEB_HOST, PGWEB_PORT, PGWEB_USER, PGWEB_PASSWORD
|
||||
|
||||
FCC API env vars:
|
||||
FCC_USERNAME or FCC_BDC_USERNAME - FCC User Registration username/email
|
||||
FCC_API_KEY or FCC_HASH_VALUE - BDC public API hash_value token
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import date, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import psycopg2
|
||||
import requests
|
||||
from psycopg2.extras import Json, execute_values
|
||||
|
||||
|
||||
# Name of the PostgreSQL database this script works against.
DB_NAME = "data_centers"

# Schema-qualified tables read by this script.
MASTER_TABLE = "public.master_data_centers"
TRACT_TABLE = "public.data_center_census_tracts_2024"
# Schema-qualified tables created and filled by this script.
AS_OF_TABLE = "public.fcc_bdc_api_as_of_dates"
FILES_TABLE = "public.fcc_bdc_availability_files"
CONNECTION_TABLE = "public.data_center_broadband_connection"

# Root URL of the FCC BDC public API and the User-Agent sent with requests.
FCC_BASE_URL = "https://broadbandmap.fcc.gov/api/public"
USER_AGENT = "data-center-fcc-bdc-loader/1.0"
|
||||
|
||||
|
||||
def load_zsh_secrets() -> None:
    """Copy variables exported by ``~/.zsh_secrets`` into ``os.environ``.

    Does nothing when the file is absent. Otherwise a login ``zsh`` sources
    the file (its own output is discarded) and dumps the resulting
    environment; every ``KEY=value`` line whose key is not already set in
    this process is added. Values are never printed.

    Raises ``subprocess.CalledProcessError`` when the shell exits non-zero.
    """
    if not (Path.home() / ".zsh_secrets").exists():
        return

    completed = subprocess.run(
        ["zsh", "-lc", "source ~/.zsh_secrets >/dev/null 2>&1; env"],
        check=True,
        capture_output=True,
        text=True,
    )
    for entry in completed.stdout.splitlines():
        name, separator, val = entry.partition("=")
        if not separator:
            continue
        # Variables already present in this process take precedence.
        if name and name not in os.environ:
            os.environ[name] = val
|
||||
|
||||
|
||||
def require_env(keys: list[str]) -> None:
    """Raise ``RuntimeError`` naming every variable in ``keys`` that is unset or empty."""
    absent = []
    for key in keys:
        if not os.getenv(key):
            absent.append(key)
    if not absent:
        return
    raise RuntimeError("Missing required env vars: " + ", ".join(absent))
|
||||
|
||||
|
||||
def get_conn():
    """Open a psycopg2 connection to the ``DB_NAME`` database.

    Host, port, user and password come from the ``PGWEB_*`` environment
    variables; a missing variable raises ``KeyError``.
    """
    return psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        # Use the module constant so the database name lives in one place.
        dbname=DB_NAME,
    )
|
||||
|
||||
|
||||
def fcc_credentials() -> tuple[str | None, str | None]:
    """Return ``(username, hash_value)`` for the FCC BDC API from the environment.

    ``FCC_USERNAME`` takes precedence over ``FCC_BDC_USERNAME`` and
    ``FCC_API_KEY`` over ``FCC_HASH_VALUE``; an empty primary variable falls
    through to the alternative. Either element may be ``None``.
    """
    lookup = os.environ.get
    user = lookup("FCC_USERNAME") or lookup("FCC_BDC_USERNAME")
    token = lookup("FCC_API_KEY") or lookup("FCC_HASH_VALUE")
    return user, token
|
||||
|
||||
|
||||
def fcc_get(path: str, *, params: dict[str, Any] | None = None) -> dict[str, Any]:
    """GET ``FCC_BASE_URL + path`` and return the decoded JSON payload.

    Credentials are sent as ``username`` / ``hash_value`` request headers.

    Raises:
        RuntimeError: credentials are missing, or the payload reports status
            code 401/403 or ``status == "fail"``.
        requests.HTTPError: the HTTP response status is an error.
    """
    username, hash_value = fcc_credentials()
    if not (username and hash_value):
        raise RuntimeError(
            "FCC BDC API requires FCC_USERNAME or FCC_BDC_USERNAME plus "
            "FCC_API_KEY or FCC_HASH_VALUE."
        )

    request_headers = {
        "username": username,
        "hash_value": hash_value,
        "user-agent": USER_AGENT,
        "accept": "application/json",
    }
    response = requests.get(
        f"{FCC_BASE_URL}{path}",
        headers=request_headers,
        params=params or {},
        timeout=60,
    )
    response.raise_for_status()

    payload = response.json()
    # The payload itself can report a failure, so inspect it as well.
    denied = str(payload.get("status_code")) in {"401", "403"}
    if denied or payload.get("status") == "fail":
        raise RuntimeError(f"FCC API error for {path}: {payload}")
    return payload
|
||||
|
||||
|
||||
def parse_date(value: Any) -> date | None:
    """Coerce ``value`` to a ``datetime.date``.

    * ``None`` and ``""`` give ``None``.
    * A ``datetime`` is reduced to its calendar date. ``datetime`` subclasses
      ``date``, so it must be checked first or it would be returned unchanged.
    * A ``date`` is returned as-is.
    * Anything else is converted with ``str()`` and its first 10 characters
      are parsed as ``YYYY-MM-DD``.

    Raises ``ValueError`` when the text is not a valid ``YYYY-MM-DD`` date.
    """
    if value in (None, ""):
        return None
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    return datetime.strptime(str(value)[:10], "%Y-%m-%d").date()
|
||||
|
||||
|
||||
def to_int(value: Any) -> int | None:
    """Parse ``value`` as an integer, ignoring thousands separators.

    Returns ``None`` for ``None``, ``""`` and anything ``int()`` rejects
    (including decimal strings such as ``"12.5"``).
    """
    if value in (None, ""):
        return None
    try:
        parsed = int(str(value).replace(",", ""))
    except (TypeError, ValueError):
        return None
    return parsed
|
||||
|
||||
|
||||
def create_tables(cur) -> None:
    """Create the PostGIS extension plus the FCC BDC tables and indexes.

    Every statement uses ``if not exists``. Executed through ``cur``; this
    function does not commit.

    Tables created:
      * ``AS_OF_TABLE``      - as-of dates, keyed by (data_type, as_of_date)
      * ``FILES_TABLE``      - availability download catalog, keyed by
                               (as_of_date, file_id)
      * ``CONNECTION_TABLE`` - one row per data center, with a foreign key to
                               ``MASTER_TABLE``
    """
    cur.execute("create extension if not exists postgis")

    cur.execute(
        f"""
        create table if not exists {AS_OF_TABLE} (
            data_type text not null,
            as_of_date date not null,
            raw jsonb not null,
            fetched_at timestamptz not null default now(),
            primary key (data_type, as_of_date)
        )
        """
    )

    cur.execute(
        f"""
        create table if not exists {FILES_TABLE} (
            as_of_date date not null,
            file_id bigint not null,
            category text,
            subcategory text,
            technology_type text,
            technology_code text,
            technology_code_desc text,
            speed_tier text,
            state_fips text,
            state_name text,
            provider_id bigint,
            provider_name text,
            file_type text,
            file_name text,
            record_count bigint,
            raw jsonb not null,
            fetched_at timestamptz not null default now(),
            primary key (as_of_date, file_id)
        )
        """
    )
    cur.execute(
        f"create index if not exists fcc_bdc_availability_files_category_idx "
        f"on {FILES_TABLE} (category, subcategory)"
    )
    cur.execute(
        f"create index if not exists fcc_bdc_availability_files_state_idx "
        f"on {FILES_TABLE} (state_fips)"
    )
    cur.execute(
        f"create index if not exists fcc_bdc_availability_files_provider_idx "
        f"on {FILES_TABLE} (provider_id)"
    )

    # The foreign key targets MASTER_TABLE (the table rebuild_connection_base
    # selects from) rather than a second hard-coded copy of its name.
    cur.execute(
        f"""
        create table if not exists {CONNECTION_TABLE} (
            master_id text primary key references {MASTER_TABLE}(master_id) on delete cascade,
            source text,
            name text,
            operator text,
            city text,
            state text,
            country text,
            longitude double precision,
            latitude double precision,
            geom geometry(Point, 4326),

            census_tract_geoid text,
            census_broadband_subscription_pct numeric,

            fcc_bdc_status text not null,
            fcc_bdc_as_of_date date,
            fcc_bdc_geography_type text,
            fcc_bdc_geoid text,

            fcc_provider_count integer,
            fcc_fiber_provider_count integer,
            fcc_cable_provider_count integer,
            fcc_fixed_wireless_provider_count integer,
            fcc_max_advertised_download_mbps numeric,
            fcc_max_advertised_upload_mbps numeric,
            fcc_100_20_provider_count integer,
            fcc_summary_json jsonb,

            fetched_at timestamptz not null default now(),
            updated_at timestamptz not null default now()
        )
        """
    )
    cur.execute(
        f"create index if not exists data_center_broadband_connection_geom_gix "
        f"on {CONNECTION_TABLE} using gist (geom)"
    )
    cur.execute(
        f"create index if not exists data_center_broadband_connection_tract_idx "
        f"on {CONNECTION_TABLE} (census_tract_geoid)"
    )
    cur.execute(
        f"create index if not exists data_center_broadband_connection_status_idx "
        f"on {CONNECTION_TABLE} (fcc_bdc_status)"
    )
|
||||
|
||||
|
||||
def rebuild_connection_base(cur, status: str) -> int:
    """Empty ``CONNECTION_TABLE`` and refill it from ``MASTER_TABLE``.

    Each data center is left-joined to ``TRACT_TABLE`` on ``geoid`` to pick up
    the tract's broadband subscription percentage, and every row receives
    ``status`` as its ``fcc_bdc_status``.

    Returns the number of rows in ``CONNECTION_TABLE`` afterwards.
    """
    insert_sql = f"""
        insert into {CONNECTION_TABLE} (
            master_id, source, name, operator, city, state, country,
            longitude, latitude, geom,
            census_tract_geoid, census_broadband_subscription_pct,
            fcc_bdc_status
        )
        select
            dc.master_id, dc.source, dc.name, dc.operator, dc.city, dc.state, dc.country,
            dc.longitude, dc.latitude, dc.geom,
            dc.geoid as census_tract_geoid,
            tr.broadband_subscription_pct as census_broadband_subscription_pct,
            %s as fcc_bdc_status
        from {MASTER_TABLE} dc
        left join {TRACT_TABLE} tr on tr.geoid::text = dc.geoid::text
        """

    cur.execute(f"truncate {CONNECTION_TABLE}")
    # The status is bound as a query parameter, not interpolated into the SQL.
    cur.execute(insert_sql, (status,))
    cur.execute(f"select count(*) from {CONNECTION_TABLE}")
    (row_count,) = cur.fetchone()[:1]
    return row_count
|
||||
|
||||
|
||||
def latest_availability_date(rows: list[dict[str, Any]]) -> date | None:
    """Return the newest ``as_of_date`` among availability rows.

    A row counts when its ``data_type`` (case-insensitive) is "availability"
    or "availability data". Returns ``None`` when no such row has a date.
    """
    wanted = {"availability", "availability data"}
    found = []
    for row in rows:
        if str(row.get("data_type", "")).lower() not in wanted:
            continue
        parsed = parse_date(row.get("as_of_date"))
        if parsed is not None:
            found.append(parsed)
    if not found:
        return None
    return max(found)
|
||||
|
||||
|
||||
def load_as_of_dates(cur) -> date:
    """Fetch the FCC as-of dates, upsert them, and return the newest availability date.

    Rows without an ``as_of_date`` are skipped. The rest are upserted into
    ``AS_OF_TABLE`` keyed by (data_type, as_of_date), refreshing ``raw`` and
    ``fetched_at`` on conflict.

    Raises ``RuntimeError`` when the response holds no availability date.
    """
    rows = fcc_get("/map/listAsOfDates").get("data") or []

    records = []
    for entry in rows:
        parsed = parse_date(entry.get("as_of_date"))
        if parsed:
            records.append((entry.get("data_type"), parsed, Json(entry)))

    if records:
        upsert_sql = f"""
            insert into {AS_OF_TABLE} (data_type, as_of_date, raw)
            values %s
            on conflict (data_type, as_of_date) do update set
                raw = excluded.raw,
                fetched_at = now()
            """
        execute_values(cur, upsert_sql, records, page_size=1000)

    newest = latest_availability_date(rows)
    if newest is not None:
        return newest
    raise RuntimeError(f"Could not find an availability as_of_date in FCC response: {rows}")
|
||||
|
||||
|
||||
def load_availability_file_catalog(cur, as_of_date: date) -> int:
    """Replace the fixed-broadband file catalog for ``as_of_date`` and return its row count.

    Rows from the FCC listing that have no usable ``file_id`` are skipped.
    Existing catalog rows for the date are deleted only when at least one new
    row is available, so an empty response leaves the table untouched.
    """
    payload = fcc_get(
        f"/map/downloads/listAvailabilityData/{as_of_date:%Y-%m-%d}",
        params={"technology_type": "Fixed Broadband"},
    )

    def as_catalog_tuple(file_id, row):
        # Field order must match the column list of the insert statement below.
        return (
            as_of_date,
            file_id,
            row.get("category"),
            row.get("subcategory"),
            row.get("technology_type"),
            row.get("technology_code"),
            row.get("technology_code_desc"),
            row.get("speed_tier"),
            row.get("state_fips"),
            row.get("state_name"),
            to_int(row.get("provider_id")),
            row.get("provider_name"),
            row.get("file_type"),
            row.get("file_name"),
            to_int(row.get("record_count")),
            Json(row),
        )

    values = []
    for row in payload.get("data") or []:
        file_id = to_int(row.get("file_id"))
        if file_id is not None:
            values.append(as_catalog_tuple(file_id, row))

    if not values:
        return 0

    cur.execute(f"delete from {FILES_TABLE} where as_of_date = %s", (as_of_date,))
    insert_sql = f"""
        insert into {FILES_TABLE} (
            as_of_date, file_id, category, subcategory, technology_type,
            technology_code, technology_code_desc, speed_tier, state_fips,
            state_name, provider_id, provider_name, file_type, file_name,
            record_count, raw
        )
        values %s
        """
    execute_values(cur, insert_sql, values, page_size=1000)
    return len(values)
|
||||
|
||||
|
||||
def main() -> int:
    """Rebuild the base connection table and, unless skipped, load the FCC file catalog.

    Returns:
        0 on success (including ``--skip-fcc``); 2 when the FCC credentials
        are incomplete and the catalog was therefore not loaded.

    Raises:
        RuntimeError: if ``--as-of-date`` is supplied but ``parse_date``
            cannot turn it into a date.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--skip-fcc", action="store_true", help="Only create/rebuild the base connection table.")
    parser.add_argument("--as-of-date", help="FCC BDC availability as-of date, YYYY-MM-DD. Defaults to latest.")
    args = parser.parse_args()

    load_zsh_secrets()
    require_env(["PGWEB_HOST", "PGWEB_PORT", "PGWEB_USER", "PGWEB_PASSWORD"])

    username, hash_value = fcc_credentials()
    status = "pending_fcc_username" if hash_value and not username else "pending_fcc_catalog"
    if args.skip_fcc:
        status = "fcc_skipped"

    with get_conn() as conn:
        with conn.cursor() as cur:
            create_tables(cur)
            n_connection = rebuild_connection_base(cur, status)
            print(f"{CONNECTION_TABLE}: {n_connection:,} base rows")

            if args.skip_fcc:
                conn.commit()
                return 0

            if not username or not hash_value:
                print(
                    "FCC catalog not loaded: set FCC_USERNAME or FCC_BDC_USERNAME "
                    "alongside FCC_API_KEY/FCC_HASH_VALUE in ~/.zsh_secrets.",
                    file=sys.stderr,
                )
                conn.commit()
                return 2

            if args.as_of_date:
                as_of_date = parse_date(args.as_of_date)
                if as_of_date is None:
                    # Without this check the None would reach the catalog URL's
                    # date formatting and fail with an opaque TypeError.
                    raise RuntimeError(f"Invalid --as-of-date: {args.as_of_date}")
            else:
                as_of_date = load_as_of_dates(cur)
            n_files = load_availability_file_catalog(cur, as_of_date)

            # Stamp every connection row with the catalog date that was loaded.
            cur.execute(
                f"""
                update {CONNECTION_TABLE}
                set fcc_bdc_status = 'fcc_catalog_loaded',
                    fcc_bdc_as_of_date = %s,
                    updated_at = now()
                """,
                (as_of_date,),
            )
        conn.commit()

    print(f"{AS_OF_TABLE}: loaded latest availability date {as_of_date}")
    print(f"{FILES_TABLE}: {n_files:,} fixed-broadband file catalog rows")
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
806
scripts/build_fcc_bdc_location_provider_aggregates.py
Normal file
806
scripts/build_fcc_bdc_location_provider_aggregates.py
Normal file
@@ -0,0 +1,806 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build FCC BDC provider aggregates for data-center counties and tracts.
|
||||
|
||||
This script uses FCC BDC State / Location Coverage files. Those files are
|
||||
provider/location-level and include block GEOIDs, so they can be aggregated to
|
||||
county and tract provider counts for only the geographies that contain data
|
||||
centers.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import tempfile
|
||||
import zipfile
|
||||
from collections.abc import Iterable
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
from build_fcc_bdc_broadband_connection_table import (
|
||||
CONNECTION_TABLE,
|
||||
FCC_BASE_URL,
|
||||
FILES_TABLE,
|
||||
USER_AGENT,
|
||||
fcc_credentials,
|
||||
get_conn,
|
||||
load_zsh_secrets,
|
||||
parse_date,
|
||||
require_env,
|
||||
)
|
||||
|
||||
|
||||
# Tables owned by this script.
DETAIL_TABLE = "public.fcc_bdc_location_provider_geography_provider"  # one row per file/geography/provider
AGG_TABLE = "public.fcc_bdc_location_provider_aggregate"  # provider counts per geography
PROGRESS_TABLE = "public.fcc_bdc_location_provider_file_progress"  # files already processed
CROSSWALK_TABLE = "public.fcc_bdc_geoid_crosswalk"  # source GEOID -> FCC GEOID overrides

# Technology codes requested by default. Within this script "10" is treated as
# copper, "40" as cable, "50" as fiber and "70"/"71"/"72" as fixed wireless.
TERRESTRIAL_TECHNOLOGY_CODES = ("10", "40", "50", "70", "71", "72")
FIXED_WIRELESS_CODES = {"70", "71", "72"}

# The only CSV columns read from each FCC location-coverage file.
CSV_USECOLS = [
    "provider_id",
    "block_geoid",
    "technology",
    "max_advertised_download_speed",
    "max_advertised_upload_speed",
    "business_residential_code",
]

# Connecticut tract GEOIDs keyed by planning-region county equivalent, mapped to
# the legacy-county GEOID that FCC BDC block GEOIDs use (same tract code).
CT_PLANNING_TO_LEGACY_TRACT_GEOIDS = {
    "09110520302": "09003520302",
    "09120090500": "09001090500",
    "09170175800": "09009175800",
    "09190020101": "09001020101",
    "09190020900": "09001020900",
    "09190044300": "09001044300",
}
|
||||
|
||||
|
||||
def fcc_download_headers() -> dict[str, str]:
    """Return the HTTP headers sent with FCC BDC file downloads.

    Raises:
        RuntimeError: if either the username or the hash value is missing.
    """
    username, hash_value = fcc_credentials()
    if username and hash_value:
        return {
            "username": username,
            "hash_value": hash_value,
            "user-agent": USER_AGENT,
            "accept": "application/zip,*/*",
        }
    raise RuntimeError(
        "FCC BDC API requires FCC_USERNAME or FCC_BDC_USERNAME plus "
        "FCC_API_KEY or FCC_HASH_VALUE."
    )
|
||||
|
||||
|
||||
def normalize_codes(values: Iterable[str]) -> tuple[str, ...]:
    """Return the values as whitespace-stripped strings, dropping blank entries."""
    stripped = (str(value).strip() for value in values)
    return tuple(code for code in stripped if code)
|
||||
|
||||
|
||||
def create_tables(cur) -> None:
    """Create the location-provider tables and add FCC columns to the connection table.

    All statements are idempotent (``if not exists``): the detail, aggregate,
    progress and GEOID-crosswalk tables, the detail lookup index, and the
    county/tract provider columns on ``CONNECTION_TABLE``.
    """
    ddl_statements = (
        f"""
        create table if not exists {DETAIL_TABLE} (
            as_of_date date not null,
            file_id bigint not null,
            geography_type text not null check (geography_type in ('County', 'Tract')),
            geoid text not null,
            provider_id bigint not null,
            has_fiber boolean not null default false,
            has_cable boolean not null default false,
            has_fixed_wireless boolean not null default false,
            has_copper boolean not null default false,
            has_100_20 boolean not null default false,
            has_business boolean not null default false,
            has_business_fiber boolean not null default false,
            has_business_100_20 boolean not null default false,
            max_advertised_download_mbps numeric,
            max_advertised_upload_mbps numeric,
            matched_location_rows bigint not null default 0,
            updated_at timestamptz not null default now(),
            primary key (as_of_date, file_id, geography_type, geoid, provider_id)
        )
        """,
        f"create index if not exists fcc_bdc_location_provider_geo_idx "
        f"on {DETAIL_TABLE} (as_of_date, geography_type, geoid)",
        f"""
        create table if not exists {AGG_TABLE} (
            as_of_date date not null,
            geography_type text not null check (geography_type in ('County', 'Tract')),
            geoid text not null,
            provider_count integer not null,
            fiber_provider_count integer not null,
            cable_provider_count integer not null,
            fixed_wireless_provider_count integer not null,
            copper_provider_count integer not null,
            provider_100_20_count integer not null,
            business_provider_count integer not null,
            business_fiber_provider_count integer not null,
            business_100_20_provider_count integer not null,
            max_advertised_download_mbps numeric,
            max_advertised_upload_mbps numeric,
            matched_location_rows bigint not null,
            provider_file_rows bigint not null,
            updated_at timestamptz not null default now(),
            primary key (as_of_date, geography_type, geoid)
        )
        """,
        f"""
        create table if not exists {PROGRESS_TABLE} (
            as_of_date date not null,
            file_id bigint not null,
            state_fips text not null,
            technology_code text,
            technology_code_desc text,
            record_count bigint,
            matched_location_rows bigint not null,
            provider_geo_rows bigint not null,
            processed_at timestamptz not null default now(),
            primary key (as_of_date, file_id)
        )
        """,
        f"""
        create table if not exists {CROSSWALK_TABLE} (
            source_geography_type text not null,
            source_geoid text not null,
            fcc_geography_type text not null,
            fcc_geoid text not null,
            method text not null,
            notes text,
            updated_at timestamptz not null default now(),
            primary key (source_geography_type, source_geoid, fcc_geography_type)
        )
        """,
    )
    for statement in ddl_statements:
        cur.execute(statement)

    # The county and tract column families share the same suffixes and types,
    # so they are generated as fcc_<level>_<suffix> instead of listed twice.
    per_level_columns = (
        ("provider_count", "integer"),
        ("fiber_provider_count", "integer"),
        ("cable_provider_count", "integer"),
        ("fixed_wireless_provider_count", "integer"),
        ("100_20_provider_count", "integer"),
        ("business_provider_count", "integer"),
        ("business_fiber_provider_count", "integer"),
        ("business_100_20_provider_count", "integer"),
        ("max_advertised_download_mbps", "numeric"),
        ("max_advertised_upload_mbps", "numeric"),
    )
    column_definitions = [
        "fcc_provider_geography_type text",
        "fcc_provider_geoid text",
    ]
    for level in ("county", "tract"):
        column_definitions.extend(
            f"fcc_{level}_{suffix} {sql_type}" for suffix, sql_type in per_level_columns
        )

    for definition in column_definitions:
        cur.execute(f"alter table {CONNECTION_TABLE} add column if not exists {definition}")
|
||||
|
||||
|
||||
def seed_geoid_crosswalk(cur) -> None:
    """Upsert the Connecticut tract-to-tract GEOID overrides into ``CROSSWALK_TABLE``.

    One row is written per entry of ``CT_PLANNING_TO_LEGACY_TRACT_GEOIDS``;
    an existing row for the same source GEOID is overwritten.
    """
    method = "ct_planning_region_to_legacy_county_same_tractce"
    notes = "Connecticut 2024 tract GEOIDs use planning-region county equivalents; FCC BDC block GEOIDs use legacy county codes."
    values = [
        ("Tract", source_geoid, "Tract", fcc_geoid, method, notes)
        for source_geoid, fcc_geoid in CT_PLANNING_TO_LEGACY_TRACT_GEOIDS.items()
    ]
    upsert_sql = f"""
        insert into {CROSSWALK_TABLE} (
            source_geography_type, source_geoid, fcc_geography_type,
            fcc_geoid, method, notes
        )
        values %s
        on conflict (source_geography_type, source_geoid, fcc_geography_type)
        do update set
            fcc_geoid = excluded.fcc_geoid,
            method = excluded.method,
            notes = excluded.notes,
            updated_at = now()
        """
    execute_values(cur, upsert_sql, values)
|
||||
|
||||
|
||||
def latest_catalog_date(cur) -> date:
    """Return the most recent ``as_of_date`` present in ``FILES_TABLE``.

    Raises:
        RuntimeError: if the catalog table has no rows.
    """
    cur.execute(f"select max(as_of_date) from {FILES_TABLE}")
    row = cur.fetchone()
    newest = row[0]
    if newest is not None:
        return newest
    raise RuntimeError(f"No FCC catalog rows found in {FILES_TABLE}. Run the FCC catalog load first.")
|
||||
|
||||
|
||||
def target_geographies(cur, states: tuple[str, ...] | None = None) -> tuple[set[str], set[str], set[str]]:
    """Return the (state FIPS, county GEOID, tract GEOID) sets that contain data centers.

    The sets are derived from ``census_tract_geoid`` prefixes (2, 5 and 11
    characters) in ``CONNECTION_TABLE``, optionally restricted to ``states``.
    Tracts with a crosswalk entry also contribute their FCC-side tract GEOID
    and that GEOID's county prefix.
    """
    where_clause = ""
    params: list[Any] = []
    if states:
        where_clause = "where left(census_tract_geoid, 2) = any(%s)"
        params.append(list(states))

    cur.execute(
        f"""
        select distinct
            left(census_tract_geoid, 2) as state_fips,
            left(census_tract_geoid, 5) as county_geoid,
            left(census_tract_geoid, 11) as tract_geoid
        from {CONNECTION_TABLE}
        {where_clause}
        """,
        params,
    )

    states_found: set[str] = set()
    counties: set[str] = set()
    tracts: set[str] = set()
    for state_fips, county_geoid, tract_geoid in cur.fetchall():
        # Null or empty prefixes (rows without a tract GEOID) are ignored.
        if state_fips:
            states_found.add(state_fips)
        if county_geoid:
            counties.add(county_geoid)
        if tract_geoid:
            tracts.add(tract_geoid)

    if not tracts:
        return states_found, counties, tracts

    cur.execute(
        f"""
        select fcc_geoid
        from {CROSSWALK_TABLE}
        where source_geography_type = 'Tract'
          and fcc_geography_type = 'Tract'
          and source_geoid = any(%s)
        """,
        (list(tracts),),
    )
    fcc_tracts = {row[0] for row in cur.fetchall() if row[0]}
    tracts |= fcc_tracts
    counties |= {geoid[:5] for geoid in fcc_tracts}
    return states_found, counties, tracts
|
||||
|
||||
|
||||
def catalog_files(
    cur,
    as_of_date: date,
    states: set[str],
    technology_codes: tuple[str, ...],
    limit: int | None,
) -> list[dict[str, Any]]:
    """List the State / Location Coverage catalog files to process.

    Files are filtered by as-of date, state FIPS and technology code, and
    ordered by state, technology code and file id. When ``limit`` is not
    None the result is sliced with ``[:limit]``.
    """
    select_sql = f"""
        select file_id, state_fips, technology_code, technology_code_desc, file_name, record_count
        from {FILES_TABLE}
        where as_of_date = %s
          and category = 'State'
          and subcategory = 'Location Coverage'
          and state_fips = any(%s)
          and technology_code = any(%s)
        order by state_fips, technology_code, file_id
        """
    cur.execute(select_sql, (as_of_date, list(states), list(technology_codes)))

    files: list[dict[str, Any]] = []
    for file_id, state_fips, technology_code, technology_code_desc, file_name, record_count in cur.fetchall():
        files.append(
            {
                "file_id": int(file_id),
                "state_fips": state_fips,
                "technology_code": str(technology_code),
                "technology_code_desc": technology_code_desc,
                "file_name": file_name,
                "record_count": record_count,
            }
        )

    if limit is None:
        return files
    return files[:limit]
|
||||
|
||||
|
||||
def progress_done(cur, as_of_date: date, file_id: int) -> bool:
    """Return True when ``PROGRESS_TABLE`` already has a row for this date and file."""
    query = f"select 1 from {PROGRESS_TABLE} where as_of_date = %s and file_id = %s"
    cur.execute(query, (as_of_date, file_id))
    row = cur.fetchone()
    return row is not None
|
||||
|
||||
|
||||
def download_file(file_id: int, dest_dir: Path) -> Path:
    """Stream one FCC availability file into ``dest_dir`` and return its path.

    The response body is written in 1 MiB chunks to
    ``fcc_bdc_availability_<file_id>.zip``. If the request or the write fails,
    the partially written file is removed before the error propagates, so a
    failed download leaves nothing behind for the caller to clean up.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        RuntimeError: if FCC credentials are missing (from ``fcc_download_headers``).
    """
    url = f"{FCC_BASE_URL}/map/downloads/downloadFile/availability/{file_id}"
    path = dest_dir / f"fcc_bdc_availability_{file_id}.zip"
    try:
        with requests.get(url, headers=fcc_download_headers(), stream=True, timeout=(15, 300)) as response:
            response.raise_for_status()
            with path.open("wb") as fh:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        fh.write(chunk)
    except BaseException:
        # Also covers interrupts: never leave a truncated zip on disk.
        path.unlink(missing_ok=True)
        raise
    return path
|
||||
|
||||
|
||||
def normalize_block_geoid(series: pd.Series) -> pd.Series:
    """Return block GEOIDs as strings without a trailing ``.0``, zero-padded to 15 characters."""
    as_text = series.astype("string")
    without_float_suffix = as_text.str.replace(r"\.0$", "", regex=True)
    return without_float_suffix.str.zfill(15)
|
||||
|
||||
|
||||
def summarize_matches(
    chunk: pd.DataFrame,
    geography_type: str,
    target_geoids: set[str],
) -> tuple[pd.DataFrame, int]:
    """Summarise one CSV chunk per (geoid, provider) for the target geographies.

    The GEOID is the first 5 characters of ``block_geoid_norm`` for
    ``"County"`` and the first 11 otherwise. Rows outside ``target_geoids``
    or with a non-numeric ``provider_id`` are dropped.

    Returns:
        The grouped frame (technology/speed/business flags, maximum advertised
        speeds and a row count per geoid and provider) and the number of rows
        that were summarised. An empty frame and 0 when nothing matched.
    """
    prefix_length = 5 if geography_type == "County" else 11
    geoid = chunk["block_geoid_norm"].str[:prefix_length]

    matched = chunk[geoid.isin(target_geoids)].copy()
    if matched.empty:
        return pd.DataFrame(), 0

    matched["geoid"] = geoid.loc[matched.index]
    matched["provider_id_num"] = pd.to_numeric(matched["provider_id"], errors="coerce")
    matched = matched[matched["provider_id_num"].notna()].copy()
    if matched.empty:
        return pd.DataFrame(), 0

    # Technology codes may arrive float-formatted ("50.0"); strip the suffix.
    tech = matched["technology"].astype("string").str.replace(r"\.0$", "", regex=True)
    down = pd.to_numeric(matched["max_advertised_download_speed"], errors="coerce")
    upload = pd.to_numeric(matched["max_advertised_upload_speed"], errors="coerce")
    business_code = matched["business_residential_code"].astype("string").str.upper().fillna("")
    is_business = business_code.isin(["B", "X"])
    is_fiber = tech.eq("50")
    meets_100_20 = down.ge(100) & upload.ge(20)

    matched = matched.assign(
        provider_id_num=matched["provider_id_num"].astype("int64"),
        has_fiber=is_fiber,
        has_cable=tech.eq("40"),
        has_fixed_wireless=tech.isin(FIXED_WIRELESS_CODES),
        has_copper=tech.eq("10"),
        has_100_20=meets_100_20,
        has_business=is_business,
        has_business_fiber=is_business & is_fiber,
        has_business_100_20=is_business & meets_100_20,
        max_down=down,
        max_up=upload,
        matched_location_rows=1,
    )

    # "max" over a flag column acts as a logical OR within each group.
    flag_columns = (
        "has_fiber",
        "has_cable",
        "has_fixed_wireless",
        "has_copper",
        "has_100_20",
        "has_business",
        "has_business_fiber",
        "has_business_100_20",
    )
    aggregations = {column: (column, "max") for column in flag_columns}
    aggregations["max_down"] = ("max_down", "max")
    aggregations["max_up"] = ("max_up", "max")
    aggregations["matched_location_rows"] = ("matched_location_rows", "sum")

    grouped = matched.groupby(["geoid", "provider_id_num"], as_index=False).agg(**aggregations)
    return grouped, len(matched)
|
||||
|
||||
|
||||
def upsert_detail(
    cur,
    as_of_date: date,
    file_id: int,
    geography_type: str,
    grouped: pd.DataFrame,
) -> int:
    """Merge one chunk's grouped provider rows into ``DETAIL_TABLE``.

    On a key conflict the boolean flags are OR-ed, the advertised speeds keep
    the greater value and ``matched_location_rows`` is accumulated, so calling
    this once per CSV chunk builds up per-file totals.

    Returns:
        The number of rows sent to the database (0 for an empty frame).
    """
    if grouped.empty:
        return 0

    def optional_float(value):
        # Missing speeds are stored as SQL NULL rather than NaN.
        return None if pd.isna(value) else float(value)

    values = []
    for row in grouped.itertuples(index=False):
        values.append(
            (
                as_of_date,
                file_id,
                geography_type,
                row.geoid,
                int(row.provider_id_num),
                bool(row.has_fiber),
                bool(row.has_cable),
                bool(row.has_fixed_wireless),
                bool(row.has_copper),
                bool(row.has_100_20),
                bool(row.has_business),
                bool(row.has_business_fiber),
                bool(row.has_business_100_20),
                optional_float(row.max_down),
                optional_float(row.max_up),
                int(row.matched_location_rows),
            )
        )

    upsert_sql = f"""
        insert into {DETAIL_TABLE} (
            as_of_date, file_id, geography_type, geoid, provider_id,
            has_fiber, has_cable, has_fixed_wireless, has_copper,
            has_100_20, has_business, has_business_fiber, has_business_100_20,
            max_advertised_download_mbps, max_advertised_upload_mbps,
            matched_location_rows
        )
        values %s
        on conflict (as_of_date, file_id, geography_type, geoid, provider_id)
        do update set
            has_fiber = {DETAIL_TABLE}.has_fiber or excluded.has_fiber,
            has_cable = {DETAIL_TABLE}.has_cable or excluded.has_cable,
            has_fixed_wireless = {DETAIL_TABLE}.has_fixed_wireless or excluded.has_fixed_wireless,
            has_copper = {DETAIL_TABLE}.has_copper or excluded.has_copper,
            has_100_20 = {DETAIL_TABLE}.has_100_20 or excluded.has_100_20,
            has_business = {DETAIL_TABLE}.has_business or excluded.has_business,
            has_business_fiber = {DETAIL_TABLE}.has_business_fiber or excluded.has_business_fiber,
            has_business_100_20 = {DETAIL_TABLE}.has_business_100_20 or excluded.has_business_100_20,
            max_advertised_download_mbps = greatest(
                {DETAIL_TABLE}.max_advertised_download_mbps,
                excluded.max_advertised_download_mbps
            ),
            max_advertised_upload_mbps = greatest(
                {DETAIL_TABLE}.max_advertised_upload_mbps,
                excluded.max_advertised_upload_mbps
            ),
            matched_location_rows = {DETAIL_TABLE}.matched_location_rows + excluded.matched_location_rows,
            updated_at = now()
        """
    execute_values(cur, upsert_sql, values, page_size=1000)
    return len(values)
|
||||
|
||||
|
||||
def process_file(
    conn,
    file_row: dict[str, Any],
    as_of_date: date,
    county_geoids: set[str],
    tract_geoids: set[str],
    chunksize: int,
    temp_dir: Path,
) -> tuple[int, int]:
    """Download one FCC coverage file, aggregate it chunk by chunk, and record progress.

    Existing detail and progress rows for the file are deleted first, each CSV
    chunk is committed separately, and the progress row is written only after
    the last chunk. The downloaded zip is removed whether or not processing
    succeeds.

    Returns:
        ``(matched_rows, provider_geo_rows)``: matched row-events summed over
        the county and tract passes, and the number of detail rows upserted.

    Raises:
        RuntimeError: if the archive contains no CSV member.
    """
    file_id = file_row["file_id"]
    zip_path = download_file(file_id, temp_dir)
    matched_rows = 0
    provider_geo_rows = 0
    file_key = (as_of_date, file_id)
    targets = (("County", county_geoids), ("Tract", tract_geoids))

    try:
        with zipfile.ZipFile(zip_path) as archive:
            csv_names = [name for name in archive.namelist() if name.lower().endswith(".csv")]
            if not csv_names:
                raise RuntimeError(f"FCC file_id={file_id} did not contain a CSV: {archive.namelist()}")

            # Only the first CSV member of the archive is read.
            with archive.open(csv_names[0]) as csv_file:
                reader = pd.read_csv(
                    csv_file,
                    usecols=CSV_USECOLS,
                    dtype="string",
                    chunksize=chunksize,
                    low_memory=False,
                )

                # Start from a clean slate so a re-run cannot double-count.
                with conn.cursor() as cur:
                    for table in (DETAIL_TABLE, PROGRESS_TABLE):
                        cur.execute(
                            f"delete from {table} where as_of_date = %s and file_id = %s",
                            file_key,
                        )
                conn.commit()

                for chunk_number, chunk in enumerate(reader, start=1):
                    chunk["block_geoid_norm"] = normalize_block_geoid(chunk["block_geoid"])

                    # Summarise both geography levels before touching the database.
                    summaries = [
                        (geography_type, *summarize_matches(chunk, geography_type, geoids))
                        for geography_type, geoids in targets
                    ]

                    with conn.cursor() as cur:
                        for geography_type, grouped, _ in summaries:
                            provider_geo_rows += upsert_detail(cur, as_of_date, file_id, geography_type, grouped)
                    conn.commit()

                    matched_rows += sum(matches for _, _, matches in summaries)
                    if matched_rows and chunk_number % 10 == 0:
                        print(f"  file_id={file_id}: chunk {chunk_number:,}, matched row-events={matched_rows:,}")

        progress_sql = f"""
            insert into {PROGRESS_TABLE} (
                as_of_date, file_id, state_fips, technology_code,
                technology_code_desc, record_count, matched_location_rows, provider_geo_rows
            )
            values (%s, %s, %s, %s, %s, %s, %s, %s)
            on conflict (as_of_date, file_id) do update set
                state_fips = excluded.state_fips,
                technology_code = excluded.technology_code,
                technology_code_desc = excluded.technology_code_desc,
                record_count = excluded.record_count,
                matched_location_rows = excluded.matched_location_rows,
                provider_geo_rows = excluded.provider_geo_rows,
                processed_at = now()
            """
        progress_params = (
            as_of_date,
            file_id,
            file_row["state_fips"],
            file_row["technology_code"],
            file_row["technology_code_desc"],
            file_row["record_count"],
            matched_rows,
            provider_geo_rows,
        )
        with conn.cursor() as cur:
            cur.execute(progress_sql, progress_params)
        conn.commit()
        return matched_rows, provider_geo_rows
    finally:
        zip_path.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def rebuild_aggregate(cur, as_of_date: date) -> int:
    """Recompute ``AGG_TABLE`` for ``as_of_date`` from ``DETAIL_TABLE``.

    Detail rows are first collapsed to one row per geography and provider
    across files, then counted per geography.

    Returns:
        The cursor's ``rowcount`` after the insert.
    """
    delete_sql = f"delete from {AGG_TABLE} where as_of_date = %s"
    insert_sql = f"""
        insert into {AGG_TABLE} (
            as_of_date, geography_type, geoid,
            provider_count, fiber_provider_count, cable_provider_count,
            fixed_wireless_provider_count, copper_provider_count,
            provider_100_20_count, business_provider_count,
            business_fiber_provider_count, business_100_20_provider_count,
            max_advertised_download_mbps, max_advertised_upload_mbps,
            matched_location_rows, provider_file_rows
        )
        with per_provider as (
            select
                as_of_date,
                geography_type,
                geoid,
                provider_id,
                bool_or(has_fiber) as has_fiber,
                bool_or(has_cable) as has_cable,
                bool_or(has_fixed_wireless) as has_fixed_wireless,
                bool_or(has_copper) as has_copper,
                bool_or(has_100_20) as has_100_20,
                bool_or(has_business) as has_business,
                bool_or(has_business_fiber) as has_business_fiber,
                bool_or(has_business_100_20) as has_business_100_20,
                max(max_advertised_download_mbps) as max_advertised_download_mbps,
                max(max_advertised_upload_mbps) as max_advertised_upload_mbps,
                sum(matched_location_rows) as matched_location_rows,
                count(*) as provider_file_rows
            from {DETAIL_TABLE}
            where as_of_date = %s
            group by 1, 2, 3, 4
        )
        select
            as_of_date,
            geography_type,
            geoid,
            count(*)::integer as provider_count,
            count(*) filter (where has_fiber)::integer as fiber_provider_count,
            count(*) filter (where has_cable)::integer as cable_provider_count,
            count(*) filter (where has_fixed_wireless)::integer as fixed_wireless_provider_count,
            count(*) filter (where has_copper)::integer as copper_provider_count,
            count(*) filter (where has_100_20)::integer as provider_100_20_count,
            count(*) filter (where has_business)::integer as business_provider_count,
            count(*) filter (where has_business_fiber)::integer as business_fiber_provider_count,
            count(*) filter (where has_business_100_20)::integer as business_100_20_provider_count,
            max(max_advertised_download_mbps) as max_advertised_download_mbps,
            max(max_advertised_upload_mbps) as max_advertised_upload_mbps,
            sum(matched_location_rows)::bigint as matched_location_rows,
            sum(provider_file_rows)::bigint as provider_file_rows
        from per_provider
        group by 1, 2, 3
        """
    date_param = (as_of_date,)
    cur.execute(delete_sql, date_param)
    cur.execute(insert_sql, date_param)
    return cur.rowcount
|
||||
|
||||
|
||||
def update_connection_table(cur, as_of_date: date) -> int:
    """Copy county and tract provider aggregates onto ``CONNECTION_TABLE`` rows.

    Each connection row is joined to the aggregates for ``as_of_date`` via its
    tract GEOID (or the crosswalk's FCC GEOID when one exists). Only rows with
    a tract or county match are updated; the unprefixed ``fcc_*`` columns
    prefer tract values and fall back to county values.

    Returns:
        The cursor's ``rowcount`` after the update.
    """
    update_sql = f"""
        with joined as (
            select
                c.master_id,
                coalesce(x.fcc_geoid, left(c.census_tract_geoid, 11)) as provider_tract_geoid,
                coalesce(left(x.fcc_geoid, 5), left(c.census_tract_geoid, 5)) as provider_county_geoid,
                county.geoid as county_geoid,
                tract.geoid as tract_geoid,
                county.provider_count as county_provider_count,
                county.fiber_provider_count as county_fiber_provider_count,
                county.cable_provider_count as county_cable_provider_count,
                county.fixed_wireless_provider_count as county_fixed_wireless_provider_count,
                county.provider_100_20_count as county_100_20_provider_count,
                county.business_provider_count as county_business_provider_count,
                county.business_fiber_provider_count as county_business_fiber_provider_count,
                county.business_100_20_provider_count as county_business_100_20_provider_count,
                county.max_advertised_download_mbps as county_max_down,
                county.max_advertised_upload_mbps as county_max_up,
                tract.provider_count as tract_provider_count,
                tract.fiber_provider_count as tract_fiber_provider_count,
                tract.cable_provider_count as tract_cable_provider_count,
                tract.fixed_wireless_provider_count as tract_fixed_wireless_provider_count,
                tract.provider_100_20_count as tract_100_20_provider_count,
                tract.business_provider_count as tract_business_provider_count,
                tract.business_fiber_provider_count as tract_business_fiber_provider_count,
                tract.business_100_20_provider_count as tract_business_100_20_provider_count,
                tract.max_advertised_download_mbps as tract_max_down,
                tract.max_advertised_upload_mbps as tract_max_up
            from {CONNECTION_TABLE} c
            left join {CROSSWALK_TABLE} x
                on x.source_geography_type = 'Tract'
                and x.fcc_geography_type = 'Tract'
                and x.source_geoid = c.census_tract_geoid
            left join {AGG_TABLE} county
                on county.as_of_date = %s
                and county.geography_type = 'County'
                and county.geoid = coalesce(left(x.fcc_geoid, 5), left(c.census_tract_geoid, 5))
            left join {AGG_TABLE} tract
                on tract.as_of_date = %s
                and tract.geography_type = 'Tract'
                and tract.geoid = coalesce(x.fcc_geoid, left(c.census_tract_geoid, 11))
        )
        update {CONNECTION_TABLE} c
        set
            fcc_provider_geography_type = case
                when j.tract_geoid is not null then 'Tract'
                when j.county_geoid is not null then 'County'
                else c.fcc_provider_geography_type
            end,
            fcc_provider_geoid = coalesce(j.tract_geoid, j.county_geoid, c.fcc_provider_geoid),
            fcc_provider_count = coalesce(j.tract_provider_count, j.county_provider_count),
            fcc_fiber_provider_count = coalesce(j.tract_fiber_provider_count, j.county_fiber_provider_count),
            fcc_cable_provider_count = coalesce(j.tract_cable_provider_count, j.county_cable_provider_count),
            fcc_fixed_wireless_provider_count = coalesce(j.tract_fixed_wireless_provider_count, j.county_fixed_wireless_provider_count),
            fcc_100_20_provider_count = coalesce(j.tract_100_20_provider_count, j.county_100_20_provider_count),
            fcc_max_advertised_download_mbps = coalesce(j.tract_max_down, j.county_max_down, c.fcc_max_advertised_download_mbps),
            fcc_max_advertised_upload_mbps = coalesce(j.tract_max_up, j.county_max_up, c.fcc_max_advertised_upload_mbps),
            fcc_county_provider_count = j.county_provider_count,
            fcc_county_fiber_provider_count = j.county_fiber_provider_count,
            fcc_county_cable_provider_count = j.county_cable_provider_count,
            fcc_county_fixed_wireless_provider_count = j.county_fixed_wireless_provider_count,
            fcc_county_100_20_provider_count = j.county_100_20_provider_count,
            fcc_county_business_provider_count = j.county_business_provider_count,
            fcc_county_business_fiber_provider_count = j.county_business_fiber_provider_count,
            fcc_county_business_100_20_provider_count = j.county_business_100_20_provider_count,
            fcc_county_max_advertised_download_mbps = j.county_max_down,
            fcc_county_max_advertised_upload_mbps = j.county_max_up,
            fcc_tract_provider_count = j.tract_provider_count,
            fcc_tract_fiber_provider_count = j.tract_fiber_provider_count,
            fcc_tract_cable_provider_count = j.tract_cable_provider_count,
            fcc_tract_fixed_wireless_provider_count = j.tract_fixed_wireless_provider_count,
            fcc_tract_100_20_provider_count = j.tract_100_20_provider_count,
            fcc_tract_business_provider_count = j.tract_business_provider_count,
            fcc_tract_business_fiber_provider_count = j.tract_business_fiber_provider_count,
            fcc_tract_business_100_20_provider_count = j.tract_business_100_20_provider_count,
            fcc_tract_max_advertised_download_mbps = j.tract_max_down,
            fcc_tract_max_advertised_upload_mbps = j.tract_max_up,
            fcc_summary_json = jsonb_set(
                coalesce(c.fcc_summary_json, '{{}}'::jsonb),
                '{{location_provider_aggregate}}',
                jsonb_build_object(
                    'source', 'fcc_state_location_coverage',
                    'as_of_date', %s::text,
                    'preferred_geography_type', case
                        when j.tract_geoid is not null then 'Tract'
                        when j.county_geoid is not null then 'County'
                        else null
                    end,
                    'preferred_geoid', coalesce(j.tract_geoid, j.county_geoid),
                    'county_geoid', j.county_geoid,
                    'tract_geoid', j.tract_geoid
                ),
                true
            ),
            fcc_bdc_status = case
                when coalesce(j.tract_geoid, j.county_geoid) is not null then 'fcc_location_provider_joined'
                else c.fcc_bdc_status
            end,
            updated_at = now()
        from joined j
        where c.master_id = j.master_id
          and coalesce(j.tract_geoid, j.county_geoid) is not null
        """
    # Placeholders, in order: county join date, tract join date, JSON summary date.
    cur.execute(update_sql, (as_of_date,) * 3)
    return cur.rowcount
|
||||
|
||||
|
||||
def main() -> int:
    """Load FCC location-coverage files and refresh the provider aggregates.

    Flow, as implemented below:
      1. Parse CLI options, load secrets and check the PGWEB_* variables.
      2. Setup transaction: create tables, seed the GEOID crosswalk, resolve
         the as-of date, pick target geographies and catalog files. With
         ``--refresh`` the detail, aggregate and progress rows for that date
         are deleted first.
      3. Process each selected file, skipping files already recorded as done
         unless ``--no-resume`` is given.
      4. Rebuild the aggregate table and, unless ``--no-update-connection``
         is given, update the connection table; then commit.

    Returns:
        0 on success (used as the process exit status by the caller).

    Raises:
        RuntimeError: if ``--as-of-date`` cannot be parsed, or if no target
            states are found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--as-of-date", help="FCC availability as-of date; defaults to latest catalog date.")
    parser.add_argument("--states", nargs="*", help="Optional state FIPS list, e.g. 11 34 51.")
    parser.add_argument("--technology-codes", nargs="*", default=list(TERRESTRIAL_TECHNOLOGY_CODES))
    parser.add_argument("--limit-files", type=int, help="Process only the first N matching files.")
    parser.add_argument("--chunksize", type=int, default=500_000)
    parser.add_argument("--refresh", action="store_true", help="Delete existing location-provider rows for this as-of date first.")
    parser.add_argument("--no-resume", action="store_true", help="Reprocess files even if marked complete.")
    parser.add_argument("--no-update-connection", action="store_true", help="Build aggregate tables but do not update data_center_broadband_connection.")
    args = parser.parse_args()

    # NOTE(review): presumably populates os.environ from a zsh config file and
    # then fails fast if any connection variable is still missing -- confirm
    # against the helper definitions.
    load_zsh_secrets()
    require_env(["PGWEB_HOST", "PGWEB_PORT", "PGWEB_USER", "PGWEB_PASSWORD"])

    # A supplied-but-unparseable date is an error; an omitted date is resolved
    # from the catalog further down.
    as_of_date = parse_date(args.as_of_date) if args.as_of_date else None
    if as_of_date is None and args.as_of_date:
        raise RuntimeError(f"Invalid --as-of-date: {args.as_of_date}")

    technology_codes = normalize_codes(args.technology_codes)
    # State FIPS codes are left-padded to two digits ("6" -> "06").
    requested_states = tuple(s.zfill(2) for s in args.states) if args.states else None

    with get_conn() as conn:
        # Setup phase: everything up to the commit below runs in one transaction.
        with conn.cursor() as cur:
            create_tables(cur)
            seed_geoid_crosswalk(cur)
            as_of_date = as_of_date or latest_catalog_date(cur)
            states, counties, tracts = target_geographies(cur, requested_states)
            if not states:
                raise RuntimeError("No target data-center states found.")
            if args.refresh:
                # Clearing the progress table as well makes the resume check
                # below treat every file as unprocessed.
                cur.execute(f"delete from {DETAIL_TABLE} where as_of_date = %s", (as_of_date,))
                cur.execute(f"delete from {AGG_TABLE} where as_of_date = %s", (as_of_date,))
                cur.execute(f"delete from {PROGRESS_TABLE} where as_of_date = %s", (as_of_date,))
            files = catalog_files(cur, as_of_date, states, technology_codes, args.limit_files)
        conn.commit()

        print(f"FCC as_of_date: {as_of_date}")
        print(f"Target states: {len(states):,} | counties: {len(counties):,} | tracts: {len(tracts):,}")
        print(f"Location coverage files selected: {len(files):,}")

        # Totals count only files processed in this run; skipped files add nothing.
        total_matched_rows = 0
        total_provider_geo_rows = 0
        with tempfile.TemporaryDirectory(prefix="fcc_bdc_location_") as temp:
            temp_dir = Path(temp)
            for idx, file_row in enumerate(files, start=1):
                file_id = file_row["file_id"]
                with conn.cursor() as cur:
                    # --no-resume short-circuits the progress lookup entirely.
                    skip = (not args.no_resume) and progress_done(cur, as_of_date, file_id)
                if skip:
                    print(f"[{idx:,}/{len(files):,}] skip file_id={file_id} already processed")
                    continue

                print(
                    f"[{idx:,}/{len(files):,}] file_id={file_id} state={file_row['state_fips']} "
                    f"tech={file_row['technology_code']} records={file_row['record_count']:,}"
                )
                matched_rows, provider_geo_rows = process_file(
                    conn,
                    file_row,
                    as_of_date,
                    counties,
                    tracts,
                    args.chunksize,
                    temp_dir,
                )
                total_matched_rows += matched_rows
                total_provider_geo_rows += provider_geo_rows
                print(
                    f" complete file_id={file_id}: matched row-events={matched_rows:,}, "
                    f"provider-geography rows={provider_geo_rows:,}"
                )

        # Final phase: the aggregate is rebuilt for the as-of date on every run,
        # even when every file was skipped.
        with conn.cursor() as cur:
            agg_rows = rebuild_aggregate(cur, as_of_date)
            updated_rows = 0
            if not args.no_update_connection:
                updated_rows = update_connection_table(cur, as_of_date)
        conn.commit()

    print(f"New matched row-events this run: {total_matched_rows:,}")
    print(f"New provider-geography detail rows this run: {total_provider_geo_rows:,}")
    print(f"{AGG_TABLE}: rebuilt {agg_rows:,} geography rows")
    if not args.no_update_connection:
        print(f"{CONNECTION_TABLE}: updated {updated_rows:,} rows with location-provider aggregates")
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s integer return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
258
scripts/build_master_data_centers.py
Normal file
258
scripts/build_master_data_centers.py
Normal file
@@ -0,0 +1,258 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Build (or refresh) public.master_data_centers by merging:
|
||||
- public.us_dc_sample_geocoded (curated, attribute-rich)
|
||||
- public.osm_data_centers (OpenStreetMap features)
|
||||
|
||||
Deduplication rule (curated row wins):
|
||||
Step 1: for each curated row, find a matching OSM row by
|
||||
curated.id = osm.osm_id::text OR
|
||||
curated.nominatim_osm_id = osm.osm_id OR
|
||||
ST_DWithin(curated.geom, osm.geom, 150 m, geography)
|
||||
(closest match by sphere distance when multiple).
|
||||
Step 2: insert every curated row into master, filling NULLs from the
|
||||
matched OSM row when present. source = 'merged' if matched,
|
||||
otherwise 'curated'.
|
||||
Step 3: insert every OSM row whose osm_id was NOT matched in Step 1.
|
||||
source = 'osm'.
|
||||
|
||||
Result: every curated row appears once; OSM-only rows appear once; no row is
|
||||
emitted twice. The merge logic lives in a SQL function
|
||||
public.refresh_master_data_centers() so subsequent refreshes are one call.
|
||||
"""
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
import psycopg2
|
||||
|
||||
# Database that holds both input tables and the merged output table.
DB_NAME = "data_centers"
# Output table; it is truncated and repopulated on every refresh.
MASTER_TABLE = "public.master_data_centers"
# Curated input; its rows win over OSM values when both are present.
CURATED_TABLE = "public.us_dc_sample_geocoded"
# OpenStreetMap input; used to fill NULLs and to add unmatched features.
OSM_TABLE = "public.osm_data_centers"
# Default radius, in metres, for the spatial curated-to-OSM match.
MATCH_RADIUS_M = 150
|
||||
|
||||
|
||||
CREATE_TABLE_SQL = f"""
|
||||
create table if not exists {MASTER_TABLE} (
|
||||
master_id text primary key,
|
||||
source text not null check (source in ('curated','osm','merged')),
|
||||
curated_id text,
|
||||
osm_id text,
|
||||
name text,
|
||||
operator text,
|
||||
street_address text,
|
||||
city text,
|
||||
state text,
|
||||
postal_code text,
|
||||
country text,
|
||||
website text,
|
||||
phone text,
|
||||
power_mw numeric,
|
||||
area_sqft integer,
|
||||
nearest_airport_miles numeric,
|
||||
has_bare_metal boolean,
|
||||
has_iaas boolean,
|
||||
has_internet_exchange boolean,
|
||||
has_colocation boolean,
|
||||
certifications text,
|
||||
content_summary text,
|
||||
osm_tags jsonb,
|
||||
matched_osm_tag_passes text[],
|
||||
match_method text,
|
||||
match_distance_m numeric,
|
||||
longitude double precision not null,
|
||||
latitude double precision not null,
|
||||
geom geometry(Point, 4326)
|
||||
generated always as (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
|
||||
);
|
||||
create index if not exists master_data_centers_geom_gix on {MASTER_TABLE} using gist (geom);
|
||||
create index if not exists master_data_centers_source_idx on {MASTER_TABLE} (source);
|
||||
create index if not exists master_data_centers_state_idx on {MASTER_TABLE} (state);
|
||||
create index if not exists master_data_centers_curated_idx on {MASTER_TABLE} (curated_id);
|
||||
create index if not exists master_data_centers_osm_idx on {MASTER_TABLE} (osm_id);
|
||||
"""
|
||||
|
||||
|
||||
# Definition of public.refresh_master_data_centers(match_radius_m).
#
# The function truncates the master table and rebuilds it in three steps:
#   1. pick at most one OSM match per curated row (id equality first, then
#      nominatim id, then the nearest feature inside the radius);
#   2. insert every curated row, filling NULL descriptive columns from the
#      matched OSM row (source = 'merged' when matched, else 'curated');
#   3. insert every OSM row that no curated row claimed (source = 'osm').
# It returns one row of counts: curated-only, merged, osm-only and total.
#
# The scratch table is dropped explicitly before it is created. It is declared
# "on commit drop", so without the drop a second call inside the same
# transaction would fail with "relation already exists". The drop is qualified
# with pg_temp so it can never hit a permanent table of the same name.
REFRESH_FUNCTION_SQL = f"""
create or replace function public.refresh_master_data_centers(match_radius_m double precision default {MATCH_RADIUS_M})
returns table (curated_rows bigint, merged_rows bigint, osm_only_rows bigint, total_rows bigint)
language plpgsql
as $$
begin
    truncate table {MASTER_TABLE};

    -- allow repeated calls within one transaction
    drop table if exists pg_temp._curated_to_osm;

    -- pick a single best OSM match for each curated row, prioritizing ID
    -- equality, then nominatim id, then closest within radius
    create temporary table _curated_to_osm on commit drop as
    with ranked as (
        select
            c.id as curated_id,
            o.id as osm_id,
            case
                when c.id = o.osm_id::text then 'id'
                when c.nominatim_osm_id = o.osm_id then 'nominatim_id'
                else 'spatial'
            end as method,
            ST_DistanceSphere(c.geom, o.geom) as dist_m,
            row_number() over (
                partition by c.id
                order by
                    case
                        when c.id = o.osm_id::text then 0
                        when c.nominatim_osm_id = o.osm_id then 1
                        else 2
                    end,
                    ST_DistanceSphere(c.geom, o.geom) asc
            ) as rn
        from {CURATED_TABLE} c
        join {OSM_TABLE} o
            on c.id = o.osm_id::text
            or c.nominatim_osm_id = o.osm_id
            or ST_DWithin(c.geom::geography, o.geom::geography, match_radius_m)
    )
    select curated_id, osm_id, method, dist_m
    from ranked
    where rn = 1;

    -- Step 1+2: insert curated rows (with OSM nulls filled where matched)
    insert into {MASTER_TABLE} (
        master_id, source, curated_id, osm_id,
        name, operator, street_address, city, state, postal_code, country,
        website, phone, power_mw, area_sqft, nearest_airport_miles,
        has_bare_metal, has_iaas, has_internet_exchange, has_colocation,
        certifications, content_summary,
        osm_tags, matched_osm_tag_passes,
        match_method, match_distance_m,
        longitude, latitude
    )
    select
        'curated/' || c.id,
        case when m.osm_id is not null then 'merged' else 'curated' end,
        c.id,
        m.osm_id,
        coalesce(c.facility_name, o.name),
        coalesce(c.provider, o.operator),
        coalesce(c.street_address, o.street_address),
        coalesce(c.city, o.city),
        coalesce(c.state_code, o.state),
        coalesce(c.postal_code, o.postal_code),
        coalesce(c.country, o.country),
        coalesce(c.url, o.website),
        coalesce(c.phone, o.phone),
        c.power_mw,
        c.area_sqft,
        c.nearest_airport_miles,
        c.has_bare_metal,
        c.has_iaas,
        c.has_internet_exchange,
        c.has_colocation,
        c.certifications,
        c.content_summary,
        o.tags,
        o.matched_tags,
        m.method,
        round(m.dist_m::numeric, 2),
        c.longitude,
        c.latitude
    from {CURATED_TABLE} c
    left join _curated_to_osm m on m.curated_id = c.id
    left join {OSM_TABLE} o on o.id = m.osm_id;

    -- Step 3: insert OSM rows that no curated row claimed
    insert into {MASTER_TABLE} (
        master_id, source, curated_id, osm_id,
        name, operator, street_address, city, state, postal_code, country,
        website, phone,
        osm_tags, matched_osm_tag_passes,
        longitude, latitude
    )
    select
        'osm/' || o.id,
        'osm',
        null,
        o.id,
        o.name,
        o.operator,
        o.street_address,
        o.city,
        o.state,
        o.postal_code,
        o.country,
        o.website,
        o.phone,
        o.tags,
        o.matched_tags,
        o.longitude,
        o.latitude
    from {OSM_TABLE} o
    where not exists (
        select 1 from _curated_to_osm m where m.osm_id = o.id
    );

    analyze {MASTER_TABLE};

    return query
    select
        count(*) filter (where source = 'curated'),
        count(*) filter (where source = 'merged'),
        count(*) filter (where source = 'osm'),
        count(*)
    from {MASTER_TABLE};
end;
$$;
"""
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line for the master-table build.

    Options:
        --radius-m: spatial match radius in metres (float, defaults to
            MATCH_RADIUS_M).
        --recreate: drop and recreate the master table before building.

    Returns:
        The parsed argparse namespace (``radius_m``, ``recreate``).
    """
    cli = argparse.ArgumentParser(description=__doc__)

    radius_help = f"Spatial match radius in meters (default: {MATCH_RADIUS_M})."
    cli.add_argument("--radius-m", type=float, default=MATCH_RADIUS_M, help=radius_help)

    recreate_help = f"Drop and recreate {MASTER_TABLE} before building."
    cli.add_argument("--recreate", action="store_true", help=recreate_help)

    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Create the master table if needed, refresh it and print row counts.

    All statements run in a single transaction: it commits when the ``with``
    block exits normally and rolls back on error. The connection is closed
    in either case.

    Returns:
        0 on success.
    """
    options = parse_args()

    env = os.environ
    connection = psycopg2.connect(
        host=env["PGWEB_HOST"],
        port=env["PGWEB_PORT"],
        user=env["PGWEB_USER"],
        password=env["PGWEB_PASSWORD"],
        dbname=DB_NAME,
    )
    try:
        with connection, connection.cursor() as cursor:
            cursor.execute("create extension if not exists postgis")
            if args_recreate := options.recreate:
                cursor.execute(f"drop table if exists {MASTER_TABLE} cascade")
            # Table DDL first, then the refresh function that populates it.
            for statement in (CREATE_TABLE_SQL, REFRESH_FUNCTION_SQL):
                cursor.execute(statement)
            cursor.execute(
                "select * from public.refresh_master_data_centers(%s)",
                (options.radius_m,),
            )
            curated, merged, osm_only, total = cursor.fetchone()
    finally:
        connection.close()

    summary_lines = (
        f"master_data_centers refreshed (radius={options.radius_m} m):",
        f" curated-only rows: {curated}",
        f" merged rows (curated + OSM): {merged}",
        f" osm-only rows: {osm_only}",
        f" total: {total}",
    )
    for line in summary_lines:
        print(line)
    return 0
|
||||
|
||||
|
||||
# Script entry point: exit with main()'s return value.
if __name__ == "__main__":
    sys.exit(main())
|
||||
184
scripts/build_watershed_huc8_tables.py
Normal file
184
scripts/build_watershed_huc8_tables.py
Normal file
@@ -0,0 +1,184 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg2
|
||||
|
||||
|
||||
# Target database for the ogr2ogr import and the derived tables.
DB_NAME = "data_centers"
# Census tract polygons that are intersected with the HUC8 watersheds.
TRACT_TABLE = "public.data_center_census_tracts_2024"
# Raw ogr2ogr import target; overwritten on every import.
STAGE_TABLE = "public._watershed_huc8_stage"
# De-duplicated watershed table (one row per huc8), rebuilt from the stage.
HUC8_TABLE = "public.watershed_huc8"
# Tract-to-watershed overlap table, rebuilt from the two tables above.
LINK_TABLE = "public.census_tract_huc8_link"
|
||||
|
||||
|
||||
def connect():
    """Open a psycopg2 connection to DB_NAME using the PGWEB_* variables.

    Raises:
        KeyError: if any of PGWEB_HOST, PGWEB_PORT, PGWEB_USER or
            PGWEB_PASSWORD is missing from the environment.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": DB_NAME,
    }
    return psycopg2.connect(**settings)
|
||||
|
||||
|
||||
def import_huc8_shapefile(shapefile_path):
    """Load a HUC8 shapefile into STAGE_TABLE with ogr2ogr.

    The layer is reprojected to EPSG:4326, promoted to MULTIPOLYGON and
    written with geometry column ``geom`` and FID column ``gid``. Any existing
    stage table is overwritten.

    Connection settings are handed to ogr2ogr through the libpq ``PG*``
    environment variables rather than the command line. This keeps the
    password out of the process argument list, where other local users can
    read it, and stops values containing spaces or quotes from corrupting
    the connection string.

    Args:
        shapefile_path: pathlib.Path of the ``.shp`` file to import.

    Raises:
        KeyError: if a PGWEB_* variable is missing from the environment.
        subprocess.CalledProcessError: if ogr2ogr exits with a non-zero status.
    """
    env = os.environ.copy()
    env.update(
        {
            "PGHOST": os.environ["PGWEB_HOST"],
            "PGPORT": os.environ["PGWEB_PORT"],
            "PGUSER": os.environ["PGWEB_USER"],
            "PGPASSWORD": os.environ["PGWEB_PASSWORD"],
        }
    )
    source = str(shapefile_path.resolve())

    cmd = [
        "ogr2ogr",
        "-f",
        "PostgreSQL",
        f"PG:dbname={DB_NAME}",
        source,
        "-nln",
        STAGE_TABLE,
        "-nlt",
        "MULTIPOLYGON",
        "-t_srs",
        "EPSG:4326",
        "-lco",
        "GEOMETRY_NAME=geom",
        "-lco",
        "FID=gid",
        "-lco",
        "PRECISION=NO",
        "-unsetFieldWidth",
        # NOTE: -skipfailures makes ogr2ogr continue past features that fail
        # to load, so a successful exit does not guarantee a complete import.
        "-skipfailures",
        "-overwrite",
    ]

    subprocess.run(cmd, check=True, env=env)
|
||||
|
||||
|
||||
def build_final_tables(conn):
    """Rebuild HUC8_TABLE and LINK_TABLE from the staged shapefile rows.

    Both tables are dropped and recreated inside one ``with conn``
    transaction, so either both rebuilds are committed or neither is.

    Args:
        conn: open database connection; STAGE_TABLE and TRACT_TABLE must
            already exist.
    """
    with conn:
        with conn.cursor() as cur:
            cur.execute(f"drop table if exists {HUC8_TABLE}")
            # One row per huc8: "distinct on" together with the order-by keeps
            # the row with the latest loaddate (NULL loaddates sort last).
            # Truncated stage column names are aliased to longer names.
            cur.execute(
                f"""
                create table {HUC8_TABLE} as
                select distinct on (huc8)
                    huc8,
                    name,
                    states,
                    areaacres,
                    areasqkm,
                    loaddate,
                    sourceorig as sourceoriginator,
                    sourcedata as sourcedatadesc,
                    sourcefeat as sourcefeatureid,
                    metasource as metasourceid,
                    tnmid,
                    geom::geometry(MultiPolygon, 4326) as geom
                from {STAGE_TABLE}
                where huc8 is not null
                order by huc8, loaddate desc nulls last
                """
            )
            cur.execute(f"alter table {HUC8_TABLE} add primary key (huc8)")
            cur.execute(
                f"create index watershed_huc8_geom_gix on {HUC8_TABLE} using gist (geom)"
            )
            cur.execute(
                f"create index watershed_huc8_states_idx on {HUC8_TABLE} (states)"
            )

            cur.execute(f"drop table if exists {LINK_TABLE}")
            # One row per intersecting (tract, watershed) pair. Areas are
            # computed on geography, i.e. in square metres. Pairs whose
            # overlap area is zero (touching boundaries only) are dropped.
            # tract_overlap_pct is a 0-1 fraction of the tract's area and is
            # NULL when the tract area is zero.
            cur.execute(
                f"""
                create table {LINK_TABLE} as
                select
                    geoid,
                    huc8,
                    overlap_sqm,
                    overlap_sqm / 1000000.0 as overlap_sqkm,
                    overlap_sqm / nullif(tract_sqm, 0.0) as tract_overlap_pct
                from (
                    select
                        tr.geoid,
                        wh.huc8,
                        st_area(
                            st_intersection(
                                tr.geom::geography,
                                wh.geom::geography
                            )
                        ) as overlap_sqm,
                        st_area(tr.geom::geography) as tract_sqm
                    from {TRACT_TABLE} tr
                    join {HUC8_TABLE} wh
                        on st_intersects(tr.geom, wh.geom)
                ) as overlap_rows
                where overlap_sqm > 0
                """
            )
            cur.execute(
                f"create index census_tract_huc8_link_geoid_idx on {LINK_TABLE} (geoid)"
            )
            cur.execute(
                f"create index census_tract_huc8_link_huc8_idx on {LINK_TABLE} (huc8)"
            )

            # Refresh planner statistics for the stage and both rebuilt tables.
            cur.execute(f"analyze {STAGE_TABLE}")
            cur.execute(f"analyze {HUC8_TABLE}")
            cur.execute(f"analyze {LINK_TABLE}")
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command line for the HUC8 build.

    Options:
        --shapefile: path to the HUC8 shapefile (default
            ``HUC8_CONUS/HUC8_US.shp``).
        --build-only: skip the import and rebuild from existing stage data.

    Returns:
        The parsed argparse namespace (``shapefile``, ``build_only``).
    """
    summary = (
        "Build watershed HUC8 boundaries and GEOID linkage tables from "
        "a local HUC8 shapefile."
    )
    cli = argparse.ArgumentParser(description=summary)
    cli.add_argument(
        "--shapefile",
        default="HUC8_CONUS/HUC8_US.shp",
        help="Path to the HUC8 shapefile to import.",
    )
    cli.add_argument(
        "--build-only",
        action="store_true",
        help="Skip imports and rebuild final/link tables from existing stage data.",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def main():
    """Import the HUC8 shapefile (unless --build-only) and rebuild the tables.

    Prints the source path and the row counts of the watershed and link
    tables when finished.

    Raises:
        FileNotFoundError: if an import is requested and the shapefile is
            missing.
    """
    args = parse_args()
    shapefile_path = Path(args.shapefile)

    if not args.build_only:
        # Check the path before touching the database.
        if not shapefile_path.exists():
            raise FileNotFoundError(f"shapefile not found: {shapefile_path}")
        print(f"importing HUC8 shapefile from {shapefile_path}")
        import_huc8_shapefile(shapefile_path)

    conn = connect()
    try:
        build_final_tables(conn)
        row_counts = []
        with conn.cursor() as cur:
            for table in (HUC8_TABLE, LINK_TABLE):
                cur.execute(f"select count(*) from {table}")
                row_counts.append(cur.fetchone()[0])
        huc8_rows, link_rows = row_counts
    finally:
        conn.close()

    print(
        f"done: source={shapefile_path}, huc8_rows={huc8_rows}, "
        f"geoid_huc8_links={link_rows}"
    )
|
||||
|
||||
|
||||
# Script entry point; main() returns nothing, so the exit status is 0 unless
# an exception propagates.
if __name__ == "__main__":
    main()
|
||||
731
scripts/create_data_center_census_tract_table.py
Normal file
731
scripts/create_data_center_census_tract_table.py
Normal file
@@ -0,0 +1,731 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from decimal import Decimal
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
|
||||
# Target database.
DB_NAME = "data_centers"
# Point table whose `state` column determines which states are imported.
POINT_TABLE = "public.master_data_centers"
POINT_ID_COL = "master_id"
# Staging tables: tract polygons loaded by ogr2ogr, and parsed ACS rows.
BOUNDARY_STAGE_TABLE = "public._dc_census_tract_boundaries_2024"
ACS_STAGE_TABLE = "public._dc_census_tract_acs_2024"
# Final output table.
FINAL_TABLE = "public.data_center_census_tracts_2024"

# ACS vintage used in the API URL, and a source label.
ACS_YEAR = 2024
ACS_SOURCE = "ACS 2024 5-year profile"
# Local copy of the cartographic-boundary tract archive. The path is relative,
# so it resolves against the current working directory, not the script's folder.
TRACT_ZIP = Path("cb_2024_us_tract_500k.zip")
TRACT_ZIP_URL = (
    "https://www2.census.gov/geo/tiger/GENZ2024/shp/cb_2024_us_tract_500k.zip"
)
# CSV dump of every fetched ACS row (also relative to the working directory).
ACS_AUDIT_CSV = Path("census_tract_acs_2024_selected_states.csv")
|
||||
|
||||
# Full state/territory name -> two-letter postal code. Lookups are
# case-sensitive. The Virgin Islands appear under three spellings that all
# map to "VI".
STATE_NAME_TO_CODE = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "District of Columbia": "DC", "Florida": "FL", "Georgia": "GA", "Hawaii": "HI",
    "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
    "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME",
    "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN",
    "Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE",
    "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM",
    "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH",
    "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI",
    "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX",
    "Utah": "UT", "Vermont": "VT", "Virginia": "VA", "Washington": "WA",
    "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
    "American Samoa": "AS", "Guam": "GU", "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR", "United States Virgin Islands": "VI",
    "U.S. Virgin Islands": "VI", "Virgin Islands": "VI",
}
|
||||
|
||||
# Two-letter postal code -> two-digit state FIPS code (kept as strings so the
# leading zero survives). Covers the 50 states, DC and five territories.
STATE_FIPS = {
    "AL": "01",
    "AK": "02",
    "AZ": "04",
    "AR": "05",
    "CA": "06",
    "CO": "08",
    "CT": "09",
    "DE": "10",
    "DC": "11",
    "FL": "12",
    "GA": "13",
    "HI": "15",
    "ID": "16",
    "IL": "17",
    "IN": "18",
    "IA": "19",
    "KS": "20",
    "KY": "21",
    "LA": "22",
    "ME": "23",
    "MD": "24",
    "MA": "25",
    "MI": "26",
    "MN": "27",
    "MS": "28",
    "MO": "29",
    "MT": "30",
    "NE": "31",
    "NV": "32",
    "NH": "33",
    "NJ": "34",
    "NM": "35",
    "NY": "36",
    "NC": "37",
    "ND": "38",
    "OH": "39",
    "OK": "40",
    "OR": "41",
    "PA": "42",
    "RI": "44",
    "SC": "45",
    "SD": "46",
    "TN": "47",
    "TX": "48",
    "UT": "49",
    "VT": "50",
    "VA": "51",
    "WA": "53",
    "WV": "54",
    "WI": "55",
    "WY": "56",
    "AS": "60",
    "GU": "66",
    "MP": "69",
    "PR": "72",
    "VI": "78",
}
|
||||
|
||||
# ACS profile variable code -> local column name. The keys are sent to the
# Census API as the "get" list; the values become the row and CSV field names.
ACS_VARIABLES = {
    "DP05_0001E": "population",
    "DP05_0018E": "median_age",
    "DP02_0001E": "households",
    "DP02_0016E": "avg_household_size",
    "DP02_0067PE": "high_school_or_higher_pct",
    "DP02_0068PE": "bachelor_or_higher_pct",
    "DP02_0154PE": "broadband_subscription_pct",
    "DP03_0001E": "population_16_over",
    "DP03_0002E": "labor_force",
    "DP03_0005E": "unemployed",
    "DP03_0009PE": "unemployment_rate",
    "DP03_0032E": "industry_total_workers",
    "DP03_0033E": "industry_agriculture_mining_workers",
    "DP03_0034E": "industry_construction_workers",
    "DP03_0035E": "industry_manufacturing_workers",
    "DP03_0036E": "industry_wholesale_trade_workers",
    "DP03_0037E": "industry_retail_trade_workers",
    "DP03_0038E": "industry_transportation_warehousing_utilities_workers",
    "DP03_0039E": "industry_information_workers",
    "DP03_0040E": "industry_finance_real_estate_workers",
    "DP03_0041E": "industry_professional_management_admin_workers",
    "DP03_0042E": "industry_education_health_social_workers",
    "DP03_0043E": "industry_arts_entertainment_food_workers",
    "DP03_0044E": "industry_other_services_workers",
    "DP03_0045E": "industry_public_administration_workers",
    "DP03_0062E": "median_household_income",
    "DP03_0088E": "per_capita_income",
    "DP03_0119PE": "family_poverty_rate",
    "DP03_0128PE": "poverty_rate",
    "DP05_0090E": "hispanic_latino_population",
    "DP05_0090PE": "hispanic_latino_pct",
    "DP05_0096E": "non_hispanic_white_population",
    "DP05_0096PE": "non_hispanic_white_pct",
    "DP05_0097E": "non_hispanic_black_population",
    "DP05_0097PE": "non_hispanic_black_pct",
    "DP05_0099E": "non_hispanic_asian_population",
    "DP05_0099PE": "non_hispanic_asian_pct",
}
|
||||
|
||||
# Columns that clean_acs_value converts to int. Despite the name this set also
# holds the two whole-dollar income columns, not only head counts.
COUNT_COLUMNS = {
    "population",
    "households",
    "population_16_over",
    "labor_force",
    "unemployed",
    "industry_total_workers",
    "industry_agriculture_mining_workers",
    "industry_construction_workers",
    "industry_manufacturing_workers",
    "industry_wholesale_trade_workers",
    "industry_retail_trade_workers",
    "industry_transportation_warehousing_utilities_workers",
    "industry_information_workers",
    "industry_finance_real_estate_workers",
    "industry_professional_management_admin_workers",
    "industry_education_health_social_workers",
    "industry_arts_entertainment_food_workers",
    "industry_other_services_workers",
    "industry_public_administration_workers",
    "median_household_income",
    "per_capita_income",
    "hispanic_latino_population",
    "non_hispanic_white_population",
    "non_hispanic_black_population",
    "non_hispanic_asian_population",
}

# Every other ACS column; clean_acs_value converts these to Decimal.
NUMERIC_COLUMNS = set(ACS_VARIABLES.values()) - COUNT_COLUMNS
|
||||
|
||||
# Industry worker-count column -> human-readable industry label. Dict order
# matters: add_primary_industry resolves ties in favour of the earlier entry.
INDUSTRY_COLUMNS = {
    "industry_agriculture_mining_workers": "Agriculture, forestry, fishing and hunting, and mining",
    "industry_construction_workers": "Construction",
    "industry_manufacturing_workers": "Manufacturing",
    "industry_wholesale_trade_workers": "Wholesale trade",
    "industry_retail_trade_workers": "Retail trade",
    "industry_transportation_warehousing_utilities_workers": "Transportation and warehousing, and utilities",
    "industry_information_workers": "Information",
    "industry_finance_real_estate_workers": "Finance and insurance, and real estate and rental and leasing",
    "industry_professional_management_admin_workers": "Professional, scientific, management, administrative, and waste management services",
    "industry_education_health_social_workers": "Educational services, and health care and social assistance",
    "industry_arts_entertainment_food_workers": "Arts, entertainment, recreation, accommodation, and food services",
    "industry_other_services_workers": "Other services, except public administration",
    "industry_public_administration_workers": "Public administration",
}
|
||||
|
||||
# Raw API strings that clean_acs_value treats as missing (returns None).
# Each sentinel is listed in integer and ".0" form because the comparison is
# an exact string match. NOTE(review): presumably the Census "jam value"
# codes for suppressed or unavailable estimates -- confirm against the ACS
# documentation.
SPECIAL_VALUES = {
    "-666666666",
    "-888888888",
    "-999999999",
    "-222222222",
    "-333333333",
    "-555555555",
    "-666666666.0",
    "-888888888.0",
    "-999999999.0",
    "-222222222.0",
    "-333333333.0",
    "-555555555.0",
}
|
||||
|
||||
|
||||
def connect():
    """Open a psycopg2 connection to DB_NAME using the PGWEB_* variables.

    Raises:
        KeyError: if any of PGWEB_HOST, PGWEB_PORT, PGWEB_USER or
            PGWEB_PASSWORD is missing from the environment.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": DB_NAME,
    }
    return psycopg2.connect(**settings)
|
||||
|
||||
|
||||
def normalize_state(value):
    """Map a raw state value to its two-letter postal code.

    Surrounding whitespace is stripped before both lookups. Previously it was
    stripped only for the full-name lookup, so a padded code such as
    ``" VA"`` was reported as unrecognised. Matching remains case-sensitive.

    Args:
        value: a postal code (``"VA"``), a full name (``"Virginia"``),
            ``None`` or an empty string.

    Returns:
        The postal code, or ``None`` when the value is empty or unknown.
    """
    if value in (None, ""):
        return None
    text = value.strip()
    if text in STATE_FIPS:
        return text
    return STATE_NAME_TO_CODE.get(text)
|
||||
|
||||
|
||||
def get_state_fips(conn):
    """Return the sorted state FIPS codes whose tracts need to be loaded.

    Reads the distinct ``state`` values of POINT_TABLE and normalises each
    one with ``normalize_state``.

    * If any non-NULL value cannot be normalised, nothing is returned and a
      RuntimeError listing the offending values (with row counts) is raised.
    * If some rows have a NULL state, a warning is printed and every entry of
      STATE_FIPS except AS, GU, MP and VI is returned, so that a later
      spatial join can place those rows.
    * Otherwise only the FIPS codes of the states actually present are
      returned.

    Args:
        conn: open database connection.

    Raises:
        RuntimeError: on unrecognised state values.
    """
    query = f"select state, count(*) from {POINT_TABLE} group by state order by state nulls last"
    with conn.cursor() as cur:
        cur.execute(query)
        state_rows = cur.fetchall()

    seen_codes = set()
    rows_without_state = 0
    unrecognized = []
    for raw_state, n_rows in state_rows:
        if raw_state is None:
            rows_without_state += n_rows
            continue
        code = normalize_state(raw_state)
        if code is None:
            unrecognized.append((raw_state, n_rows))
        else:
            seen_codes.add(code)

    if unrecognized:
        details = ", ".join(f"{name!r}({n})" for name, n in unrecognized)
        raise RuntimeError(f"Unrecognized state values in {POINT_TABLE}: {details}")

    if not rows_without_state:
        return sorted({STATE_FIPS[code] for code in seen_codes})

    print(
        f"warning: {rows_without_state} master_data_centers rows have NULL state; "
        "importing tract boundaries for all 50 states + DC + PR so spatial join can resolve them."
    )
    # Census ACS 5-year DP profile lacks coverage for the small island territories;
    # restrict to the 50 states + DC + PR which the ACS profile reliably serves.
    skipped_territories = {"AS", "GU", "MP", "VI"}
    return sorted(
        {fips for code, fips in STATE_FIPS.items() if code not in skipped_territories}
    )
|
||||
|
||||
|
||||
def ensure_final_table_absent(conn):
    """Raise if FINAL_TABLE already exists, so it is never overwritten silently.

    Args:
        conn: open database connection.

    Raises:
        RuntimeError: when ``to_regclass(FINAL_TABLE)`` resolves to a relation.
    """
    with conn.cursor() as cur:
        cur.execute("select to_regclass(%s)", (FINAL_TABLE,))
        existing = cur.fetchone()[0]
        if existing is None:
            return
        raise RuntimeError(
            f"Target table {FINAL_TABLE} already exists; refusing to overwrite it."
        )
|
||||
|
||||
|
||||
def drop_final_table_if_exists(conn):
    """Drop FINAL_TABLE if present, committing when the block exits cleanly.

    Args:
        conn: open database connection.
    """
    statement = f"drop table if exists {FINAL_TABLE}"
    with conn, conn.cursor() as cur:
        cur.execute(statement)
|
||||
|
||||
|
||||
def download_tract_boundaries():
    """Download the tract boundary archive to TRACT_ZIP unless already present.

    An existing file larger than 50,000,000 bytes is taken to be a complete
    download and left alone. Otherwise the archive is streamed to a
    ``.zip.part`` sibling in 1 MiB chunks and moved into place once the
    download has finished, so an interrupted run never leaves a truncated
    TRACT_ZIP behind.

    The final move uses ``Path.replace`` rather than ``Path.rename``:
    ``rename`` raises on Windows when the destination exists, which is exactly
    the case when an undersized stale TRACT_ZIP is being re-downloaded.
    """
    if TRACT_ZIP.exists() and TRACT_ZIP.stat().st_size > 50_000_000:
        return
    # with_suffix swaps ".zip" for ".zip.part", i.e. "<name>.zip.part".
    tmp_path = TRACT_ZIP.with_suffix(".zip.part")
    with urllib.request.urlopen(TRACT_ZIP_URL, timeout=120) as response:
        with tmp_path.open("wb") as out:
            while True:
                chunk = response.read(1024 * 1024)
                if not chunk:
                    break
                out.write(chunk)
    tmp_path.replace(TRACT_ZIP)
|
||||
|
||||
|
||||
def import_tract_boundaries(state_fips):
    """Load tract polygons for the given states into BOUNDARY_STAGE_TABLE.

    Runs ogr2ogr against the zipped shapefile in TRACT_ZIP, keeping only
    features whose STATEFP is in ``state_fips``, reprojecting to EPSG:4326
    and overwriting any existing stage table. Connection settings are passed
    through the libpq ``PG*`` environment variables.

    Args:
        state_fips: iterable of two-digit state FIPS strings.

    Raises:
        ValueError: if ``state_fips`` is empty. This would otherwise produce
            the invalid filter ``STATEFP IN ()`` and an opaque ogr2ogr failure.
        KeyError: if a PGWEB_* variable is missing from the environment.
        subprocess.CalledProcessError: if ogr2ogr exits with a non-zero status.
    """
    states = sorted(state_fips)
    if not states:
        raise ValueError("state_fips must contain at least one state FIPS code")

    where = "STATEFP IN ({})".format(",".join(f"'{state}'" for state in states))
    env = os.environ.copy()
    env.update(
        {
            "PGHOST": os.environ["PGWEB_HOST"],
            "PGPORT": os.environ["PGWEB_PORT"],
            "PGUSER": os.environ["PGWEB_USER"],
            "PGPASSWORD": os.environ["PGWEB_PASSWORD"],
            "PGDATABASE": DB_NAME,
        }
    )
    cmd = [
        "ogr2ogr",
        "-f",
        "PostgreSQL",
        # Use the shared constant so the target cannot drift from PGDATABASE.
        f"PG:dbname={DB_NAME}",
        f"/vsizip/{TRACT_ZIP.resolve()}/cb_2024_us_tract_500k.shp",
        "-nln",
        BOUNDARY_STAGE_TABLE,
        "-overwrite",
        "-nlt",
        "MULTIPOLYGON",
        "-t_srs",
        "EPSG:4326",
        "-lco",
        "GEOMETRY_NAME=geom",
        "-lco",
        "FID=gid",
        "-where",
        where,
    ]
    subprocess.run(cmd, check=True, env=env)
|
||||
|
||||
|
||||
def fetch_acs_for_state(state_fips):
    """Fetch every ACS_VARIABLES field for all tracts of one state.

    Queries the ACS 5-year profile endpoint for ``tract:*`` within the state.
    CENSUS_API_KEY is added to the request when it is set in the environment.
    Each returned record is converted into a dict with identifying fields
    (geoid, acs_name, statefp, countyfp, tractce), one cleaned value per ACS
    column and the derived primary-industry fields.

    Args:
        state_fips: two-digit state FIPS string.

    Returns:
        list[dict]: one row per tract.

    Raises:
        RuntimeError: on an HTTP error status, a non-JSON body, or a JSON
            payload that is not a non-empty list (header row missing).
    """
    # Imported explicitly: the module only imports urllib.parse and
    # urllib.request, and relying on urllib.request to pull in urllib.error
    # is an implementation detail.
    from urllib.error import HTTPError

    variables = ["NAME", *ACS_VARIABLES.keys()]
    params = {
        "get": ",".join(variables),
        "for": "tract:*",
        "in": f"state:{state_fips} county:*",
    }
    api_key = os.environ.get("CENSUS_API_KEY")
    if api_key:
        params["key"] = api_key
    url = (
        f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5/profile?"
        + urllib.parse.urlencode(params)
    )
    try:
        with urllib.request.urlopen(url, timeout=120) as response:
            body = response.read().decode("utf-8")
    except HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(
            f"Census ACS request failed for state {state_fips}: HTTP {exc.code} — {body[:300]}"
        ) from exc
    try:
        data = json.loads(body)
    except json.JSONDecodeError as exc:
        raise RuntimeError(
            f"Census ACS returned non-JSON for state {state_fips}: {body[:300]}"
        ) from exc
    # The first element is the header row; anything else would otherwise
    # surface as a bare IndexError or KeyError.
    if not isinstance(data, list) or not data:
        raise RuntimeError(
            f"Census ACS returned no header row for state {state_fips}: {body[:300]}"
        )

    header = data[0]
    rows = []
    for values in data[1:]:
        raw = dict(zip(header, values))
        row = {
            # Tract GEOID is state + county + tract code concatenated.
            "geoid": raw["state"] + raw["county"] + raw["tract"],
            "acs_name": raw["NAME"],
            "statefp": raw["state"],
            "countyfp": raw["county"],
            "tractce": raw["tract"],
        }
        for acs_var, column in ACS_VARIABLES.items():
            row[column] = clean_acs_value(raw.get(acs_var), column)
        add_primary_industry(row)
        rows.append(row)
    return rows
|
||||
|
||||
|
||||
def clean_acs_value(value, column):
    """Convert one raw ACS API value to the Python type for its column.

    Args:
        value: raw value from the API response (normally a string or None).
        column: local column name the value belongs to.

    Returns:
        ``None`` for missing markers (None, "", "null" or any entry of
        SPECIAL_VALUES); an ``int`` for COUNT_COLUMNS; a ``Decimal`` for
        NUMERIC_COLUMNS; the value unchanged for any other column.
    """
    is_missing = value in (None, "", "null") or value in SPECIAL_VALUES
    if is_missing:
        return None
    if column in COUNT_COLUMNS:
        # Go through Decimal so strings such as "123.0" still parse as ints.
        return int(Decimal(value))
    return Decimal(value) if column in NUMERIC_COLUMNS else value
|
||||
|
||||
|
||||
def add_primary_industry(row):
    """Add the largest-industry fields to ``row`` in place.

    Scans the columns named in ``INDUSTRY_COLUMNS`` and records the one with
    the highest non-null value:

    * ``primary_industry`` - the label ``INDUSTRY_COLUMNS`` maps that column to
    * ``primary_industry_workers`` - its value
    * ``primary_industry_pct`` - its share of ``industry_total_workers``,
      or ``None`` when the total is missing/zero or no column has a value

    Args:
        row: Tract dict holding the cleaned ACS columns; mutated in place.
    """
    reported = [
        (column, row.get(column))
        for column in INDUSTRY_COLUMNS
        if row.get(column) is not None
    ]
    if reported:
        # max() keeps the first maximal entry, so ties resolve to the
        # earliest column in INDUSTRY_COLUMNS order.
        top_column, top_workers = max(reported, key=lambda pair: pair[1])
    else:
        top_column = top_workers = None

    row["primary_industry"] = INDUSTRY_COLUMNS.get(top_column)
    row["primary_industry_workers"] = top_workers

    total_workers = row.get("industry_total_workers")
    share = None
    if total_workers and top_workers is not None:
        share = Decimal(top_workers * 100) / Decimal(total_workers)
    row["primary_industry_pct"] = share
|
||||
|
||||
|
||||
def fetch_acs(state_fips):
    """Fetch ACS rows for several states and write them to an audit CSV.

    Args:
        state_fips: Iterable of state FIPS codes; each is passed to
            ``fetch_acs_for_state`` in order.

    Returns:
        ``(rows, fieldnames)``: the combined row dicts and the column order
        used for the CSV written to ``ACS_AUDIT_CSV``.
    """
    rows = [
        tract
        for state in state_fips
        for tract in fetch_acs_for_state(state)
    ]

    identifier_columns = ["geoid", "acs_name", "statefp", "countyfp", "tractce"]
    derived_columns = [
        "primary_industry",
        "primary_industry_workers",
        "primary_industry_pct",
    ]
    fieldnames = [*identifier_columns, *ACS_VARIABLES.values(), *derived_columns]

    # Keep a copy of exactly what was fetched, overwriting any earlier file.
    with ACS_AUDIT_CSV.open("w", newline="", encoding="utf-8") as csv_file:
        audit = csv.DictWriter(csv_file, fieldnames=fieldnames)
        audit.writeheader()
        audit.writerows(rows)
    return rows, fieldnames
|
||||
|
||||
|
||||
def load_acs_stage(conn, rows, fieldnames):
    """Recreate the ACS staging table and bulk-load the fetched tract rows.

    Args:
        conn: Open database connection, used as a context manager around the
            whole load (presumably psycopg2, where that means commit on
            success / rollback on error - TODO confirm).
        rows: Row dicts, e.g. as returned by ``fetch_acs``.
        fieldnames: Column names in insert order; each must match a column
            of the table created below.
    """
    with conn:
        with conn.cursor() as cur:
            # The stage table is rebuilt from scratch on every run.
            cur.execute(f"drop table if exists {ACS_STAGE_TABLE}")
            cur.execute(
                f"""
                create table {ACS_STAGE_TABLE} (
                    geoid text primary key,
                    acs_name text,
                    statefp text,
                    countyfp text,
                    tractce text,
                    population integer,
                    median_age numeric,
                    households integer,
                    avg_household_size numeric,
                    high_school_or_higher_pct numeric,
                    bachelor_or_higher_pct numeric,
                    broadband_subscription_pct numeric,
                    population_16_over integer,
                    labor_force integer,
                    unemployed integer,
                    unemployment_rate numeric,
                    industry_total_workers integer,
                    industry_agriculture_mining_workers integer,
                    industry_construction_workers integer,
                    industry_manufacturing_workers integer,
                    industry_wholesale_trade_workers integer,
                    industry_retail_trade_workers integer,
                    industry_transportation_warehousing_utilities_workers integer,
                    industry_information_workers integer,
                    industry_finance_real_estate_workers integer,
                    industry_professional_management_admin_workers integer,
                    industry_education_health_social_workers integer,
                    industry_arts_entertainment_food_workers integer,
                    industry_other_services_workers integer,
                    industry_public_administration_workers integer,
                    median_household_income integer,
                    per_capita_income integer,
                    family_poverty_rate numeric,
                    poverty_rate numeric,
                    hispanic_latino_population integer,
                    hispanic_latino_pct numeric,
                    non_hispanic_white_population integer,
                    non_hispanic_white_pct numeric,
                    non_hispanic_black_population integer,
                    non_hispanic_black_pct numeric,
                    non_hispanic_asian_population integer,
                    non_hispanic_asian_pct numeric,
                    primary_industry text,
                    primary_industry_workers integer,
                    primary_industry_pct numeric
                )
                """
            )
            # Row dicts -> positional tuples in the same order as the column
            # list of the INSERT; keys missing from a row become NULL.
            values = [tuple(row.get(column) for column in fieldnames) for row in rows]
            execute_values(
                cur,
                f"insert into {ACS_STAGE_TABLE} ({', '.join(fieldnames)}) values %s",
                values,
                page_size=1000,
            )
            # Refresh planner statistics before the table is joined.
            cur.execute(f"analyze {ACS_STAGE_TABLE}")
|
||||
|
||||
|
||||
def create_final_table(conn):
    """Build ``FINAL_TABLE``: tracts that contain data centers, plus ACS data.

    Steps, all inside one ``with conn`` block:

    1. (Re)create a GiST index on the boundary stage table's geometry.
    2. ``create table ... as`` joining tract boundaries to the points of
       ``POINT_TABLE`` they cover, with per-tract counts by ``source``,
       the point ids and the distinct operators, left-joined to the ACS
       stage table.
    3. Add the primary key, secondary indexes and a table comment.

    The ``create table`` has no ``if not exists``, so this fails when
    ``FINAL_TABLE`` already exists; the caller is expected to drop or check
    for it first.

    Args:
        conn: Open database connection (assumed psycopg2 - TODO confirm).
    """
    with conn:
        with conn.cursor() as cur:
            # Drop-then-create keeps the index definition fresh across runs.
            cur.execute("drop index if exists _dc_census_tract_boundaries_2024_geom_gix")
            cur.execute(
                f"create index _dc_census_tract_boundaries_2024_geom_gix on {BOUNDARY_STAGE_TABLE} using gist (geom)"
            )
            cur.execute(f"analyze {BOUNDARY_STAGE_TABLE}")
            # NOTE(review): ST_Covers is also true for a point lying exactly on
            # a tract boundary, so such a point could be counted in two tracts
            # here, while assign_point_geoids keeps a single geoid per point.
            # Confirm whether that difference matters for the totals.
            cur.execute(
                f"""
                create table {FINAL_TABLE} as
                with dc_tracts as (
                    select
                        t.geoid,
                        count(*)::integer as data_center_count,
                        count(*) filter (where dc.source = 'curated')::integer
                            as curated_only_data_center_count,
                        count(*) filter (where dc.source = 'merged')::integer
                            as merged_data_center_count,
                        count(*) filter (where dc.source = 'osm')::integer
                            as osm_only_data_center_count,
                        array_agg(dc.{POINT_ID_COL} order by dc.{POINT_ID_COL}) as data_center_ids,
                        array_agg(distinct dc.operator) filter (where dc.operator is not null)
                            as operators
                    from {BOUNDARY_STAGE_TABLE} t
                    join {POINT_TABLE} dc
                        on t.geom && dc.geom
                        and ST_Covers(t.geom, dc.geom)
                    group by t.geoid
                )
                select
                    t.geoid,
                    t.statefp,
                    t.countyfp,
                    t.tractce,
                    t.name as tract_name,
                    t.namelsad,
                    t.aland::bigint as land_area_sqm,
                    t.awater::bigint as water_area_sqm,
                    {ACS_YEAR}::integer as acs_year,
                    '{ACS_SOURCE}'::text as acs_source,
                    a.acs_name,
                    d.data_center_count,
                    d.curated_only_data_center_count,
                    d.merged_data_center_count,
                    d.osm_only_data_center_count,
                    d.data_center_ids,
                    d.operators,
                    a.population,
                    a.median_age,
                    a.households,
                    a.avg_household_size,
                    a.high_school_or_higher_pct,
                    a.bachelor_or_higher_pct,
                    a.broadband_subscription_pct,
                    a.population_16_over,
                    a.labor_force,
                    a.unemployed,
                    a.unemployment_rate,
                    a.median_household_income,
                    a.per_capita_income,
                    a.family_poverty_rate,
                    a.poverty_rate,
                    a.hispanic_latino_population,
                    a.hispanic_latino_pct,
                    a.non_hispanic_white_population,
                    a.non_hispanic_white_pct,
                    a.non_hispanic_black_population,
                    a.non_hispanic_black_pct,
                    a.non_hispanic_asian_population,
                    a.non_hispanic_asian_pct,
                    a.industry_total_workers,
                    a.industry_agriculture_mining_workers,
                    a.industry_construction_workers,
                    a.industry_manufacturing_workers,
                    a.industry_wholesale_trade_workers,
                    a.industry_retail_trade_workers,
                    a.industry_transportation_warehousing_utilities_workers,
                    a.industry_information_workers,
                    a.industry_finance_real_estate_workers,
                    a.industry_professional_management_admin_workers,
                    a.industry_education_health_social_workers,
                    a.industry_arts_entertainment_food_workers,
                    a.industry_other_services_workers,
                    a.industry_public_administration_workers,
                    a.primary_industry,
                    a.primary_industry_workers,
                    a.primary_industry_pct,
                    t.geom::geometry(MultiPolygon, 4326) as geom
                from {BOUNDARY_STAGE_TABLE} t
                join dc_tracts d on d.geoid = t.geoid
                left join {ACS_STAGE_TABLE} a on a.geoid = t.geoid
                """
            )
            # "create table as" copies no constraints, so add the key and
            # indexes afterwards.
            cur.execute(f"alter table {FINAL_TABLE} add primary key (geoid)")
            cur.execute(
                f"create index data_center_census_tracts_2024_geom_gix on {FINAL_TABLE} using gist (geom)"
            )
            cur.execute(
                f"create index data_center_census_tracts_2024_state_county_idx on {FINAL_TABLE} (statefp, countyfp)"
            )
            cur.execute(
                f"create index data_center_census_tracts_2024_dc_count_idx on {FINAL_TABLE} (data_center_count desc)"
            )
            # NOTE(review): the comment text hard-codes "ACS 2024" while the
            # table itself stores {ACS_YEAR}; confirm they are meant to agree.
            cur.execute(
                f"""
                comment on table {FINAL_TABLE} is
                'Census tracts containing records from public.master_data_centers (curated + OSM merged), enriched with ACS 2024 5-year profile demographics and derived primary industry fields.'
                """
            )
            cur.execute(f"analyze {FINAL_TABLE}")
|
||||
|
||||
|
||||
def assign_point_geoids(conn):
    """Write the covering tract's geoid onto every row of ``POINT_TABLE``.

    Adds a ``geoid`` text column when missing, then updates every point with
    the geoid of a covering tract from the boundary stage table. When several
    tracts cover a point the lowest geoid wins (``order by t.geoid limit 1``);
    points covered by no tract get ``NULL``, since the scalar subquery then
    yields no row.

    Args:
        conn: Open database connection (assumed psycopg2 - TODO confirm).
    """
    with conn:
        with conn.cursor() as cur:
            cur.execute(
                f"alter table {POINT_TABLE} add column if not exists geoid text"
            )
            # The inner select produces exactly one row per point, so the
            # outer update touches every point, including unmatched ones.
            cur.execute(
                f"""
                update {POINT_TABLE} dc
                set geoid = matched.geoid
                from (
                    select
                        dc_inner.{POINT_ID_COL} as point_id,
                        (
                            select t.geoid
                            from {BOUNDARY_STAGE_TABLE} t
                            where t.geom && dc_inner.geom
                                and st_covers(t.geom, dc_inner.geom)
                            order by t.geoid
                            limit 1
                        ) as geoid
                    from {POINT_TABLE} dc_inner
                ) matched
                where dc.{POINT_ID_COL} = matched.point_id
                """
            )
            cur.execute(
                f"create index if not exists master_data_centers_geoid_idx on {POINT_TABLE} (geoid)"
            )
            cur.execute(f"analyze {POINT_TABLE}")
|
||||
|
||||
|
||||
def validate(conn):
    """Collect sanity-check counts after the build.

    Args:
        conn: Open database connection.

    Returns:
        A 5-tuple:
        ``summary`` (tract rows, summed data-center count and rows with
        geometry in ``FINAL_TABLE``), ``total_points`` (rows in
        ``POINT_TABLE``), ``point_source_breakdown`` (``(source, count)``
        rows ordered by source), ``unassigned_points`` (points whose
        ``geoid`` is null) and ``missing_acs`` (final-table rows whose
        ``population`` is null).
    """
    with conn.cursor() as cur:

        def first_value(sql):
            # Run a single-row query and return its first column.
            cur.execute(sql)
            return cur.fetchone()[0]

        cur.execute(
            f"""
            select
                count(*)::integer as tract_rows,
                coalesce(sum(data_center_count), 0)::integer as assigned_data_centers,
                count(*) filter (where geom is not null)::integer as geom_rows
            from {FINAL_TABLE}
            """
        )
        summary = cur.fetchone()

        total_points = first_value(f"select count(*)::integer from {POINT_TABLE}")

        cur.execute(
            f"""
            select source, count(*)::integer
            from {POINT_TABLE}
            group by source
            order by source
            """
        )
        point_source_breakdown = cur.fetchall()

        unassigned_points = first_value(
            f"""
            select count(*)::integer
            from {POINT_TABLE}
            where geoid is null
            """
        )
        missing_acs = first_value(
            f"""
            select count(*)::integer
            from {FINAL_TABLE}
            where population is null
            """
        )
    return summary, total_points, point_source_breakdown, unassigned_points, missing_acs
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: build the census-tract enrichment table.

    Order of work: read the state list and check/drop the final table, close
    that connection, download and import boundaries and fetch ACS data, then
    reconnect to load the ACS stage, create the final table, assign geoids to
    points and print validation counts.
    """
    parser = argparse.ArgumentParser(
        description="Build census-tract enrichment table for data-center points."
    )
    parser.add_argument(
        "--replace-final",
        action="store_true",
        help="Drop and rebuild the final tract table if it already exists.",
    )
    args = parser.parse_args()

    def prepare_final_table(connection):
        # Either clear the way for a rebuild or insist the table is absent.
        if args.replace_final:
            drop_final_table_if_exists(connection)
        else:
            ensure_final_table_absent(connection)

    # First connection: checks only, so no connection is held open during
    # the download / import / API steps that follow.
    conn = connect()
    try:
        state_fips = get_state_fips(conn)
        prepare_final_table(conn)
    finally:
        conn.close()

    download_tract_boundaries()
    import_tract_boundaries(state_fips)
    acs_rows, acs_fieldnames = fetch_acs(state_fips)

    # Second connection: repeat the table check, then do the actual build.
    conn = connect()
    try:
        prepare_final_table(conn)
        load_acs_stage(conn, acs_rows, acs_fieldnames)
        create_final_table(conn)
        assign_point_geoids(conn)
        (
            summary,
            total_points,
            point_source_breakdown,
            unassigned_points,
            missing_acs,
        ) = validate(conn)
    finally:
        conn.close()

    print(f"loaded {len(acs_rows)} ACS tract rows into {ACS_STAGE_TABLE}")
    print(f"created {FINAL_TABLE}")
    print(
        f"tract_rows={summary[0]} assigned_data_centers={summary[1]} "
        f"geom_rows={summary[2]} source_points={total_points}"
    )
    print("point_source=" + ", ".join(f"{k}:{v}" for k, v in point_source_breakdown))
    print(f"points_unassigned_to_tract={unassigned_points}")
    print(f"tracts_missing_acs_population={missing_acs}")


if __name__ == "__main__":
    main()
|
||||
1315
scripts/ingest_eia_energy_layers.py
Normal file
1315
scripts/ingest_eia_energy_layers.py
Normal file
File diff suppressed because it is too large
Load Diff
686
scripts/ingest_legiscan.py
Normal file
686
scripts/ingest_legiscan.py
Normal file
@@ -0,0 +1,686 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Ingest LegiScan legislative datasets for all US states (2016-2026) into PostgreSQL.
|
||||
|
||||
Fetches all state session datasets from the LegiScan API, parses bill JSONs from
|
||||
each ZIP archive, and loads them into the data_centers PostgreSQL database. Bills are
|
||||
tagged with relevance categories (data_center, large_load, ratepayer_protection, etc.).
|
||||
|
||||
Usage:
|
||||
python ingest_legiscan.py [--all | --setup-db | --fetch | --load | --tag]
|
||||
[--state XX] [--year-start YYYY] [--dry-run] [--verbose]
|
||||
|
||||
Options:
|
||||
--all Run all phases in sequence
|
||||
--setup-db Create/update database tables and indexes
|
||||
--fetch Download dataset ZIPs for all states (uses hash caching)
|
||||
--load Parse cached ZIPs and insert/update bills in DB
|
||||
--tag (Re)apply relevance tagging to all loaded bills
|
||||
--state XX Restrict to one state (e.g., CA)
|
||||
--year-start N Earliest session year to include (default: 2016)
|
||||
--dry-run Print what would be done; no API calls or DB writes
|
||||
--verbose Extra progress output
|
||||
|
||||
Environment:
|
||||
LEGISCAN_API_KEY Required
|
||||
PGWEB_HOST, PGWEB_PORT,
|
||||
PGWEB_USER, PGWEB_PASSWORD PostgreSQL connection (DB: data_centers)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import zipfile
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
DB_NAME = "data_centers"
API_KEY = os.environ.get("LEGISCAN_API_KEY")
API_BASE = "https://api.legiscan.com/"
# Anchor the cache at <project root>/data/legiscan_cache. This script lives in
# <project root>/scripts/, so resolving from __file__ keeps the location the
# same no matter which directory the script is launched from.
CACHE_DIR = Path(__file__).resolve().parent.parent / "data" / "legiscan_cache"
MIN_YEAR_DEFAULT = 2016
RATE_LIMIT_DELAY = 0.5  # seconds between API calls
|
||||
|
||||
# Keyword categories for relevance tagging.
|
||||
# Keys become the tag values stored in legiscan_bills.relevance_tags[].
|
||||
RELEVANCE_KEYWORDS: dict[str, list[str]] = {
|
||||
"data_center": [
|
||||
"data center", "data centre", "hyperscale", "colocation", "colo facility",
|
||||
"server farm", "cloud computing facility", "internet exchange",
|
||||
"carrier hotel", "artificial intelligence facility", "ai campus",
|
||||
"ai data center", "gpu cluster", "compute facility",
|
||||
"high performance computing", "hpc facility", "data hall",
|
||||
"network access point", "data warehousing facility",
|
||||
],
|
||||
"large_load": [
|
||||
"large load", "large power consumer", "large electricity consumer",
|
||||
"high electricity consumption", "high power consumption",
|
||||
"megawatt load", "gigawatt load", "cryptocurrency mining",
|
||||
"bitcoin mining", "blockchain mining", "crypto mining",
|
||||
"digital asset mining", "proof of work", "electric arc furnace",
|
||||
"large industrial customer", "high-density load", "new large load",
|
||||
"load growth", "extraordinary load",
|
||||
],
|
||||
"ratepayer_protection": [
|
||||
"ratepayer", "rate payer", "cost shift", "cost shifting",
|
||||
"cost allocation", "cross-subsidy", "cross subsidy",
|
||||
"rate design", "rate structure", "electricity rate",
|
||||
"electric rate", "utility rate", "rate increase", "rate burden",
|
||||
"rate base", "stranded cost", "rate class", "customer protection",
|
||||
"consumer protection", "electric customer", "residential customer",
|
||||
"demand charge", "transmission cost", "grid upgrade cost",
|
||||
"interconnection cost", "cost recovery", "rate relief",
|
||||
"affordability", "energy burden",
|
||||
],
|
||||
"grid_impact": [
|
||||
"grid reliability", "grid stability", "grid congestion",
|
||||
"grid modernization", "grid infrastructure", "electric grid",
|
||||
"power grid", "electricity grid", "transmission upgrade",
|
||||
"transmission expansion", "interconnection queue",
|
||||
"interconnection study", "demand response", "curtailment",
|
||||
"grid capacity", "system reliability", "capacity expansion",
|
||||
"electric system", "power system reliability", "grid resilience",
|
||||
"grid planning", "integrated resource plan",
|
||||
],
|
||||
"water_use": [
|
||||
"water consumption", "cooling water", "water efficiency",
|
||||
"water use effectiveness", "evaporative cooling",
|
||||
"water withdrawal", "water discharge", "water impact",
|
||||
"water footprint", "cooling tower", "water-cooled",
|
||||
"once-through cooling", "recycled water", "water stress",
|
||||
"water scarcity",
|
||||
],
|
||||
"tax_incentive": [
|
||||
"tax credit", "tax exemption", "tax abatement", "tax incentive",
|
||||
"sales tax exemption", "property tax exemption", "tax break",
|
||||
"tax relief", "enterprise zone", "economic incentive",
|
||||
"business incentive", "investment credit", "job creation credit",
|
||||
"economic development incentive", "opportunity zone",
|
||||
"tax subsidy",
|
||||
],
|
||||
"energy_policy": [
|
||||
"renewable energy", "clean energy", "energy efficiency",
|
||||
"power purchase agreement", " ppa ", "green tariff",
|
||||
"clean power", "carbon neutral", "net zero", "decarbonization",
|
||||
"energy procurement", "24/7 clean energy", "carbon-free",
|
||||
"clean electricity", "energy storage", "virtual power plant",
|
||||
"net metering", "green power",
|
||||
],
|
||||
"siting_permitting": [
|
||||
"conditional use permit", "special use permit", "land use permit",
|
||||
"zoning", "facility siting", "environmental review",
|
||||
"environmental impact", "noise ordinance", "setback requirement",
|
||||
"building permit", "construction permit", "site approval",
|
||||
"local approval", "permit requirement", "permitting process",
|
||||
"local control", "preemption",
|
||||
],
|
||||
}
|
||||
|
||||
# Status code labels (LegiScan)
|
||||
STATUS_LABELS = {
|
||||
0: "N/A", 1: "Introduced", 2: "Engrossed", 3: "Enrolled",
|
||||
4: "Passed", 5: "Vetoed", 6: "Failed", 7: "Override",
|
||||
8: "Chaptered", 9: "Referred", 10: "Report Pass",
|
||||
11: "Report DNP", 12: "Draft",
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Database
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_db_connection():
    """Open a connection to the ``DB_NAME`` database.

    Host, port, user and password are read from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables.

    Raises:
        KeyError: If any of those environment variables is unset.
    """
    credentials = {
        option: os.environ[f"PGWEB_{option.upper()}"]
        for option in ("host", "port", "user", "password")
    }
    return psycopg2.connect(dbname=DB_NAME, **credentials)
|
||||
|
||||
|
||||
DDL = """
|
||||
CREATE TABLE IF NOT EXISTS legiscan_sessions (
|
||||
session_id INTEGER PRIMARY KEY,
|
||||
state_id INTEGER NOT NULL,
|
||||
state_abbr VARCHAR(2) NOT NULL,
|
||||
year_start INTEGER NOT NULL,
|
||||
year_end INTEGER NOT NULL,
|
||||
session_title TEXT,
|
||||
session_tag TEXT,
|
||||
is_special BOOLEAN DEFAULT FALSE,
|
||||
is_prior BOOLEAN DEFAULT FALSE,
|
||||
dataset_hash VARCHAR(32),
|
||||
dataset_date DATE,
|
||||
dataset_size_mb FLOAT,
|
||||
bill_count INTEGER DEFAULT 0,
|
||||
imported_at TIMESTAMPTZ
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS legiscan_bills (
|
||||
bill_id INTEGER PRIMARY KEY,
|
||||
session_id INTEGER REFERENCES legiscan_sessions(session_id),
|
||||
state VARCHAR(2) NOT NULL,
|
||||
bill_number VARCHAR(50),
|
||||
bill_type VARCHAR(10),
|
||||
title TEXT,
|
||||
description TEXT,
|
||||
status INTEGER,
|
||||
status_date DATE,
|
||||
completed INTEGER DEFAULT 0,
|
||||
body VARCHAR(10),
|
||||
url TEXT,
|
||||
state_link TEXT,
|
||||
change_hash VARCHAR(32),
|
||||
subjects TEXT[],
|
||||
sponsor_count INTEGER DEFAULT 0,
|
||||
vote_count INTEGER DEFAULT 0,
|
||||
text_count INTEGER DEFAULT 0,
|
||||
is_relevant BOOLEAN DEFAULT FALSE,
|
||||
relevance_tags TEXT[],
|
||||
imported_at TIMESTAMPTZ DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_ls_bills_state ON legiscan_bills(state);
|
||||
CREATE INDEX IF NOT EXISTS idx_ls_bills_session ON legiscan_bills(session_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_ls_bills_status ON legiscan_bills(status);
|
||||
CREATE INDEX IF NOT EXISTS idx_ls_bills_relevant ON legiscan_bills(is_relevant) WHERE is_relevant;
|
||||
CREATE INDEX IF NOT EXISTS idx_ls_bills_subjects ON legiscan_bills USING gin(subjects);
|
||||
CREATE INDEX IF NOT EXISTS idx_ls_bills_rtags ON legiscan_bills USING gin(relevance_tags);
|
||||
CREATE INDEX IF NOT EXISTS idx_ls_bills_fts ON legiscan_bills
|
||||
USING gin(to_tsvector('english',
|
||||
COALESCE(title, '') || ' ' || COALESCE(description, '')));
|
||||
"""
|
||||
|
||||
|
||||
def setup_db(conn):
    """Run the ``DDL`` script on ``conn`` and commit.

    Args:
        conn: Open database connection.
    """
    with conn.cursor() as cursor:
        cursor.execute(DDL)
    conn.commit()
    log.info("Database tables and indexes ready.")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LegiScan API helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _api_get(params: dict, timeout: int = 120) -> dict:
    """Make one LegiScan API call and return the parsed JSON.

    Args:
        params: Query parameters for the call (e.g. ``{"op": ...}``). The
            dict is not modified; the API key is added to a copy.
        timeout: Request timeout in seconds.

    Returns:
        The decoded JSON response, whose ``status`` field is ``"OK"``.

    Raises:
        RuntimeError: If ``LEGISCAN_API_KEY`` is not configured, or the API
            responds with a status other than ``"OK"``.
        requests.HTTPError: On a non-2xx HTTP response.
    """
    if not API_KEY:
        # Fail early with a clear message rather than an opaque API error.
        raise RuntimeError("LEGISCAN_API_KEY is not set; cannot call the LegiScan API.")
    # Copy instead of mutating so the caller's dict never ends up holding
    # the secret key (it could otherwise be logged or reused by accident).
    query = {**params, "key": API_KEY}
    resp = requests.get(API_BASE, params=query, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()
    if data.get("status") != "OK":
        raise RuntimeError(f"LegiScan API error: {data}")
    return data
|
||||
|
||||
|
||||
def get_all_dataset_metadata(year_start: int, state_filter: Optional[str] = None) -> list[dict]:
    """Return dataset descriptors for sessions starting in ``year_start`` or later.

    Fetches the full dataset list with one API call. When ``state_filter``
    is given, a second, state-scoped call supplies the session ids that
    belong to that state and the result is narrowed to them.

    Args:
        year_start: Earliest ``year_start`` value to keep.
        state_filter: Optional state abbreviation passed to the API.

    Returns:
        The matching session dicts, in the order of the full dataset list.
    """
    log.info("Fetching dataset list from LegiScan…")
    catalog = _api_get({"op": "getDatasetList"})["datasetlist"]
    log.info(f" Total sessions across all states: {len(catalog)}")

    selected = [entry for entry in catalog if entry["year_start"] >= year_start]
    if state_filter:
        # The full list is filtered by the session ids that a state-scoped
        # request returns.
        log.info(f" Filtering to state {state_filter}…")
        state_catalog = _api_get({"op": "getDatasetList", "state": state_filter})["datasetlist"]
        state_session_ids = {entry["session_id"] for entry in state_catalog}
        selected = [entry for entry in selected if entry["session_id"] in state_session_ids]

    log.info(f" Sessions matching filters: {len(selected)}")
    return selected
|
||||
|
||||
|
||||
def download_dataset_zip(session: dict, dry_run: bool = False) -> tuple[Optional[bytes], bool]:
    """Return a session's dataset ZIP, downloading and caching it if needed.

    The cache file name contains both the session id and the dataset hash,
    so a changed hash results in a fresh download.

    Args:
        session: Dataset descriptor; must contain ``session_id``,
            ``dataset_hash`` and ``access_key`` (and ``dataset_size`` when
            no cached copy exists).
        dry_run: When true, nothing is downloaded or written to disk.

    Returns:
        ``(zip_bytes, api_call_made)``. ``zip_bytes`` is ``None`` only for a
        dry run without a cached copy. ``api_call_made`` is ``True`` only
        when the network was actually hit, so the caller can rate-limit.
    """
    session_id = session["session_id"]
    dataset_hash = session["dataset_hash"]
    access_key = session["access_key"]
    cache_path = CACHE_DIR / f"{session_id}_{dataset_hash}.zip"

    if cache_path.exists():
        log.debug(f" Cache hit: {cache_path.name}")
        return cache_path.read_bytes(), False

    size_mb = session["dataset_size"] / 1e6
    if dry_run:
        log.info(f" [dry-run] Would download session {session_id} ({size_mb:.1f} MB)")
        return None, False

    log.info(f" Downloading session {session_id} ({size_mb:.1f} MB)…")
    data = _api_get({"op": "getDataset", "access_key": access_key, "id": session_id})
    zip_bytes = base64.b64decode(data["dataset"]["zip"])

    # Create the directory only when something is actually written, so a
    # dry run leaves the filesystem untouched.
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename: an interrupted write must not
    # leave a truncated ZIP that later runs would accept as a cache hit.
    partial_path = cache_path.with_name(cache_path.name + ".part")
    partial_path.write_bytes(zip_bytes)
    partial_path.replace(cache_path)
    log.info(f" Cached → {cache_path.name}")
    return zip_bytes, True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Relevance tagging
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def score_relevance(title: str, description: str, subjects: list[str]) -> tuple[bool, list[str]]:
    """Tag a bill by case-insensitive keyword search.

    The title, description and subject names are lower-cased and joined with
    single spaces; a tag from ``RELEVANCE_KEYWORDS`` matches when any of its
    keywords occurs as a substring of that text.

    Args:
        title: Bill title (``None``/empty is treated as empty text).
        description: Bill description (``None``/empty is treated as empty).
        subjects: Subject names attached to the bill.

    Returns:
        ``(is_relevant, tags)``: ``tags`` lists matched tag names in
        ``RELEVANCE_KEYWORDS`` order and ``is_relevant`` is true when it is
        non-empty.
    """
    title_text = (title or "").lower()
    description_text = (description or "").lower()
    subject_text = " ".join(subject.lower() for subject in subjects)
    haystack = " ".join([title_text, description_text, subject_text])

    matched = [
        tag
        for tag, keywords in RELEVANCE_KEYWORDS.items()
        if any(keyword in haystack for keyword in keywords)
    ]
    return bool(matched), matched
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ZIP processing and DB loading
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _state_abbr_from_zip(zf: zipfile.ZipFile) -> str:
    """Guess the state abbreviation from the archive's member paths.

    Returns the first path component of the first member whose first
    component is exactly two characters long, or ``"??"`` when no member
    qualifies.
    """
    top_levels = (member.split("/")[0] for member in zf.namelist())
    return next((part for part in top_levels if len(part) == 2), "??")
|
||||
|
||||
|
||||
def _bill_to_row(b: dict, session_id, state_abbr: Optional[str]) -> tuple:
    """Flatten one LegiScan bill dict into the column order of the upsert."""
    subjects = [s["subject_name"] for s in (b.get("subjects") or []) if s.get("subject_name")]
    is_rel, tags = score_relevance(
        b.get("title", ""),
        b.get("description", ""),
        subjects,
    )
    return (
        b["bill_id"],
        session_id,
        b.get("state", state_abbr),
        b.get("bill_number"),
        b.get("bill_type"),
        b.get("title"),
        b.get("description"),
        b.get("status"),
        b.get("status_date") or None,  # empty string -> NULL
        b.get("completed", 0),
        b.get("body"),
        b.get("url"),
        b.get("state_link"),
        b.get("change_hash"),
        subjects or None,
        len(b.get("sponsors") or []),
        len(b.get("votes") or []),
        len(b.get("texts") or []),
        is_rel,
        tags or None,
    )


def process_dataset(
    session: dict,
    zip_bytes: bytes,
    conn,
    state_abbr: Optional[str] = None,
    dry_run: bool = False,
    verbose: bool = False,
) -> int:
    """Parse all bill JSONs from a ZIP and upsert them into legiscan_bills.

    Args:
        session: Dataset descriptor; only ``session_id`` is read here.
        zip_bytes: Raw bytes of the dataset ZIP archive.
        conn: Open psycopg2 connection; committed once at the end.
        state_abbr: State abbreviation used when a bill has no ``state``
            field; derived from the archive layout when omitted.
        dry_run: When true, parse and count only; no database writes.
        verbose: When true, log a per-session summary line.

    Returns:
        The number of bills parsed from the archive (not the number of rows
        changed in the database).
    """
    session_id = session["session_id"]

    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        if not state_abbr:
            state_abbr = _state_abbr_from_zip(zf)
        bill_files = [n for n in zf.namelist() if "/bill/" in n and n.endswith(".json")]

        if not bill_files:
            log.warning(f" Session {session_id}: no bill files found in ZIP.")
            return 0

        rows = []
        for fname in bill_files:
            # Best effort per file: a bill that is unreadable or malformed
            # (e.g. no bill_id) is skipped with a warning rather than
            # aborting the whole session.
            try:
                raw = json.loads(zf.read(fname))
                rows.append(_bill_to_row(raw.get("bill", raw), session_id, state_abbr))
            except Exception as e:
                log.warning(f" Could not parse {fname}: {e}")
                continue

    if dry_run:
        log.info(f" [dry-run] Session {session_id} ({state_abbr}): would insert/update {len(rows)} bills")
        return len(rows)

    # On a changed change_hash the descriptive columns are refreshed too, so
    # the stored title/description stay in step with the relevance tags that
    # were computed from them. RETURNING yields exactly the rows inserted or
    # updated (rows skipped by the WHERE clause are not returned).
    UPSERT = """
        INSERT INTO legiscan_bills (
            bill_id, session_id, state, bill_number, bill_type,
            title, description, status, status_date, completed,
            body, url, state_link, change_hash, subjects,
            sponsor_count, vote_count, text_count,
            is_relevant, relevance_tags, imported_at
        ) VALUES %s
        ON CONFLICT (bill_id) DO UPDATE SET
            bill_number = EXCLUDED.bill_number,
            bill_type = EXCLUDED.bill_type,
            title = EXCLUDED.title,
            description = EXCLUDED.description,
            body = EXCLUDED.body,
            url = EXCLUDED.url,
            state_link = EXCLUDED.state_link,
            change_hash = EXCLUDED.change_hash,
            status = EXCLUDED.status,
            status_date = EXCLUDED.status_date,
            completed = EXCLUDED.completed,
            subjects = EXCLUDED.subjects,
            sponsor_count = EXCLUDED.sponsor_count,
            vote_count = EXCLUDED.vote_count,
            text_count = EXCLUDED.text_count,
            is_relevant = EXCLUDED.is_relevant,
            relevance_tags = EXCLUDED.relevance_tags,
            imported_at = NOW()
        WHERE legiscan_bills.change_hash IS DISTINCT FROM EXCLUDED.change_hash
        RETURNING bill_id
    """

    template = "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,NOW())"

    with conn.cursor() as cur:
        # execute_values sends one statement per page, and cur.rowcount only
        # reflects the last page. Collecting the RETURNING rows gives the
        # true number of rows written across all pages.
        changed = psycopg2.extras.execute_values(
            cur, UPSERT, rows, template=template, page_size=500, fetch=True
        )
        count = len(changed)

    # Update session bill_count
    with conn.cursor() as cur:
        cur.execute(
            "UPDATE legiscan_sessions SET bill_count = %s, imported_at = NOW() WHERE session_id = %s",
            (len(rows), session_id),
        )
    conn.commit()

    if verbose:
        is_relevant_col = 18  # position of is_relevant in the row tuple
        relevant = sum(1 for r in rows if r[is_relevant_col])
        log.info(f" Session {session_id} ({state_abbr}): {len(rows)} bills, {relevant} relevant, {count} upserted")
    return len(rows)
|
||||
|
||||
|
||||
def upsert_session(session: dict, state_abbr: str, conn, dry_run: bool = False):
    """Insert a session row, or refresh its dataset metadata if it exists.

    On conflict only ``dataset_hash``, ``dataset_date``, ``dataset_size_mb``
    and ``session_title`` are updated. The change is committed immediately.

    Args:
        session: Dataset descriptor from the LegiScan dataset list.
        state_abbr: State abbreviation to store with the session.
        conn: Open database connection.
        dry_run: When true, do nothing.
    """
    if dry_run:
        return

    statement = """
        INSERT INTO legiscan_sessions
            (session_id, state_id, state_abbr, year_start, year_end,
             session_title, session_tag, is_special, is_prior,
             dataset_hash, dataset_date, dataset_size_mb)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        ON CONFLICT (session_id) DO UPDATE SET
            dataset_hash = EXCLUDED.dataset_hash,
            dataset_date = EXCLUDED.dataset_date,
            dataset_size_mb = EXCLUDED.dataset_size_mb,
            session_title = EXCLUDED.session_title
    """
    record = (
        session["session_id"],
        session["state_id"],
        state_abbr,
        session["year_start"],
        session["year_end"],
        session.get("session_title"),
        session.get("session_tag"),
        bool(session.get("special")),
        bool(session.get("prior")),
        session.get("dataset_hash"),
        session.get("dataset_date"),
        session.get("dataset_size", 0) / 1e6,  # dataset_size / 1e6 -> *_mb column
    )
    with conn.cursor() as cursor:
        cursor.execute(statement, record)
    conn.commit()
|
||||
|
||||
|
||||
def needs_import(session: dict, conn) -> bool:
    """Tell whether a session's dataset still has to be (re)imported.

    Returns:
        ``True`` when the session has no row in ``legiscan_sessions`` or its
        stored ``dataset_hash`` differs from ``session["dataset_hash"]``.
    """
    with conn.cursor() as cursor:
        cursor.execute(
            "SELECT dataset_hash FROM legiscan_sessions WHERE session_id = %s",
            (session["session_id"],),
        )
        stored = cursor.fetchone()
    # An unknown session always needs importing; otherwise compare hashes.
    return stored is None or stored[0] != session["dataset_hash"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Retag phase
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def retag_all_bills(conn, dry_run: bool = False, verbose: bool = False):
    """Recompute ``is_relevant`` / ``relevance_tags`` for every stored bill.

    All bills are read into memory, scored with ``score_relevance`` and
    written back with a single bulk UPDATE, followed by a commit.

    Args:
        conn: Open psycopg2 connection.
        dry_run: When true, only log how many bills would be tagged.
        verbose: Accepted for interface symmetry; not used by this function.
    """
    log.info("Re-tagging all bills…")
    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute("SELECT bill_id, title, description, subjects FROM legiscan_bills")
        bills = cur.fetchall()

    log.info(f" Scoring {len(bills)} bills…")

    def rescore(bill):
        # One (is_relevant, tags, bill_id) tuple in the order the UPDATE expects.
        is_rel, tags = score_relevance(
            bill["title"] or "",
            bill["description"] or "",
            bill["subjects"] or [],
        )
        return (is_rel, tags or None, bill["bill_id"])

    updates = [rescore(bill) for bill in bills]
    relevant = sum(1 for is_rel, _tags, _bill_id in updates if is_rel)

    if dry_run:
        log.info(f" [dry-run] Would tag {relevant}/{len(updates)} bills as relevant")
        return

    with conn.cursor() as cur:
        psycopg2.extras.execute_values(
            cur,
            "UPDATE legiscan_bills SET is_relevant = data.is_rel, relevance_tags = data.tags "
            "FROM (VALUES %s) AS data(is_rel, tags, bill_id) "
            "WHERE legiscan_bills.bill_id = data.bill_id::integer",
            updates,
            template="(%s, %s::text[], %s)",
        )
    conn.commit()

    log.info(f" Tagged {relevant}/{len(updates)} bills as relevant.")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Summary
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def print_summary(conn):
    """Print a plain-text ingestion summary to stdout.

    Emits four headline counts, then the per-tag bill counts, then the 15
    states with the most relevant bills. The two listings are printed only
    when their query returns rows.
    """
    headline_counts = (
        ("Total sessions", "SELECT COUNT(*) FROM legiscan_sessions"),
        ("Total bills", "SELECT COUNT(*) FROM legiscan_bills"),
        ("Relevant bills", "SELECT COUNT(*) FROM legiscan_bills WHERE is_relevant"),
        ("States covered", "SELECT COUNT(DISTINCT state) FROM legiscan_bills"),
    )

    def fetch_rows(sql):
        # Each listing runs on its own short-lived cursor.
        with conn.cursor() as cur:
            cur.execute(sql)
            return cur.fetchall()

    print("\n--- LegiScan ingestion summary ---")
    with conn.cursor() as cur:
        for label, sql in headline_counts:
            cur.execute(sql)
            count = cur.fetchone()[0]
            print(f" {label}: {count:,}")

    # Bill counts per relevance tag
    tag_counts = fetch_rows("""
        SELECT tag, COUNT(*) AS n
        FROM legiscan_bills, unnest(relevance_tags) AS tag
        GROUP BY tag ORDER BY n DESC
    """)
    if tag_counts:
        print("\n Relevant bills by tag:")
        for tag, n in tag_counts:
            print(f" {tag:<30} {n:>6,}")

    # Top states for relevant bills
    state_counts = fetch_rows("""
        SELECT state, COUNT(*) AS n
        FROM legiscan_bills WHERE is_relevant
        GROUP BY state ORDER BY n DESC LIMIT 15
    """)
    if state_counts:
        print("\n Top states by relevant bill count:")
        for state, n in state_counts:
            print(f" {state} {n:>5,}")
    print()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    The module docstring is used verbatim as the help description. Returns
    the parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Phase switches: plain boolean flags that differ only in name and help text.
    phase_flags = (
        ("--all", "Run setup-db + fetch + load + tag"),
        ("--setup-db", "Create/update DB tables"),
        ("--fetch", "Download dataset ZIPs"),
        ("--load", "Load cached ZIPs into DB"),
        ("--tag", "Retag all bills for relevance"),
    )
    for flag, help_text in phase_flags:
        parser.add_argument(flag, action="store_true", help=help_text)

    parser.add_argument("--state", default=None, metavar="XX", help="Limit to one state")
    parser.add_argument("--year-start", type=int, default=MIN_YEAR_DEFAULT, dest="year_start")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--verbose", action="store_true")
    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: run the requested ingestion phases.

    Phases (selected by flags, ``--all`` enables every one):
      * setup: create/update the DB tables via ``setup_db``;
      * fetch/load: per session, download the dataset ZIP and/or load it
        into the DB;
      * tag: re-score all stored bills. Only runs when neither fetch nor
        load was requested.

    In dry-run mode no DB connection is opened (``conn`` is ``None``).
    Exits with status 1 when ``API_KEY`` is falsy or no phase was selected.
    The connection is closed in a ``finally`` block so it is released even
    when a phase raises.
    """
    args = parse_args()
    if args.verbose:
        log.setLevel(logging.DEBUG)

    if not API_KEY:
        log.error("LEGISCAN_API_KEY is not set.")
        sys.exit(1)

    do_setup = args.all or args.setup_db
    do_fetch = args.all or args.fetch
    do_load = args.all or args.load
    do_tag = args.all or args.tag

    if not any([do_setup, do_fetch, do_load, do_tag]):
        log.error("Specify at least one phase: --all, --setup-db, --fetch, --load, --tag")
        sys.exit(1)

    conn = None if args.dry_run else get_db_connection()

    try:
        # ── Setup ──────────────────────────────────────────────────────────
        if do_setup:
            if args.dry_run:
                log.info("[dry-run] Would create legiscan_sessions and legiscan_bills tables.")
            else:
                setup_db(conn)

        # ── Fetch + Load (interleaved per session for memory efficiency) ────
        if do_fetch or do_load:
            sessions = get_all_dataset_metadata(args.year_start, state_filter=args.state)
            total = len(sessions)
            log.info(f"Processing {total} sessions (year_start ≥ {args.year_start})…")

            total_bills = 0
            skipped = 0

            for i, session in enumerate(sessions, 1):
                session_id = session["session_id"]
                title = session.get("session_title", "")

                # Skip sessions whose stored dataset hash is already current.
                if do_load and not args.dry_run and conn and not needs_import(session, conn):
                    log.debug(f" [{i}/{total}] Session {session_id} ({title}) — hash unchanged, skipping.")
                    skipped += 1
                    continue

                log.info(f"[{i}/{total}] Session {session_id}: {title}")

                # Download
                zip_bytes = None
                if do_fetch:
                    try:
                        zip_bytes, api_called = download_dataset_zip(session, dry_run=args.dry_run)
                        # Only pause when the API was actually contacted.
                        if api_called:
                            time.sleep(RATE_LIMIT_DELAY)
                    except Exception as e:
                        # Best effort per session: log and move on to the next one.
                        log.error(f" Download failed for session {session_id}: {e}")
                        continue
                elif do_load:
                    # Load from cache only
                    cache_path = CACHE_DIR / f"{session_id}_{session['dataset_hash']}.zip"
                    if not cache_path.exists():
                        log.warning(f" Cache miss for session {session_id} — run --fetch first.")
                        continue
                    zip_bytes = cache_path.read_bytes()

                # Derive state abbreviation from ZIP structure
                state_abbr = args.state
                if zip_bytes and not state_abbr:
                    try:
                        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
                            state_abbr = _state_abbr_from_zip(zf)
                    except Exception:
                        state_abbr = "??"

                # Upsert session record
                # NOTE(review): this runs before the bills are loaded. If
                # upsert_session() commits the new dataset_hash and
                # process_dataset() then fails, needs_import() would report the
                # session as up to date on the next run — confirm and reorder
                # or reset the hash on failure if so.
                if do_load and not args.dry_run and conn and state_abbr:
                    upsert_session(session, state_abbr, conn, dry_run=args.dry_run)

                # Load bills
                if do_load and zip_bytes:
                    try:
                        n = process_dataset(
                            session, zip_bytes, conn,
                            state_abbr=state_abbr,
                            dry_run=args.dry_run,
                            verbose=args.verbose,
                        )
                        total_bills += n
                    except Exception as e:
                        log.error(f" Load failed for session {session_id}: {e}")
                        if conn:
                            conn.rollback()

            log.info(f"Fetch/load complete. Bills processed: {total_bills:,}. Skipped (up-to-date): {skipped}.")

        # ── Tag ────────────────────────────────────────────────────────────
        if do_tag and not (do_fetch or do_load):
            if args.dry_run or conn:
                retag_all_bills(conn, dry_run=args.dry_run, verbose=args.verbose)

        # ── Summary ────────────────────────────────────────────────────────
        if conn and not args.dry_run:
            print_summary(conn)
    finally:
        # Release the connection on every exit path, not just on success.
        if conn is not None:
            conn.close()
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
293
scripts/load_postgis_data_centers.py
Normal file
293
scripts/load_postgis_data_centers.py
Normal file
@@ -0,0 +1,293 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
from decimal import Decimal
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
|
||||
# Default input files; main() picks one by --source unless --csv-path is given.
CSV_PATH = "US_DC_Sample_geocoded.csv"
IM3_CSV_PATH = "new/IM3_Existing_DataCenters.csv"
# Target table and database name.
TABLE = "public.us_dc_sample_geocoded"
DB_NAME = "data_centers"

# Columns written by insert_values(), in insert order. The normalizers emit
# one value per name listed here.
ALL_COLS = [
    "id",
    "provider",
    "facility_name",
    "url",
    "provider_url",
    "country",
    "state",
    "state_code",
    "city",
    "postal_code",
    "street_address",
    "address",
    "source_address",
    "phone",
    "area_sqft",
    "power_mw",
    "nearest_airport_miles",
    "has_bare_metal",
    "has_iaas",
    "has_internet_exchange",
    "has_colocation",
    "certifications",
    "content_summary",
    "path",
    "longitude",
    "latitude",
    "geocode_source",
    "geocode_precision",
    "geocode_status",
    "geocode_match_address",
    "census_status",
    "census_match_type",
    "census_input_address",
    "census_tiger_line_id",
    "census_side",
    "nominatim_display_name",
    "nominatim_osm_type",
    "nominatim_osm_id",
]

# Per-type column sets consulted by convert(); columns in none of these sets
# are passed through as text.
INT_COLS = {"area_sqft", "census_tiger_line_id", "nominatim_osm_id"}
NUM_COLS = {"power_mw", "nearest_airport_miles", "longitude", "latitude"}
BOOL_COLS = {
    "has_bare_metal",
    "has_iaas",
    "has_internet_exchange",
    "has_colocation",
}
|
||||
|
||||
|
||||
def to_int(value):
    """Convert a CSV cell to ``int``; ``None`` and ``""`` become ``None``.

    Parsing goes through ``Decimal`` so decimal-formatted text such as
    ``"12.0"`` is accepted; ``int()`` then truncates any fractional part.
    """
    is_blank = value is None or value == ""
    return None if is_blank else int(Decimal(value))
|
||||
|
||||
|
||||
def to_decimal(value):
    """Convert a CSV cell to ``Decimal``; ``None`` and ``""`` become ``None``."""
    if value is None or value == "":
        return None
    return Decimal(value)
|
||||
|
||||
|
||||
def to_bool(value):
    """Convert a CSV cell to ``bool``; ``None`` and ``""`` become ``None``.

    Accepted spellings for text input (case-insensitive, surrounding
    whitespace ignored):
      * integers such as ``"1"`` / ``"0"`` (any non-zero integer is True);
      * ``true/t/yes/y`` and ``false/f/no/n``;
      * float-formatted flags such as ``"1.0"`` / ``"0.0"``.

    Non-string values keep the previous ``bool(int(value))`` behaviour.

    Raises:
        ValueError: if the text is none of the spellings above.
    """
    if value in (None, ""):
        return None
    if not isinstance(value, str):
        return bool(int(value))

    text = value.strip().lower()
    if text in ("true", "t", "yes", "y"):
        return True
    if text in ("false", "f", "no", "n"):
        return False
    try:
        return bool(int(text))
    except ValueError:
        # Float-formatted flags ("1.0"); float() re-raises ValueError for junk.
        return bool(int(float(text)))
|
||||
|
||||
|
||||
def convert(row, column):
    """Return ``row[column]`` converted to the type its column requires.

    Columns in ``INT_COLS``, ``NUM_COLS`` and ``BOOL_COLS`` go through
    ``to_int``, ``to_decimal`` and ``to_bool`` respectively. Any other
    column is returned unchanged, except that ``""`` becomes ``None``.
    A missing key yields ``None``.
    """
    raw = row.get(column)
    typed_converters = (
        (INT_COLS, to_int),
        (NUM_COLS, to_decimal),
        (BOOL_COLS, to_bool),
    )
    for typed_columns, caster in typed_converters:
        if column in typed_columns:
            return caster(raw)
    return None if raw == "" else raw
|
||||
|
||||
|
||||
def normalize_geocoded_row(row):
    """Project *row* onto ``ALL_COLS``, using ``""`` for any missing column."""
    normalized = {}
    for column in ALL_COLS:
        normalized[column] = row.get(column, "")
    return normalized
|
||||
|
||||
|
||||
def normalize_im3_row(row):
    """Map one IM3 CSV row onto the geocoded-table column layout.

    Fields the IM3 row supplies are copied across under their target names,
    a handful of provenance columns get fixed values, and every other
    column is left as ``""``. ``geocode_precision`` takes the row's ``type``
    value, falling back to ``"im3"`` when that is missing or empty.
    """
    # Begin with every target column blank, in table column order.
    normalized = dict.fromkeys(
        (
            "id", "provider", "facility_name", "url", "provider_url",
            "country", "state", "state_code", "city", "postal_code",
            "street_address", "address", "source_address", "phone",
            "area_sqft", "power_mw", "nearest_airport_miles",
            "has_bare_metal", "has_iaas", "has_internet_exchange",
            "has_colocation", "certifications", "content_summary", "path",
            "longitude", "latitude", "geocode_source", "geocode_precision",
            "geocode_status", "geocode_match_address", "census_status",
            "census_match_type", "census_input_address",
            "census_tiger_line_id", "census_side", "nominatim_display_name",
            "nominatim_osm_type", "nominatim_osm_id",
        ),
        "",
    )

    # (target column, IM3 source column) pairs copied as-is.
    copied_fields = (
        ("id", "id"),
        ("provider", "operator"),
        ("facility_name", "name"),
        ("state", "state"),
        ("state_code", "state_abb"),
        ("area_sqft", "sqft"),
        ("longitude", "lon"),
        ("latitude", "lat"),
    )
    for target, source in copied_fields:
        normalized[target] = row.get(source, "")

    # Fixed provenance values for IM3-sourced rows.
    normalized["country"] = "United States"
    normalized["path"] = "IM3_Existing_DataCenters.csv"
    normalized["geocode_source"] = "IM3_Existing_DataCenters"
    normalized["geocode_precision"] = row.get("type", "") or "im3"
    normalized["geocode_status"] = "im3_imported"
    return normalized
|
||||
|
||||
|
||||
def read_and_normalize_rows(csv_path, source):
    """Read a CSV file and turn it into insert-ready value tuples.

    Args:
        csv_path: Path of the CSV file (read as UTF-8).
        source: ``"im3"`` to normalize with ``normalize_im3_row``; any
            other value uses ``normalize_geocoded_row``.

    Returns:
        ``(rows, values)``: the raw ``csv.DictReader`` rows, and one tuple
        per unique id with values converted and ordered as ``ALL_COLS``.
        Rows with a blank id are dropped; for a repeated id the last row wins.
    """
    with open(csv_path, newline="", encoding="utf-8") as csv_file:
        rows = list(csv.DictReader(csv_file))

    normalizer = normalize_im3_row if source == "im3" else normalize_geocoded_row

    unique_rows = {}
    for raw_row in rows:
        candidate = normalizer(raw_row)
        key = (candidate.get("id") or "").strip()
        if key:
            unique_rows[key] = candidate

    values = [
        tuple(convert(candidate, column) for column in ALL_COLS)
        for candidate in unique_rows.values()
    ]
    return rows, values
|
||||
|
||||
|
||||
def create_table(cur):
    """Create the target table ``TABLE`` using cursor *cur*.

    The table has one column per entry in ``ALL_COLS`` plus ``geom``, a
    stored generated PostGIS point (SRID 4326) built from ``longitude`` and
    ``latitude``, both of which are declared NOT NULL. The statement is a
    plain ``create table``, so it fails if the table already exists.
    """
    cur.execute(
        f"""
        create table {TABLE} (
            id text primary key,
            provider text,
            facility_name text,
            url text,
            provider_url text,
            country text,
            state text,
            state_code text,
            city text,
            postal_code text,
            street_address text,
            address text,
            source_address text,
            phone text,
            area_sqft integer,
            power_mw numeric,
            nearest_airport_miles numeric,
            has_bare_metal boolean,
            has_iaas boolean,
            has_internet_exchange boolean,
            has_colocation boolean,
            certifications text,
            content_summary text,
            path text,
            longitude double precision not null,
            latitude double precision not null,
            geocode_source text,
            geocode_precision text,
            geocode_status text,
            geocode_match_address text,
            census_status text,
            census_match_type text,
            census_input_address text,
            census_tiger_line_id bigint,
            census_side text,
            nominatim_display_name text,
            nominatim_osm_type text,
            nominatim_osm_id bigint,
            geom geometry(Point, 4326) generated always as
                (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
        )
        """
    )
|
||||
|
||||
|
||||
def insert_values(cur, values, upsert):
    """Bulk-insert *values* into ``TABLE`` in pages of 100 rows.

    Args:
        cur: Open database cursor.
        values: Sequence of tuples ordered as ``ALL_COLS``.
        upsert: When truthy, an id conflict updates every non-id column of
            the existing row; otherwise a plain insert is issued.
    """
    column_list = ", ".join(ALL_COLS)
    statement = f"insert into {TABLE} ({column_list}) values %s"
    if upsert:
        assignments = ", ".join(
            f"{col} = excluded.{col}" for col in ALL_COLS if col != "id"
        )
        statement = f"{statement} on conflict (id) do update set {assignments}"
    execute_values(cur, statement, values, page_size=100)
|
||||
|
||||
|
||||
def parse_args():
    """Parse the loader's command-line options from ``sys.argv``.

    Options: ``--source`` (``geocoded`` or ``im3``, default ``geocoded``),
    ``--csv-path``, and the boolean flags ``--append`` and ``--upsert``.
    """
    parser = argparse.ArgumentParser(
        description="Load data-center CSV data into public.us_dc_sample_geocoded."
    )
    # Option name -> add_argument keyword arguments, in help-output order.
    option_specs = {
        "--source": {
            "choices": ["geocoded", "im3"],
            "default": "geocoded",
            "help": "Input schema type. Use 'im3' for new/IM3_Existing_DataCenters.csv.",
        },
        "--csv-path": {
            "help": "Override input CSV path. If omitted, uses a source-specific default.",
        },
        "--append": {
            "action": "store_true",
            "help": "Append/upsert into an existing target table instead of creating a new one.",
        },
        "--upsert": {
            "action": "store_true",
            "help": "On id conflicts, update the existing row. Recommended with --append.",
        },
    }
    for flag, spec in option_specs.items():
        parser.add_argument(flag, **spec)
    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Load a data-center CSV into ``TABLE``.

    Reads and normalizes the CSV first, then connects using the
    ``PGWEB_HOST`` / ``PGWEB_PORT`` / ``PGWEB_USER`` / ``PGWEB_PASSWORD``
    environment variables. When the table is missing it is created together
    with its two indexes; when it exists, ``--append`` is required.

    Raises:
        RuntimeError: if the table already exists and ``--append`` was not given.
        KeyError: if one of the connection environment variables is unset.
    """
    args = parse_args()
    csv_path = args.csv_path or (IM3_CSV_PATH if args.source == "im3" else CSV_PATH)
    _, values = read_and_normalize_rows(csv_path, args.source)

    connection_settings = {
        "host": os.environ["PGWEB_HOST"],
        "port": os.environ["PGWEB_PORT"],
        "user": os.environ["PGWEB_USER"],
        "password": os.environ["PGWEB_PASSWORD"],
        "dbname": DB_NAME,
    }
    conn = psycopg2.connect(**connection_settings)

    try:
        # "with conn" wraps the work in a transaction; the connection itself
        # is closed in the finally clause below.
        with conn, conn.cursor() as cur:
            cur.execute("create extension if not exists postgis")
            cur.execute("select to_regclass(%s)", (TABLE,))
            table_missing = cur.fetchone()[0] is None

            if table_missing:
                create_table(cur)
                index_statements = (
                    f"create index us_dc_sample_geocoded_geom_gix on {TABLE} using gist (geom)",
                    f"create index us_dc_sample_geocoded_state_city_idx on {TABLE} (state_code, city)",
                )
                for statement in index_statements:
                    cur.execute(statement)
            elif not args.append:
                raise RuntimeError(
                    f"Target table {TABLE} already exists; use --append to add data."
                )

            insert_values(cur, values, upsert=args.upsert)
            cur.execute(f"analyze {TABLE}")
    finally:
        conn.close()

    if args.source == "im3":
        source_label = "IM3-adapted"
    else:
        source_label = "geocoded"
    mode = "append" if args.append else "create"
    conflict_mode = "upsert" if args.upsert else "insert"
    print(
        f"loaded {len(values)} {source_label} rows into {TABLE} "
        f"(mode={mode}, conflict={conflict_mode}, csv={csv_path})"
    )
|
||||
|
||||
|
||||
# Run the loader only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
428
scripts/load_postgis_internet_cables.py
Normal file
428
scripts/load_postgis_internet_cables.py
Normal file
@@ -0,0 +1,428 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Load internet_cables/*.json into PostGIS.
|
||||
|
||||
Reads:
|
||||
- internet_cables/all_cables.json -> public.internet_cables (+ landing points)
|
||||
- internet_cables/city_dominance_2026.json -> public.internet_city_dominance
|
||||
- internet_cables/year-summaries.json -> public.internet_cable_year_summaries
|
||||
- internet_cables/meta.json -> public.internet_cable_meta
|
||||
|
||||
Requires env vars: PGWEB_HOST, PGWEB_PORT, PGWEB_USER, PGWEB_PASSWORD.
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from decimal import Decimal
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import Json, execute_values
|
||||
|
||||
|
||||
# Default directory holding the JSON inputs (overridable with --data-dir).
DATA_DIR = "internet_cables"
DB_NAME = "data_centers"

# Fully qualified names of the tables this script creates and fills.
CABLES_TABLE = "public.internet_cables"
LANDINGS_TABLE = "public.internet_cable_landing_points"
CITY_TABLE = "public.internet_city_dominance"
YEAR_TABLE = "public.internet_cable_year_summaries"
META_TABLE = "public.internet_cable_meta"
|
||||
|
||||
# First run of digits/commas/dots that is followed (optionally after
# whitespace) by "km", case-insensitively — e.g. "1,234 km".
LENGTH_KM_RE = re.compile(r"([\d,\.]+)\s*km", re.IGNORECASE)


def parse_length_km(raw):
    """Extract a kilometre length from free text such as ``"1,234 km"``.

    Returns:
        The number preceding ``km`` as a ``Decimal`` with thousands
        separators removed, or ``None`` when *raw* is empty, contains no
        ``<number> km`` fragment, or the fragment is not a valid number
        (for example ``"1.2.3 km"``).
    """
    # Local import: the module itself only imports Decimal from decimal.
    from decimal import InvalidOperation

    if not raw:
        return None
    match = LENGTH_KM_RE.search(raw)
    if not match:
        return None
    try:
        return Decimal(match.group(1).replace(",", ""))
    except InvalidOperation:
        # The pattern admits shapes like "." or "1.2.3" that Decimal rejects.
        return None
|
||||
|
||||
|
||||
def to_int(value):
    """Best-effort ``int`` conversion.

    Returns ``None`` for ``None``, ``""``, and anything ``int()`` rejects
    with ``TypeError`` or ``ValueError``.
    """
    if value is None or value == "":
        return None
    try:
        result = int(value)
    except (TypeError, ValueError):
        result = None
    return result
|
||||
|
||||
|
||||
def to_bool(value):
    """Return ``bool(value)``, keeping ``None`` as ``None``."""
    return None if value is None else bool(value)
|
||||
|
||||
|
||||
def linestring_to_wkt(coords):
    """Render a sequence of positions as a parenthesised WKT coordinate list.

    Each position is indexed rather than unpacked, so GeoJSON positions that
    carry an optional third element (altitude) are accepted; only the first
    two elements, longitude and latitude, are written.

    Example: ``[[1, 2], [3, 4, 9]]`` -> ``"(1 2, 3 4)"``.
    """
    return "(" + ", ".join(f"{point[0]} {point[1]}" for point in coords) + ")"
|
||||
|
||||
|
||||
def feature_to_multilinestring_wkt(geometry):
    """Convert a GeoJSON geometry dict to ``MULTILINESTRING`` WKT text.

    ``LineString`` geometries are treated as a one-line multi-line. Lines
    with fewer than two positions are dropped.

    Returns:
        The WKT string, or ``None`` for any other geometry type or when no
        usable line remains.
    """
    gtype = geometry.get("type")
    coords = geometry.get("coordinates") or []

    if gtype == "LineString":
        lines = [coords]
    elif gtype == "MultiLineString":
        lines = coords
    else:
        return None

    parts = [linestring_to_wkt(line) for line in lines if len(line) >= 2]
    if not parts:
        return None
    return "MULTILINESTRING(" + ", ".join(parts) + ")"
|
||||
|
||||
|
||||
def create_cable_tables(cur):
    """Create the cable table, the landing-point table, and their indexes.

    ``CABLES_TABLE`` is keyed by ``feature_id`` and stores the geometry as a
    MultiLineString in SRID 4326. ``LANDINGS_TABLE`` references it with
    ``on delete cascade`` and is keyed by ``(feature_id, ordinal)``. All
    statements are plain ``create`` statements, so they fail if the objects
    already exist.
    """
    cur.execute(
        f"""
        create table {CABLES_TABLE} (
            feature_id text primary key,
            cable_id text,
            name text,
            color text,
            owners text,
            rfs_year integer,
            decommission_year integer,
            length_raw text,
            length_km numeric,
            cable_type text,
            url text,
            extra_urls jsonb,
            properties jsonb,
            geom geometry(MultiLineString, 4326)
        )
        """
    )
    cur.execute(
        f"create index internet_cables_geom_gix on {CABLES_TABLE} using gist (geom)"
    )
    cur.execute(
        f"create index internet_cables_cable_id_idx on {CABLES_TABLE} (cable_id)"
    )
    cur.execute(
        f"create index internet_cables_rfs_year_idx on {CABLES_TABLE} (rfs_year)"
    )

    # Landing points: one row per (cable feature, position in its landing list).
    cur.execute(
        f"""
        create table {LANDINGS_TABLE} (
            feature_id text references {CABLES_TABLE}(feature_id) on delete cascade,
            ordinal integer,
            landing_id text,
            name text,
            country text,
            is_tbd boolean,
            primary key (feature_id, ordinal)
        )
        """
    )
    cur.execute(
        f"create index internet_cable_landings_landing_id_idx on {LANDINGS_TABLE} (landing_id)"
    )
    cur.execute(
        f"create index internet_cable_landings_country_idx on {LANDINGS_TABLE} (country)"
    )
|
||||
|
||||
|
||||
def create_city_table(cur):
    """Create ``CITY_TABLE`` and its indexes.

    ``geom`` is a stored generated PostGIS point (SRID 4326) derived from
    the nullable ``longitude`` / ``latitude`` columns. A GiST index on
    ``geom`` and a plain index on ``country`` are added.
    """
    cur.execute(
        f"""
        create table {CITY_TABLE} (
            id text primary key,
            city text,
            country text,
            country_name text,
            region text,
            status text,
            physical_capacity_tbps numeric,
            added_physical_capacity_tbps numeric,
            logical_dominance_ips bigint,
            top_asns jsonb,
            longitude double precision,
            latitude double precision,
            geom geometry(Point, 4326) generated always as
                (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
        )
        """
    )
    cur.execute(
        f"create index internet_city_dominance_geom_gix on {CITY_TABLE} using gist (geom)"
    )
    cur.execute(
        f"create index internet_city_dominance_country_idx on {CITY_TABLE} (country)"
    )
|
||||
|
||||
|
||||
def create_year_table(cur):
    """Create ``YEAR_TABLE``: one text description per integer year."""
    cur.execute(
        f"""
        create table {YEAR_TABLE} (
            year integer primary key,
            description text
        )
        """
    )
|
||||
|
||||
|
||||
def create_meta_table(cur):
    """Create ``META_TABLE``: a key/value table with text keys and values."""
    cur.execute(
        f"""
        create table {META_TABLE} (
            key text primary key,
            value text
        )
        """
    )
|
||||
|
||||
|
||||
def load_cables(cur, path):
    """Load cable features and their landing points from a JSON file.

    The file is iterated as a sequence of feature dicts with ``properties``
    and ``geometry`` members. Each feature becomes one ``CABLES_TABLE`` row;
    each entry of its ``landing_points`` becomes one ``LANDINGS_TABLE`` row
    numbered from 0.

    Feature ids come from ``properties.feature_id``, then ``properties.id``,
    then ``legacy-<index>``; an id already used in this file gets a
    ``-<n>`` suffix so the primary key stays unique.

    Returns:
        ``(cable_row_count, landing_row_count)``.
    """
    with open(path, encoding="utf-8") as fh:
        features = json.load(fh)

    cable_rows, landing_rows = [], []
    seen_ids = set()

    for position, feature in enumerate(features):
        props = feature.get("properties") or {}

        feature_id = props.get("feature_id") or props.get("id") or f"legacy-{position}"
        # Disambiguate any residual collisions
        stem, attempt = feature_id, 1
        while feature_id in seen_ids:
            feature_id = f"{stem}-{attempt}"
            attempt += 1
        seen_ids.add(feature_id)

        # length may also live in a top-level lengthKm field on legacy entries
        length_raw = props.get("length")
        length_km = parse_length_km(length_raw)
        legacy_length = feature.get("lengthKm")
        if length_km is None and legacy_length is not None:
            try:
                length_km = Decimal(str(legacy_length))
            except Exception:
                pass

        cable_rows.append(
            (
                feature_id,
                props.get("id"),
                props.get("name"),
                props.get("color"),
                props.get("owners"),
                to_int(props.get("rfs_year")),
                to_int(props.get("decommission_year")),
                length_raw,
                length_km,
                props.get("type"),
                props.get("url"),
                Json(props.get("extraUrls") or []),
                Json(props),
                feature_to_multilinestring_wkt(feature.get("geometry") or {}),
            )
        )

        landing_rows.extend(
            (
                feature_id,
                ordinal,
                lp.get("id") or None,
                lp.get("name"),
                lp.get("country"),
                to_bool(lp.get("is_tbd")),
            )
            for ordinal, lp in enumerate(props.get("landing_points") or [])
        )

    # Thirteen plain placeholders, then the WKT text wrapped in ST_GeomFromText.
    cable_template = "(" + ", ".join(["%s"] * 13) + ", ST_GeomFromText(%s, 4326))"
    execute_values(
        cur,
        f"""
        insert into {CABLES_TABLE} (
            feature_id, cable_id, name, color, owners, rfs_year, decommission_year,
            length_raw, length_km, cable_type, url, extra_urls, properties, geom
        ) values %s
        """,
        cable_rows,
        template=cable_template,
        page_size=200,
    )

    execute_values(
        cur,
        f"""
        insert into {LANDINGS_TABLE} (feature_id, ordinal, landing_id, name, country, is_tbd)
        values %s
        """,
        landing_rows,
        page_size=500,
    )

    return len(cable_rows), len(landing_rows)
|
||||
|
||||
|
||||
def load_city_dominance(cur, path):
    """Load city-dominance records from a JSON file into ``CITY_TABLE``.

    Items without an ``id``, and repeats of an id already seen, are skipped.
    ``coordinates`` is read as ``[longitude, latitude]``; missing parts
    become ``None``.

    Returns:
        The number of rows inserted.
    """
    with open(path, encoding="utf-8") as fh:
        items = json.load(fh)

    # Item fields copied straight into same-named columns, in column order.
    scalar_fields = (
        "city",
        "country",
        "country_name",
        "region",
        "status",
        "physical_capacity_tbps",
        "added_physical_capacity_tbps",
        "logical_dominance_ips",
    )

    rows = []
    seen = set()
    for item in items:
        item_id = item.get("id")
        if not item_id or item_id in seen:
            continue
        seen.add(item_id)

        coords = item.get("coordinates") or [None, None]
        # Pad so a short coordinate list still unpacks into two values.
        lon, lat = (coords + [None, None])[:2]
        rows.append(
            (
                item_id,
                *(item.get(field) for field in scalar_fields),
                Json(item.get("top_asns") or []),
                lon,
                lat,
            )
        )

    execute_values(
        cur,
        f"""
        insert into {CITY_TABLE} (
            id, city, country, country_name, region, status,
            physical_capacity_tbps, added_physical_capacity_tbps,
            logical_dominance_ips, top_asns, longitude, latitude
        ) values %s
        """,
        rows,
        page_size=500,
    )
    return len(rows)
|
||||
|
||||
|
||||
def load_year_summaries(cur, path):
    """Load per-year descriptions from a JSON object into ``YEAR_TABLE``.

    Keys that ``to_int`` cannot convert are skipped. A dict value
    contributes its ``description`` member; any other value is stored as
    ``str(value)``.

    Returns:
        The number of rows inserted.
    """
    with open(path, encoding="utf-8") as fh:
        data = json.load(fh)

    rows = []
    for raw_year, summary in data.items():
        year = to_int(raw_year)
        if year is not None:
            if isinstance(summary, dict):
                description = summary.get("description")
            else:
                description = str(summary)
            rows.append((year, description))

    insert_sql = f"insert into {YEAR_TABLE} (year, description) values %s"
    execute_values(cur, insert_sql, rows, page_size=200)
    return len(rows)
|
||||
|
||||
|
||||
def load_meta(cur, path):
    """Load a JSON object into ``META_TABLE`` as text key/value pairs.

    Keys and values are both passed through ``str()``.

    Returns:
        The number of rows inserted.
    """
    with open(path, encoding="utf-8") as fh:
        data = json.load(fh)

    rows = []
    for key, value in data.items():
        rows.append((str(key), str(value)))

    insert_sql = f"insert into {META_TABLE} (key, value) values %s"
    execute_values(cur, insert_sql, rows)
    return len(rows)
|
||||
|
||||
|
||||
def parse_args():
    """Parse the loader's command-line options from ``sys.argv``.

    Options: ``--data-dir`` (default ``DATA_DIR``) and the boolean flag
    ``--replace``.
    """
    parser = argparse.ArgumentParser(
        description="Load internet_cables/*.json into PostGIS."
    )
    data_dir_help = f"Directory containing the JSON files (default: {DATA_DIR})"
    parser.add_argument("--data-dir", default=DATA_DIR, help=data_dir_help)
    parser.add_argument(
        "--replace",
        action="store_true",
        help="Drop existing target tables before loading.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Create the internet-cable tables and load all four JSON files.

    All input files must exist before the database is touched. With
    ``--replace`` the target tables are dropped first; otherwise an existing
    target table aborts the run. The connection is configured by the
    ``PGWEB_HOST`` / ``PGWEB_PORT`` / ``PGWEB_USER`` / ``PGWEB_PASSWORD``
    environment variables.

    Raises:
        FileNotFoundError: if one of the JSON files is missing.
        RuntimeError: if a target table exists and ``--replace`` was not given.
    """
    args = parse_args()

    file_names = (
        "all_cables.json",
        "city_dominance_2026.json",
        "year-summaries.json",
        "meta.json",
    )
    input_paths = [os.path.join(args.data_dir, name) for name in file_names]
    cables_path, city_path, year_path, meta_path = input_paths

    for path in input_paths:
        if not os.path.exists(path):
            raise FileNotFoundError(path)

    conn = psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname=DB_NAME,
    )

    # (table checked for existence, function that creates it)
    table_creators = (
        (CABLES_TABLE, create_cable_tables),
        (CITY_TABLE, create_city_table),
        (YEAR_TABLE, create_year_table),
        (META_TABLE, create_meta_table),
    )
    all_tables = (CABLES_TABLE, LANDINGS_TABLE, CITY_TABLE, YEAR_TABLE, META_TABLE)

    try:
        # "with conn" wraps the work in a transaction; the connection itself
        # is closed in the finally clause below.
        with conn, conn.cursor() as cur:
            cur.execute("create extension if not exists postgis")

            if args.replace:
                cur.execute(
                    f"drop table if exists {LANDINGS_TABLE}, {CABLES_TABLE}, "
                    f"{CITY_TABLE}, {YEAR_TABLE}, {META_TABLE} cascade"
                )

            for table, creator in table_creators:
                cur.execute("select to_regclass(%s)", (table,))
                already_there = cur.fetchone()[0] is not None
                if already_there:
                    raise RuntimeError(
                        f"Target table {table} already exists; rerun with --replace to overwrite."
                    )
                creator(cur)

            cable_count, landing_count = load_cables(cur, cables_path)
            city_count = load_city_dominance(cur, city_path)
            year_count = load_year_summaries(cur, year_path)
            meta_count = load_meta(cur, meta_path)

            for table in all_tables:
                cur.execute(f"analyze {table}")
    finally:
        conn.close()

    print(
        f"loaded {cable_count} cables, {landing_count} landing points, "
        f"{city_count} city-dominance points, {year_count} year summaries, "
        f"{meta_count} meta rows."
    )
|
||||
|
||||
|
||||
# Run the loader only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
376
scripts/load_postgis_osm_data_centers.py
Normal file
376
scripts/load_postgis_osm_data_centers.py
Normal file
@@ -0,0 +1,376 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fetch US data centers from OpenStreetMap (Overpass API) and load them into
|
||||
public.osm_data_centers in the data_centers database. Also (re)creates a
|
||||
unioned view public.data_centers_union combining OSM + curated rows from
|
||||
public.us_dc_sample_geocoded.
|
||||
|
||||
Two Overpass passes are made because tagging is inconsistent:
|
||||
1) telecom=data_center
|
||||
2) building=data_center
|
||||
|
||||
Results are deduplicated by (osm_type, osm_id); the matched tag-pass is recorded
|
||||
in matched_tags so we can see which query found each feature.
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import psycopg2
|
||||
import requests
|
||||
from psycopg2.extras import Json, execute_values
|
||||
|
||||
# Overpass API endpoint that fetch_pass() posts queries to.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
# Database objects named in the module docstring: the OSM table, the unioned
# view, and the curated table.
TABLE = "public.osm_data_centers"
VIEW = "public.data_centers_union"
CURATED_TABLE = "public.us_dc_sample_geocoded"
DB_NAME = "data_centers"

# Tag passes: (key, value)
TAG_PASSES = [
    ("telecom", "data_center"),
    ("building", "data_center"),
]
|
||||
|
||||
|
||||
def overpass_query(tag_key: str, tag_value: str, timeout: int = 180) -> str:
    """Build the Overpass QL query for one tag pass.

    The query selects nodes, ways and relations tagged
    ``tag_key=tag_value`` inside the area with ``ISO3166-1=US`` and
    ``admin_level=2``, and asks for tags plus a centre point.

    Args:
        tag_key: OSM tag key to match.
        tag_value: OSM tag value to match.
        timeout: Value written into the query's ``[timeout:...]`` setting.
    """
    selector = f'["{tag_key}"="{tag_value}"](area.us);'
    statements = [
        f"[out:json][timeout:{timeout}];",
        'area["ISO3166-1"="US"][admin_level=2]->.us;',
        "(",
        f"node{selector}",
        f"way{selector}",
        f"relation{selector}",
        ");",
        "out center tags;",
    ]
    return "\n".join(statements)
|
||||
|
||||
|
||||
def fetch_pass(tag_key: str, tag_value: str, cache_path: Optional[str]) -> List[dict]:
    """Return the Overpass elements for one tag pass, using a file cache.

    When *cache_path* names an existing file, its JSON content is used and
    no request is made. Otherwise the query is posted to ``OVERPASS_URL``
    and, if *cache_path* is set, the decoded response is written there.

    Returns:
        The response's ``elements`` list (empty when the key is absent).

    Raises:
        requests.HTTPError: via ``raise_for_status()`` on an error response.
    """
    if cache_path and os.path.exists(cache_path):
        print(f" using cached response: {cache_path}")
        with open(cache_path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
    else:
        print(f" querying Overpass for {tag_key}={tag_value} ...")
        resp = requests.post(
            OVERPASS_URL,
            data={"data": overpass_query(tag_key, tag_value)},
            headers={
                "User-Agent": "us-data-centers-inventory/1.0 (research; contact david@dadams.io)",
                "Accept": "application/json",
            },
            timeout=240,
        )
        if resp.status_code != 200:
            # Show the start of the body before raise_for_status() runs.
            print(f" Overpass returned {resp.status_code}: {resp.text[:500]}")
        resp.raise_for_status()
        payload = resp.json()
        if cache_path:
            with open(cache_path, "w", encoding="utf-8") as fh:
                json.dump(payload, fh)
            print(f" cached to {cache_path}")

    elements = payload.get("elements", [])
    print(f" pass returned {len(elements)} elements")
    return elements
|
||||
|
||||
|
||||
def element_coords(elem: dict) -> Tuple[Optional[float], Optional[float]]:
    """Return ``(lon, lat)`` for an Overpass element.

    Nodes carry ``lon`` / ``lat`` directly; every other element type is
    read from its ``center`` member. Missing values come back as ``None``.
    """
    if elem.get("type") == "node":
        holder = elem
    else:
        holder = elem.get("center") or {}
    return holder.get("lon"), holder.get("lat")
|
||||
|
||||
|
||||
def normalize_element(elem: dict, matched_tag: str) -> Optional[dict]:
    """Flatten an Overpass element into a row dict.

    Args:
        elem: Raw Overpass element.
        matched_tag: Label of the tag pass that found the element; stored
            as the only entry of ``matched_tags``.

    Returns:
        The row dict, or ``None`` when the element has no coordinates, no
        ``type`` or no ``id``. The row id is ``"<type>/<id>"``; ``country``
        falls back to ``"US"``.
    """
    lon, lat = element_coords(elem)
    osm_type = elem.get("type")
    osm_id = elem.get("id")
    if any(part is None for part in (lon, lat, osm_type, osm_id)):
        return None

    tags = elem.get("tags") or {}
    address_parts = [
        part
        for part in (tags.get("addr:housenumber"), tags.get("addr:street"))
        if part
    ]
    street_address = " ".join(address_parts) or None

    row = {
        "id": f"{osm_type}/{osm_id}",
        "osm_type": osm_type,
        "osm_id": int(osm_id),
    }
    # Columns copied from the OSM tag of the given name.
    row.update(
        name=tags.get("name"),
        operator=tags.get("operator"),
        operator_type=tags.get("operator:type"),
        telecom=tags.get("telecom"),
        building=tags.get("building"),
        power=tags.get("power"),
        website=tags.get("website") or tags.get("contact:website"),
        phone=tags.get("phone") or tags.get("contact:phone"),
        street_address=street_address,
        city=tags.get("addr:city"),
        state=tags.get("addr:state"),
        postal_code=tags.get("addr:postcode"),
        country=tags.get("addr:country") or "US",
        matched_tags=[matched_tag],
        tags=tags,
        longitude=float(lon),
        latitude=float(lat),
    )
    return row
|
||||
|
||||
|
||||
def merge_records(existing: Dict[str, dict], new_rows: List[dict]) -> None:
    """Fold ``new_rows`` into ``existing`` in place, keyed by each row's ``id``.

    A row with an unseen id is stored as-is. For an id already present, the
    ``matched_tags`` lists are concatenated with duplicates dropped
    (first-seen order kept), and any other field that is ``None`` or ``""`` on
    the stored row is filled from the new row. Populated fields are never
    overwritten.
    """
    blank = (None, "")
    for incoming in new_rows:
        key = incoming["id"]
        if existing.get(key) is None:
            existing[key] = incoming
            continue

        current = existing[key]

        # dict keys give an order-preserving de-duplication of the tag lists.
        seen = dict.fromkeys(current["matched_tags"])
        seen.update(dict.fromkeys(incoming["matched_tags"]))
        current["matched_tags"] = list(seen)

        for field, value in incoming.items():
            if field == "matched_tags":
                continue
            if current.get(field) in blank and value not in blank:
                current[field] = value
|
||||
|
||||
|
||||
# Column names used for the insert statement, in positional order.
# row_to_tuple() emits its values in this same order.
COLUMNS = [
    # identity
    "id", "osm_type", "osm_id",
    # descriptive tags
    "name", "operator", "operator_type", "telecom", "building", "power",
    # contact
    "website", "phone",
    # address
    "street_address", "city", "state", "postal_code", "country",
    # provenance / raw tags
    "matched_tags", "tags",
    # coordinates
    "longitude", "latitude",
]
|
||||
|
||||
|
||||
def row_to_tuple(row: dict) -> tuple:
    """Convert a normalized row dict into a positional tuple of column values.

    ``id``, ``osm_type``, ``osm_id``, ``longitude`` and ``latitude`` are
    required (``KeyError`` if missing). The remaining text fields default to
    ``None``, ``matched_tags`` to ``[]`` and ``tags`` to ``{}``. ``tags`` is
    wrapped in ``Json`` before being returned.
    """
    optional_text_fields = (
        "name",
        "operator",
        "operator_type",
        "telecom",
        "building",
        "power",
        "website",
        "phone",
        "street_address",
        "city",
        "state",
        "postal_code",
        "country",
    )
    return (
        row["id"],
        row["osm_type"],
        row["osm_id"],
        *(row.get(field) for field in optional_text_fields),
        row.get("matched_tags", []),
        Json(row.get("tags", {})),
        row["longitude"],
        row["latitude"],
    )
|
||||
|
||||
|
||||
def create_table(cur) -> None:
    """Create ``TABLE`` and its three indexes using cursor ``cur``.

    The table has a stored generated ``geom`` point column (SRID 4326) built
    from ``longitude``/``latitude``. Indexes are created on ``geom`` (GiST),
    ``state`` and ``tags`` (GIN). The statement is a plain ``create table``,
    so the caller is expected to have checked that the table is absent.
    """
    ddl = f"""
        create table {TABLE} (
            id text primary key,
            osm_type text not null,
            osm_id bigint not null,
            name text,
            operator text,
            operator_type text,
            telecom text,
            building text,
            power text,
            website text,
            phone text,
            street_address text,
            city text,
            state text,
            postal_code text,
            country text,
            matched_tags text[] not null default '{{}}',
            tags jsonb not null default '{{}}'::jsonb,
            longitude double precision not null,
            latitude double precision not null,
            ingested_at timestamptz not null default now(),
            geom geometry(Point, 4326) generated always as
                (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
        )
        """
    cur.execute(ddl)

    index_specs = (
        ("osm_data_centers_geom_gix", "using gist (geom)"),
        ("osm_data_centers_state_idx", "(state)"),
        ("osm_data_centers_tags_gin", "using gin (tags)"),
    )
    for index_name, target in index_specs:
        cur.execute(f"create index {index_name} on {TABLE} {target}")
|
||||
|
||||
|
||||
def insert_values(cur, rows: List[dict], upsert: bool) -> None:
    """Bulk-insert ``rows`` into ``TABLE`` through ``execute_values``.

    Rows are converted with ``row_to_tuple`` and sent with ``page_size=200``.
    When ``upsert`` is true the statement gains an ``on conflict (id) do
    update`` clause that overwrites every column except ``id`` and resets
    ``ingested_at`` to ``now()``. Otherwise a plain insert is issued.
    """
    column_list = ", ".join(COLUMNS)
    statement = f"insert into {TABLE} ({column_list}) values %s"
    if upsert:
        set_clause = ", ".join(
            f"{col} = excluded.{col}" for col in COLUMNS if col != "id"
        )
        statement = (
            f"{statement} on conflict (id) do update set {set_clause}, "
            "ingested_at = now()"
        )
    values = list(map(row_to_tuple, rows))
    execute_values(cur, statement, values, page_size=200)
|
||||
|
||||
|
||||
def create_or_replace_view(cur) -> None:
    """Create or replace ``VIEW`` as the union of curated and OSM rows.

    The curated half selects from ``CURATED_TABLE``, prefixes its ids with
    ``curated/`` and renames ``facility_name``, ``provider``, ``state_code``
    and ``url`` to the OSM column names. The OSM half selects from ``TABLE``
    unchanged. Each half adds a constant ``source`` column (``'curated'`` or
    ``'osm'``).
    """
    view_sql = f"""
        create or replace view {VIEW} as
        select
            'curated/' || id as id,
            'curated'::text as source,
            facility_name as name,
            provider as operator,
            street_address,
            city,
            state_code as state,
            postal_code,
            country,
            url as website,
            phone,
            longitude,
            latitude,
            geom
        from {CURATED_TABLE}
        union all
        select
            id,
            'osm'::text as source,
            name,
            operator,
            street_address,
            city,
            state,
            postal_code,
            country,
            website,
            phone,
            longitude,
            latitude,
            geom
        from {TABLE}
        """
    cur.execute(view_sql)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the OSM load.

    Returns a namespace with ``cache_dir``, ``no_cache``, ``recreate``,
    ``upsert`` and ``skip_view``.

    ``--upsert`` used to be a ``store_true`` flag that already defaulted to
    true, so upserting could never be switched off. ``--no-upsert`` now
    clears the same destination; ``--upsert`` is kept so existing invocations
    still work.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--cache-dir",
        default="output",
        help="Directory to cache raw Overpass responses (default: output/).",
    )
    parser.add_argument(
        "--no-cache",
        action="store_true",
        help="Do not read or write Overpass cache files; always hit the API.",
    )
    parser.add_argument(
        "--recreate",
        action="store_true",
        help=f"Drop and recreate {TABLE} before loading.",
    )
    parser.add_argument(
        "--upsert",
        dest="upsert",
        action="store_true",
        default=True,
        help="On id conflicts, update the existing row (default: on).",
    )
    # Shares dest with --upsert; the default stays True because the first
    # action registered for a dest supplies its default.
    parser.add_argument(
        "--no-upsert",
        dest="upsert",
        action="store_false",
        help="Use a plain insert without an on-conflict update clause.",
    )
    parser.add_argument(
        "--skip-view",
        action="store_true",
        help=f"Do not create/replace the unioned view {VIEW}.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Fetch every tag pass, merge the results, and load them into PostGIS.

    Steps: run ``fetch_pass`` for each ``(key, value)`` in ``TAG_PASSES``,
    normalize and de-duplicate the elements by id, then in one transaction
    ensure the PostGIS extension and ``TABLE`` exist, insert the rows, and
    (unless ``--skip-view``) rebuild ``VIEW`` when ``CURATED_TABLE`` exists.

    Returns 0 on success and 1 when no rows were fetched. Raises ``KeyError``
    if a ``PGWEB_*`` environment variable is unset.
    """
    args = parse_args()

    # --no-cache promises not to touch cache files, so the cache directory
    # is only created when caching is enabled.
    if not args.no_cache:
        os.makedirs(args.cache_dir, exist_ok=True)

    merged: Dict[str, dict] = {}
    for index, (tag_key, tag_value) in enumerate(TAG_PASSES):
        if index:
            # Be polite to Overpass between passes. Sleeping before each
            # pass after the first avoids a pointless wait after the last.
            time.sleep(2)
        cache_path = (
            None
            if args.no_cache
            else os.path.join(args.cache_dir, f"overpass_{tag_key}_{tag_value}.json")
        )
        print(f"Pass: {tag_key}={tag_value}")
        elements = fetch_pass(tag_key, tag_value, cache_path)
        normalized = [
            row
            for row in (normalize_element(e, f"{tag_key}={tag_value}") for e in elements)
            if row is not None
        ]
        print(f"  normalized {len(normalized)} rows with coords")
        merge_records(merged, normalized)

    rows = list(merged.values())
    print(f"Total deduped OSM data-center features: {len(rows)}")
    if not rows:
        print("No rows fetched; aborting DB load.", file=sys.stderr)
        return 1

    conn = psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname=DB_NAME,
    )
    try:
        # "with conn" scopes the transaction; the connection itself is
        # closed in the finally clause.
        with conn:
            with conn.cursor() as cur:
                cur.execute("create extension if not exists postgis")
                if args.recreate:
                    cur.execute(f"drop table if exists {TABLE} cascade")
                # to_regclass yields NULL when the relation does not exist.
                cur.execute("select to_regclass(%s)", (TABLE,))
                if cur.fetchone()[0] is None:
                    create_table(cur)
                insert_values(cur, rows, upsert=args.upsert)
                cur.execute(f"analyze {TABLE}")
                if not args.skip_view:
                    cur.execute("select to_regclass(%s)", (CURATED_TABLE,))
                    if cur.fetchone()[0] is not None:
                        create_or_replace_view(cur)
                        print(f"View {VIEW} (re)created.")
                    else:
                        print(
                            f"Skipping view: {CURATED_TABLE} does not exist.",
                            file=sys.stderr,
                        )
                cur.execute(f"select count(*) from {TABLE}")
                total = cur.fetchone()[0]
    finally:
        conn.close()

    print(f"Loaded {len(rows)} rows into {TABLE}; table now has {total} rows total.")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
245
scripts/make_data_center_map.py
Normal file
245
scripts/make_data_center_map.py
Normal file
@@ -0,0 +1,245 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from collections import Counter
|
||||
|
||||
import psycopg2
|
||||
|
||||
|
||||
DB_NAME = "data_centers"
|
||||
POINT_TABLE = "public.master_data_centers"
|
||||
|
||||
|
||||
def connect():
    """Open a psycopg2 connection to ``DB_NAME``.

    Host, port, user and password come from the ``PGWEB_HOST``,
    ``PGWEB_PORT``, ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment
    variables; a missing variable raises ``KeyError``.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": DB_NAME,
    }
    return psycopg2.connect(**settings)
|
||||
|
||||
|
||||
def load_points(conn):
    """Load every row of ``POINT_TABLE`` that has both coordinates.

    Returns a list of dicts with keys ``id``, ``source``, ``operator``,
    ``name``, ``city``, ``state``, ``lon``, ``lat``, ``curated_id``,
    ``osm_id``, ``match_method`` and ``geoid``. Nullable text columns are
    coalesced to ``""`` in SQL (``source`` is not), and ``lon``/``lat`` are
    converted to ``float``.
    """
    query = f"""
        select
            master_id,
            source,
            coalesce(operator, '') as operator,
            coalesce(name, '') as name,
            coalesce(city, '') as city,
            coalesce(state, '') as state,
            longitude,
            latitude,
            coalesce(curated_id, '') as curated_id,
            coalesce(osm_id, '') as osm_id,
            coalesce(match_method, '') as match_method,
            coalesce(geoid, '') as geoid
        from {POINT_TABLE}
        where longitude is not null and latitude is not null
        """
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()

    # Output keys, in the same order as the selected columns.
    keys = (
        "id",
        "source",
        "operator",
        "name",
        "city",
        "state",
        "lon",
        "lat",
        "curated_id",
        "osm_id",
        "match_method",
        "geoid",
    )
    points = []
    for row in rows:
        point = dict(zip(keys, row))
        point["lon"] = float(point["lon"])
        point["lat"] = float(point["lat"])
        points.append(point)
    return points
|
||||
|
||||
|
||||
def compute_center(points):
    """Return the arithmetic-mean ``(lat, lon)`` of ``points``.

    Each point is a mapping with ``lat`` and ``lon``. An empty input falls
    back to the fixed pair ``(39.5, -98.35)``.
    """
    if not points:
        return 39.5, -98.35
    count = len(points)
    mean_lat, mean_lon = (
        sum(pt[axis] for pt in points) / count for axis in ("lat", "lon")
    )
    return mean_lat, mean_lon
|
||||
|
||||
|
||||
def build_stats(points):
    """Summarise ``points`` as a total plus per-source and per-match counts.

    Returns ``{"total", "by_source", "by_match_method"}``. The two count
    dicts are ordered by key. Falsy ``source`` values are counted under
    ``"(blank)"`` and falsy ``match_method`` values under ``"(none)"``.
    """

    def tally(field, placeholder):
        counts = Counter(pt[field] or placeholder for pt in points)
        return {label: counts[label] for label in sorted(counts)}

    return {
        "total": len(points),
        "by_source": tally("source", "(blank)"),
        "by_match_method": tally("match_method", "(none)"),
    }
|
||||
|
||||
|
||||
def render_html(points, center_lat, center_lon, output_path):
    """Write a self-contained Leaflet HTML page plotting ``points``.

    Args:
        points: list of point dicts as produced by ``load_points``.
        center_lat, center_lon: initial map centre (the view is re-fitted to
            the point bounds once markers are added).
        output_path: file to write, UTF-8 encoded.

    The points and the ``build_stats`` summary are inlined as JSON inside a
    ``<script>`` element. ``<``, ``>`` and ``&`` are emitted as JSON unicode
    escapes so that text such as ``</script>`` in a name cannot terminate the
    script element. The page's ``escapeHtml`` helper maps ``& < > " '`` to
    their HTML entities before values are placed in popup markup.
    """

    def script_safe(value):
        # \u003c etc. are valid JSON/JS string escapes, so the parsed data
        # is unchanged while the HTML parser never sees a raw "<".
        return (
            json.dumps(value)
            .replace("<", "\\u003c")
            .replace(">", "\\u003e")
            .replace("&", "\\u0026")
        )

    points_json = script_safe(points)
    stats_json = script_safe(build_stats(points))

    html = f"""<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>US Data Centers Master Map</title>
  <link rel="stylesheet" href="https://unpkg.com/leaflet@1.9.4/dist/leaflet.css" />
  <style>
    html, body {{ height: 100%; margin: 0; font-family: system-ui, -apple-system, Segoe UI, sans-serif; }}
    #layout {{ display: grid; grid-template-columns: 320px 1fr; height: 100%; }}
    #panel {{ padding: 14px; border-right: 1px solid #ddd; overflow: auto; background: #f8fafb; }}
    #map {{ height: 100%; width: 100%; }}
    h1 {{ margin: 0 0 8px; font-size: 18px; }}
    h2 {{ margin: 16px 0 8px; font-size: 14px; }}
    .stat-row {{ display: flex; justify-content: space-between; padding: 2px 0; font-size: 13px; }}
    .dot {{ width: 10px; height: 10px; border-radius: 50%; display: inline-block; margin-right: 8px; }}
    @media (max-width: 900px) {{
      #layout {{ grid-template-columns: 1fr; grid-template-rows: 220px 1fr; }}
      #panel {{ border-right: 0; border-bottom: 1px solid #ddd; }}
    }}
  </style>
</head>
<body>
  <div id="layout">
    <div id="panel">
      <h1>US Data Centers (Master)</h1>
      <div class="stat-row"><span>Total points</span><strong id="total"></strong></div>
      <h2>Source</h2>
      <div id="sourceStats"></div>
      <h2>Match Method (merged rows)</h2>
      <div id="matchStats"></div>
      <h2>Source Colors</h2>
      <div class="stat-row"><span><span class="dot" style="background:#2ca02c"></span>merged (curated + OSM)</span></div>
      <div class="stat-row"><span><span class="dot" style="background:#1f77b4"></span>curated only</span></div>
      <div class="stat-row"><span><span class="dot" style="background:#ff7f0e"></span>osm only</span></div>
      <div class="stat-row"><span><span class="dot" style="background:#7f7f7f"></span>other</span></div>
    </div>
    <div id="map"></div>
  </div>

  <script src="https://unpkg.com/leaflet@1.9.4/dist/leaflet.js"></script>
  <script>
    const points = {points_json};
    const stats = {stats_json};

    function colorForSource(source) {{
      if (source === 'merged') return '#2ca02c';
      if (source === 'curated') return '#1f77b4';
      if (source === 'osm') return '#ff7f0e';
      return '#7f7f7f';
    }}

    function escapeHtml(value) {{
      return String(value || '')
        .replaceAll('&', '&amp;')
        .replaceAll('<', '&lt;')
        .replaceAll('>', '&gt;')
        .replaceAll('"', '&quot;')
        .replaceAll("'", '&#39;');
    }}

    const map = L.map('map', {{ preferCanvas: true }}).setView([{center_lat}, {center_lon}], 5);

    L.tileLayer('https://tile.openstreetmap.org/{{z}}/{{x}}/{{y}}.png', {{
      maxZoom: 19,
      attribution: '© OpenStreetMap contributors'
    }}).addTo(map);

    const bounds = [];
    for (const p of points) {{
      const marker = L.circleMarker([p.lat, p.lon], {{
        radius: 4,
        color: colorForSource(p.source),
        fillColor: colorForSource(p.source),
        fillOpacity: 0.7,
        weight: 1
      }});

      const title = p.name || p.id;
      const operator = p.operator || '(unknown operator)';
      const cityState = [p.city, p.state].filter(Boolean).join(', ');
      const provenance = [
        p.curated_id ? 'curated_id=' + escapeHtml(p.curated_id) : null,
        p.osm_id ? 'osm_id=' + escapeHtml(p.osm_id) : null,
        p.match_method ? 'match=' + escapeHtml(p.match_method) : null,
      ].filter(Boolean).join('<br>');
      marker.bindPopup(`
        <strong>${{escapeHtml(title)}}</strong><br>
        Operator: ${{escapeHtml(operator)}}<br>
        Location: ${{escapeHtml(cityState)}}<br>
        Source: ${{escapeHtml(p.source)}}<br>
        ${{provenance ? provenance + '<br>' : ''}}
        GEOID: ${{escapeHtml(p.geoid)}}
      `);

      marker.addTo(map);
      bounds.push([p.lat, p.lon]);
    }}

    if (bounds.length > 0) {{
      map.fitBounds(bounds, {{ padding: [20, 20] }});
    }}

    document.getElementById('total').textContent = stats.total;

    const sourceStats = document.getElementById('sourceStats');
    for (const [k, v] of Object.entries(stats.by_source)) {{
      const div = document.createElement('div');
      div.className = 'stat-row';
      div.innerHTML = `<span>${{escapeHtml(k)}}</span><strong>${{v}}</strong>`;
      sourceStats.appendChild(div);
    }}

    const matchStats = document.getElementById('matchStats');
    for (const [k, v] of Object.entries(stats.by_match_method)) {{
      const div = document.createElement('div');
      div.className = 'stat-row';
      div.innerHTML = `<span>${{escapeHtml(k)}}</span><strong>${{v}}</strong>`;
      matchStats.appendChild(div);
    }}
  </script>
</body>
</html>
"""
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html)
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command line; the only option is ``--output``.

    Returns an ``argparse.Namespace`` whose ``output`` attribute defaults to
    ``data_center_map.html``.
    """
    cli = argparse.ArgumentParser(
        description="Generate an interactive HTML map from the PostGIS point table."
    )
    output_option = {
        "default": "data_center_map.html",
        "help": "Output HTML path (default: data_center_map.html)",
    }
    cli.add_argument("--output", **output_option)
    return cli.parse_args()
|
||||
|
||||
|
||||
def main():
    """Load the points, render the HTML map, and report what was written.

    The database connection is closed as soon as the points are loaded,
    whether or not loading succeeded.
    """
    options = parse_args()

    connection = connect()
    try:
        points = load_points(connection)
    finally:
        connection.close()

    lat, lon = compute_center(points)
    render_html(points, lat, lon, options.output)
    print(f"wrote {len(points)} points to {options.output}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
338
scripts/make_internet_cables_map.py
Normal file
338
scripts/make_internet_cables_map.py
Normal file
@@ -0,0 +1,338 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Render a Leaflet HTML map combining US data centers, submarine cables,
|
||||
and city-level network-dominance points from PostGIS.
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
import psycopg2
|
||||
|
||||
|
||||
DB_NAME = "data_centers"
|
||||
DC_TABLE = "public.master_data_centers"
|
||||
CABLES_TABLE = "public.internet_cables"
|
||||
CITY_TABLE = "public.internet_city_dominance"
|
||||
|
||||
|
||||
def connect():
    """Open a psycopg2 connection to ``DB_NAME``.

    Connection parameters are read from the ``PGWEB_HOST``, ``PGWEB_PORT``,
    ``PGWEB_USER`` and ``PGWEB_PASSWORD`` environment variables; an unset
    variable raises ``KeyError``.
    """
    env = os.environ
    settings = {
        "host": env["PGWEB_HOST"],
        "port": env["PGWEB_PORT"],
        "user": env["PGWEB_USER"],
        "password": env["PGWEB_PASSWORD"],
        "dbname": DB_NAME,
    }
    return psycopg2.connect(**settings)
|
||||
|
||||
|
||||
def load_data_centers(conn):
    """Load every row of ``DC_TABLE`` that has both coordinates.

    Returns a list of dicts with keys ``id``, ``source``, ``operator``,
    ``name``, ``city``, ``state``, ``lon`` and ``lat``. ``operator``,
    ``name``, ``city`` and ``state`` are coalesced to ``""`` in SQL, and
    ``lon``/``lat`` are converted to ``float``.
    """
    query = f"""
        select
            master_id,
            source,
            coalesce(operator, ''),
            coalesce(name, ''),
            coalesce(city, ''),
            coalesce(state, ''),
            longitude,
            latitude
        from {DC_TABLE}
        where longitude is not null and latitude is not null
        """
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()

    centers = []
    for master_id, source, operator, name, city, state, lon, lat in rows:
        centers.append(
            {
                "id": master_id,
                "source": source,
                "operator": operator,
                "name": name,
                "city": city,
                "state": state,
                "lon": float(lon),
                "lat": float(lat),
            }
        )
    return centers
|
||||
|
||||
|
||||
def load_cables(conn):
    """Load cable geometries from ``CABLES_TABLE`` as a GeoJSON FeatureCollection.

    Only rows with a non-null ``geom`` are selected. Each feature's geometry
    is the parsed ``ST_AsGeoJSON(geom)`` output. Its properties are
    ``feature_id``, ``cable_id``, ``name``, ``color`` (``#888888`` when null),
    ``owners``, ``rfs_year``, ``decommission_year``, ``length_km`` (float or
    ``None``) and ``url``.
    """
    query = f"""
        select
            feature_id,
            coalesce(cable_id, ''),
            coalesce(name, ''),
            coalesce(color, '#888888'),
            coalesce(owners, ''),
            rfs_year,
            decommission_year,
            length_km,
            coalesce(url, ''),
            ST_AsGeoJSON(geom)
        from {CABLES_TABLE}
        where geom is not null
        """
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()

    # Property names, in the order of the first nine selected columns.
    property_names = (
        "feature_id",
        "cable_id",
        "name",
        "color",
        "owners",
        "rfs_year",
        "decommission_year",
        "length_km",
        "url",
    )
    features = []
    for row in rows:
        properties = dict(zip(property_names, row[:9]))
        length = properties["length_km"]
        properties["length_km"] = None if length is None else float(length)
        features.append(
            {
                "type": "Feature",
                "geometry": json.loads(row[9]),
                "properties": properties,
            }
        )
    return {"type": "FeatureCollection", "features": features}
|
||||
|
||||
|
||||
def load_cities(conn, us_only=False):
    """Load city-dominance points from ``CITY_TABLE``.

    Rows need a non-null ``geom``; with ``us_only`` true they must also have
    ``country = 'US'``. Returns a list of dicts with keys ``id``, ``city``,
    ``country``, ``country_name``, ``region``, ``tbps`` (float or ``None``),
    ``ips`` (int or ``None``), ``lon`` and ``lat``.
    """
    conditions = ["geom is not null"]
    if us_only:
        conditions.append("country = 'US'")
    where_clause = "where " + " and ".join(conditions)

    query = f"""
        select
            id,
            coalesce(city, ''),
            coalesce(country, ''),
            coalesce(country_name, ''),
            coalesce(region, ''),
            physical_capacity_tbps,
            logical_dominance_ips,
            longitude,
            latitude
        from {CITY_TABLE}
        {where_clause}
        """
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()

    cities = []
    for city_id, city, country, country_name, region, tbps, ips, lon, lat in rows:
        cities.append(
            {
                "id": city_id,
                "city": city,
                "country": country,
                "country_name": country_name,
                "region": region,
                "tbps": None if tbps is None else float(tbps),
                "ips": None if ips is None else int(ips),
                "lon": float(lon),
                "lat": float(lat),
            }
        )
    return cities
|
||||
|
||||
|
||||
def render_html(data_centers, cables_geojson, cities, output_path):
    """Write a self-contained Leaflet page with three toggleable layers.

    Args:
        data_centers: list of point dicts (``id``, ``source``, ``operator``,
            ``name``, ``city``, ``state``, ``lon``, ``lat``).
        cables_geojson: GeoJSON FeatureCollection of cable geometries.
        cities: list of city dicts (``city``, ``country``, ``region``,
            ``tbps``, ``ips``, ``lon``, ``lat``).
        output_path: file to write, UTF-8 encoded.

    All three inputs are inlined as one JSON object in a ``<script>``
    element. ``<``, ``>`` and ``&`` are written as JSON unicode escapes so a
    value containing ``</script>`` cannot end the script element. The page's
    ``esc`` helper maps ``& < > " '`` to HTML entities before values are
    placed in popup markup.
    """
    payload = (
        json.dumps(
            {
                "data_centers": data_centers,
                "cables": cables_geojson,
                "cities": cities,
            }
        )
        # Valid JSON/JS string escapes: the parsed data is unchanged, but
        # the HTML parser never sees a raw "<" inside the script.
        .replace("<", "\\u003c")
        .replace(">", "\\u003e")
        .replace("&", "\\u0026")
    )

    # Plain (non-f) string: the JS/CSS braces need no doubling, and the
    # payload is substituted for the __PAYLOAD__ marker afterwards.
    html = """<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>US Data Centers + Submarine Cables</title>
  <link rel="stylesheet" href="https://unpkg.com/leaflet@1.9.4/dist/leaflet.css" />
  <style>
    html, body { height: 100%; margin: 0; font-family: system-ui, -apple-system, Segoe UI, sans-serif; }
    #layout { display: grid; grid-template-columns: 300px 1fr; height: 100%; }
    #panel { padding: 14px; border-right: 1px solid #ddd; overflow: auto; background: #f8fafb; font-size: 13px; }
    #map { height: 100%; width: 100%; }
    h1 { margin: 0 0 8px; font-size: 18px; }
    h2 { margin: 14px 0 6px; font-size: 13px; text-transform: uppercase; color: #555; letter-spacing: 0.04em; }
    .row { display: flex; justify-content: space-between; padding: 2px 0; }
    .swatch { width: 12px; height: 12px; display: inline-block; margin-right: 8px; vertical-align: middle; border: 1px solid #ccc; }
    label.toggle { display: block; padding: 3px 0; cursor: pointer; }
    @media (max-width: 900px) {
      #layout { grid-template-columns: 1fr; grid-template-rows: 220px 1fr; }
      #panel { border-right: 0; border-bottom: 1px solid #ddd; }
    }
  </style>
</head>
<body>
  <div id="layout">
    <div id="panel">
      <h1>Data Centers + Cables</h1>
      <div class="row"><span>Data centers</span><strong id="dcCount"></strong></div>
      <div class="row"><span>Submarine cables</span><strong id="cableCount"></strong></div>
      <div class="row"><span>City dominance pts</span><strong id="cityCount"></strong></div>

      <h2>Layers</h2>
      <label class="toggle"><input type="checkbox" id="tDc" checked> Data centers</label>
      <label class="toggle"><input type="checkbox" id="tCables" checked> Submarine cables</label>
      <label class="toggle"><input type="checkbox" id="tCities" checked> City dominance</label>

      <h2>Data center source</h2>
      <div class="row"><span><span class="swatch" style="background:#2ca02c"></span>merged (curated + OSM)</span></div>
      <div class="row"><span><span class="swatch" style="background:#1f77b4"></span>curated only</span></div>
      <div class="row"><span><span class="swatch" style="background:#ff7f0e"></span>osm only</span></div>
      <div class="row"><span><span class="swatch" style="background:#7f7f7f"></span>other</span></div>

      <h2>City dominance</h2>
      <div class="row"><span><span class="swatch" style="background:#9b59b6;border-radius:50%"></span>Sized by physical Tbps</span></div>
    </div>
    <div id="map"></div>
  </div>

  <script src="https://unpkg.com/leaflet@1.9.4/dist/leaflet.js"></script>
  <script>
    const DATA = __PAYLOAD__;

    function colorForSource(source) {
      if (source === 'merged') return '#2ca02c';
      if (source === 'curated') return '#1f77b4';
      if (source === 'osm') return '#ff7f0e';
      return '#7f7f7f';
    }

    function esc(v) {
      return String(v == null ? '' : v)
        .replaceAll('&','&amp;').replaceAll('<','&lt;').replaceAll('>','&gt;')
        .replaceAll('"','&quot;').replaceAll("'", '&#39;');
    }

    const map = L.map('map', { preferCanvas: true, worldCopyJump: true }).setView([20, -40], 3);

    L.tileLayer('https://tile.openstreetmap.org/{z}/{x}/{y}.png', {
      maxZoom: 19,
      attribution: '© OpenStreetMap contributors'
    }).addTo(map);

    const cableLayer = L.geoJSON(DATA.cables, {
      style: f => ({
        color: f.properties.color || '#888',
        weight: 1.4,
        opacity: 0.75,
      }),
      onEachFeature: (feature, layer) => {
        const p = feature.properties;
        const yrs = [p.rfs_year, p.decommission_year].filter(Boolean).join(' – ');
        layer.bindPopup(`
          <strong>${esc(p.name)}</strong><br>
          ${p.url ? `<a href="${esc(p.url)}" target="_blank" rel="noopener">${esc(p.url)}</a><br>` : ''}
          Owners: ${esc(p.owners)}<br>
          ${yrs ? `Years: ${esc(yrs)}<br>` : ''}
          ${p.length_km ? `Length: ${esc(p.length_km.toLocaleString())} km<br>` : ''}
          ID: ${esc(p.cable_id || p.feature_id)}
        `);
      },
    }).addTo(map);

    const cityLayer = L.layerGroup();
    for (const c of DATA.cities) {
      const tbps = c.tbps || 0;
      const radius = Math.max(2, Math.min(18, Math.sqrt(tbps) * 1.6));
      const m = L.circleMarker([c.lat, c.lon], {
        radius,
        color: '#6c2a86',
        fillColor: '#9b59b6',
        fillOpacity: 0.45,
        weight: 0.8,
      });
      m.bindPopup(`
        <strong>${esc(c.city)}</strong> (${esc(c.country)})<br>
        Region: ${esc(c.region)}<br>
        Physical capacity: ${esc(tbps.toFixed ? tbps.toFixed(2) : tbps)} Tbps<br>
        Logical dominance IPs: ${esc(c.ips ? c.ips.toLocaleString() : '')}
      `);
      cityLayer.addLayer(m);
    }
    cityLayer.addTo(map);

    const dcLayer = L.layerGroup();
    const dcBounds = [];
    for (const p of DATA.data_centers) {
      const m = L.circleMarker([p.lat, p.lon], {
        radius: 3,
        color: colorForSource(p.source),
        fillColor: colorForSource(p.source),
        fillOpacity: 0.85,
        weight: 0.8,
      });
      const title = p.name || p.id;
      const operator = p.operator || '(unknown operator)';
      const cityState = [p.city, p.state].filter(Boolean).join(', ');
      m.bindPopup(`
        <strong>${esc(title)}</strong><br>
        Operator: ${esc(operator)}<br>
        Location: ${esc(cityState)}<br>
        Source: ${esc(p.source)}
      `);
      dcLayer.addLayer(m);
      dcBounds.push([p.lat, p.lon]);
    }
    dcLayer.addTo(map);

    if (dcBounds.length) map.fitBounds(dcBounds, { padding: [30, 30], maxZoom: 5 });

    function toggle(layer, on) {
      if (on) { if (!map.hasLayer(layer)) layer.addTo(map); }
      else { if (map.hasLayer(layer)) map.removeLayer(layer); }
    }
    document.getElementById('tDc').addEventListener('change', e => toggle(dcLayer, e.target.checked));
    document.getElementById('tCables').addEventListener('change', e => toggle(cableLayer, e.target.checked));
    document.getElementById('tCities').addEventListener('change', e => toggle(cityLayer, e.target.checked));

    document.getElementById('dcCount').textContent = DATA.data_centers.length.toLocaleString();
    document.getElementById('cableCount').textContent = DATA.cables.features.length.toLocaleString();
    document.getElementById('cityCount').textContent = DATA.cities.length.toLocaleString();
  </script>
</body>
</html>
"""
    html = html.replace("__PAYLOAD__", payload)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html)
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command line for the combined map script.

    Returns an ``argparse.Namespace`` with ``output`` (default
    ``data_centers_cables_map.html``) and ``us_cities_only`` (false unless
    ``--us-cities-only`` is given).
    """
    parser = argparse.ArgumentParser(
        description="Render a Leaflet map combining data centers, submarine cables, and city dominance."
    )
    option_specs = (
        (("--output",), {"default": "data_centers_cables_map.html"}),
        (
            ("--us-cities-only",),
            {
                "action": "store_true",
                "help": "Restrict the city-dominance layer to country='US'.",
            },
        ),
    )
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Load all three layers, render the combined map, and print a summary.

    Data centers, cables and cities are read over one connection, which is
    closed before rendering whether or not the loads succeeded.
    """
    options = parse_args()

    connection = connect()
    try:
        dcs = load_data_centers(connection)
        cables = load_cables(connection)
        cities = load_cities(connection, us_only=options.us_cities_only)
    finally:
        connection.close()

    render_html(dcs, cables, cities, options.output)

    cable_count = len(cables["features"])
    summary = (
        f"wrote {len(dcs)} data centers, "
        f"{cable_count} cables, "
        f"{len(cities)} city points -> {options.output}"
    )
    print(summary)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user