Add master data center merge workflow

This commit is contained in:
2026-05-17 18:53:16 -07:00
parent 8fcbb18e37
commit 90e8b21423
10 changed files with 11892 additions and 9599 deletions

View File

@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""
Build (or refresh) public.master_data_centers by merging:
- public.us_dc_sample_geocoded (curated, attribute-rich)
- public.osm_data_centers (OpenStreetMap features)
Deduplication rule (curated row wins):
Step 1: for each curated row, find a matching OSM row by
curated.id = osm.osm_id::text OR
curated.nominatim_osm_id = osm.osm_id OR
ST_DWithin(curated.geom, osm.geom, 150 m, geography)
(closest match by sphere distance when multiple).
Step 2: insert every curated row into master, filling NULLs from the
matched OSM row when present. source = 'merged' if matched,
otherwise 'curated'.
Step 3: insert every OSM row whose osm_id was NOT matched in Step 1.
source = 'osm'.
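Result: every curated row appears once; OSM-only rows appear once; no row is
emitted twice. Matching is evaluated per curated row, so one OSM row may be
merged into several curated rows if each of them selects it as its best match.
The merge logic lives in a SQL function
public.refresh_master_data_centers() so subsequent refreshes are one call.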
"""
import argparse
import os
import sys
import psycopg2
DB_NAME = "data_centers"
MASTER_TABLE = "public.master_data_centers"
CURATED_TABLE = "public.us_dc_sample_geocoded"
OSM_TABLE = "public.osm_data_centers"
MATCH_RADIUS_M = 150
CREATE_TABLE_SQL = f"""
create table if not exists {MASTER_TABLE} (
master_id text primary key,
source text not null check (source in ('curated','osm','merged')),
curated_id text,
osm_id text,
name text,
operator text,
street_address text,
city text,
state text,
postal_code text,
country text,
website text,
phone text,
power_mw numeric,
area_sqft integer,
nearest_airport_miles numeric,
has_bare_metal boolean,
has_iaas boolean,
has_internet_exchange boolean,
has_colocation boolean,
certifications text,
content_summary text,
osm_tags jsonb,
matched_osm_tag_passes text[],
match_method text,
match_distance_m numeric,
longitude double precision not null,
latitude double precision not null,
geom geometry(Point, 4326)
generated always as (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
);
create index if not exists master_data_centers_geom_gix on {MASTER_TABLE} using gist (geom);
create index if not exists master_data_centers_source_idx on {MASTER_TABLE} (source);
create index if not exists master_data_centers_state_idx on {MASTER_TABLE} (state);
create index if not exists master_data_centers_curated_idx on {MASTER_TABLE} (curated_id);
create index if not exists master_data_centers_osm_idx on {MASTER_TABLE} (osm_id);
"""
REFRESH_FUNCTION_SQL = f"""
create or replace function public.refresh_master_data_centers(match_radius_m double precision default {MATCH_RADIUS_M})
returns table (curated_rows bigint, merged_rows bigint, osm_only_rows bigint, total_rows bigint)
language plpgsql
as $$
begin
truncate table {MASTER_TABLE};
-- pick a single best OSM match for each curated row, prioritizing ID
-- equality, then nominatim id, then closest within radius
create temporary table _curated_to_osm on commit drop as
with ranked as (
select
c.id as curated_id,
o.id as osm_id,
case
when c.id = o.osm_id::text then 'id'
when c.nominatim_osm_id = o.osm_id then 'nominatim_id'
else 'spatial'
end as method,
ST_DistanceSphere(c.geom, o.geom) as dist_m,
row_number() over (
partition by c.id
order by
case
when c.id = o.osm_id::text then 0
when c.nominatim_osm_id = o.osm_id then 1
else 2
end,
ST_DistanceSphere(c.geom, o.geom) asc
) as rn
from {CURATED_TABLE} c
join {OSM_TABLE} o
on c.id = o.osm_id::text
or c.nominatim_osm_id = o.osm_id
or ST_DWithin(c.geom::geography, o.geom::geography, match_radius_m)
)
select curated_id, osm_id, method, dist_m
from ranked
where rn = 1;
-- Step 1+2: insert curated rows (with OSM nulls filled where matched)
insert into {MASTER_TABLE} (
master_id, source, curated_id, osm_id,
name, operator, street_address, city, state, postal_code, country,
website, phone, power_mw, area_sqft, nearest_airport_miles,
has_bare_metal, has_iaas, has_internet_exchange, has_colocation,
certifications, content_summary,
osm_tags, matched_osm_tag_passes,
match_method, match_distance_m,
longitude, latitude
)
select
'curated/' || c.id,
case when m.osm_id is not null then 'merged' else 'curated' end,
c.id,
m.osm_id,
coalesce(c.facility_name, o.name),
coalesce(c.provider, o.operator),
coalesce(c.street_address, o.street_address),
coalesce(c.city, o.city),
coalesce(c.state_code, o.state),
coalesce(c.postal_code, o.postal_code),
coalesce(c.country, o.country),
coalesce(c.url, o.website),
coalesce(c.phone, o.phone),
c.power_mw,
c.area_sqft,
c.nearest_airport_miles,
c.has_bare_metal,
c.has_iaas,
c.has_internet_exchange,
c.has_colocation,
c.certifications,
c.content_summary,
o.tags,
o.matched_tags,
m.method,
round(m.dist_m::numeric, 2),
c.longitude,
c.latitude
from {CURATED_TABLE} c
left join _curated_to_osm m on m.curated_id = c.id
left join {OSM_TABLE} o on o.id = m.osm_id;
-- Step 3: insert OSM rows that no curated row claimed
insert into {MASTER_TABLE} (
master_id, source, curated_id, osm_id,
name, operator, street_address, city, state, postal_code, country,
website, phone,
osm_tags, matched_osm_tag_passes,
longitude, latitude
)
select
'osm/' || o.id,
'osm',
null,
o.id,
o.name,
o.operator,
o.street_address,
o.city,
o.state,
o.postal_code,
o.country,
o.website,
o.phone,
o.tags,
o.matched_tags,
o.longitude,
o.latitude
from {OSM_TABLE} o
where not exists (
select 1 from _curated_to_osm m where m.osm_id = o.id
);
analyze {MASTER_TABLE};
return query
select
count(*) filter (where source = 'curated'),
count(*) filter (where source = 'merged'),
count(*) filter (where source = 'osm'),
count(*)
from {MASTER_TABLE};
end;
$$;
"""
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--radius-m",
type=float,
default=MATCH_RADIUS_M,
help=f"Spatial match radius in meters (default: {MATCH_RADIUS_M}).",
)
parser.add_argument(
"--recreate",
action="store_true",
help=f"Drop and recreate {MASTER_TABLE} before building.",
)
return parser.parse_args()
def main() -> int:
args = parse_args()
conn = psycopg2.connect(
host=os.environ["PGWEB_HOST"],
port=os.environ["PGWEB_PORT"],
user=os.environ["PGWEB_USER"],
password=os.environ["PGWEB_PASSWORD"],
dbname=DB_NAME,
)
try:
with conn:
with conn.cursor() as cur:
cur.execute("create extension if not exists postgis")
if args.recreate:
cur.execute(f"drop table if exists {MASTER_TABLE} cascade")
cur.execute(CREATE_TABLE_SQL)
cur.execute(REFRESH_FUNCTION_SQL)
cur.execute(
"select * from public.refresh_master_data_centers(%s)",
(args.radius_m,),
)
curated, merged, osm_only, total = cur.fetchone()
finally:
conn.close()
print(f"master_data_centers refreshed (radius={args.radius_m} m):")
print(f" curated-only rows: {curated}")
print(f" merged rows (curated + OSM): {merged}")
print(f" osm-only rows: {osm_only}")
print(f" total: {total}")
return 0
if __name__ == "__main__":
sys.exit(main())

File diff suppressed because it is too large Load Diff

View File

@@ -14,7 +14,8 @@ from psycopg2.extras import execute_values
DB_NAME = "data_centers" DB_NAME = "data_centers"
POINT_TABLE = "public.us_dc_sample_geocoded" POINT_TABLE = "public.master_data_centers"
POINT_ID_COL = "master_id"
BOUNDARY_STAGE_TABLE = "public._dc_census_tract_boundaries_2024" BOUNDARY_STAGE_TABLE = "public._dc_census_tract_boundaries_2024"
ACS_STAGE_TABLE = "public._dc_census_tract_acs_2024" ACS_STAGE_TABLE = "public._dc_census_tract_acs_2024"
FINAL_TABLE = "public.data_center_census_tracts_2024" FINAL_TABLE = "public.data_center_census_tracts_2024"
@@ -27,6 +28,25 @@ TRACT_ZIP_URL = (
) )
ACS_AUDIT_CSV = Path("census_tract_acs_2024_selected_states.csv") ACS_AUDIT_CSV = Path("census_tract_acs_2024_selected_states.csv")
STATE_NAME_TO_CODE = {
"Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
"California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
"District of Columbia": "DC", "Florida": "FL", "Georgia": "GA", "Hawaii": "HI",
"Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
"Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME",
"Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN",
"Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE",
"Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM",
"New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH",
"Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI",
"South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX",
"Utah": "UT", "Vermont": "VT", "Virginia": "VA", "Washington": "WA",
"West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
"American Samoa": "AS", "Guam": "GU", "Northern Mariana Islands": "MP",
"Puerto Rico": "PR", "United States Virgin Islands": "VI",
"U.S. Virgin Islands": "VI", "Virgin Islands": "VI",
}
STATE_FIPS = { STATE_FIPS = {
"AL": "01", "AL": "01",
"AK": "02", "AK": "02",
@@ -198,16 +218,45 @@ def connect():
) )
def normalize_state(value):
if value in (None, ""):
return None
if value in STATE_FIPS:
return value
return STATE_NAME_TO_CODE.get(value.strip())
def get_state_fips(conn): def get_state_fips(conn):
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute( cur.execute(
f"select distinct state_code from {POINT_TABLE} order by state_code" f"select state, count(*) from {POINT_TABLE} group by state order by state nulls last"
) )
state_codes = [row[0] for row in cur.fetchall()] rows = cur.fetchall()
missing = [code for code in state_codes if code not in STATE_FIPS] normalized_counts = {}
if missing: null_state_count = 0
raise RuntimeError(f"Missing state FIPS mappings for: {', '.join(missing)}") unknown = []
return [STATE_FIPS[code] for code in state_codes] for raw, count in rows:
if raw is None:
null_state_count += count
continue
code = normalize_state(raw)
if code is None:
unknown.append((raw, count))
continue
normalized_counts[code] = normalized_counts.get(code, 0) + count
if unknown:
details = ", ".join(f"{repr(name)}({n})" for name, n in unknown)
raise RuntimeError(f"Unrecognized state values in {POINT_TABLE}: {details}")
if null_state_count:
print(
f"warning: {null_state_count} master_data_centers rows have NULL state; "
f"importing tract boundaries for all 50 states + DC + PR so spatial join can resolve them."
)
# Census ACS 5-year DP profile lacks coverage for the small island territories;
# restrict to the 50 states + DC + PR which the ACS profile reliably serves.
allowed = {"AS", "GU", "MP", "VI"}
return sorted({fips for code, fips in STATE_FIPS.items() if code not in allowed})
return sorted({STATE_FIPS[code] for code in normalized_counts})
def ensure_final_table_absent(conn): def ensure_final_table_absent(conn):
@@ -290,8 +339,20 @@ def fetch_acs_for_state(state_fips):
f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5/profile?" f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5/profile?"
+ urllib.parse.urlencode(params) + urllib.parse.urlencode(params)
) )
with urllib.request.urlopen(url, timeout=120) as response: try:
data = json.loads(response.read().decode("utf-8")) with urllib.request.urlopen(url, timeout=120) as response:
body = response.read().decode("utf-8")
except urllib.error.HTTPError as exc:
body = exc.read().decode("utf-8", errors="replace")
raise RuntimeError(
f"Census ACS request failed for state {state_fips}: HTTP {exc.code}{body[:300]}"
) from exc
try:
data = json.loads(body)
except json.JSONDecodeError as exc:
raise RuntimeError(
f"Census ACS returned non-JSON for state {state_fips}: {body[:300]}"
) from exc
header = data[0] header = data[0]
rows = [] rows = []
@@ -444,12 +505,15 @@ def create_final_table(conn):
select select
t.geoid, t.geoid,
count(*)::integer as data_center_count, count(*)::integer as data_center_count,
count(*) filter (where dc.geocode_precision = 'address_range')::integer count(*) filter (where dc.source = 'curated')::integer
as address_range_data_center_count, as curated_only_data_center_count,
count(*) filter (where dc.geocode_precision = 'city')::integer count(*) filter (where dc.source = 'merged')::integer
as city_precision_data_center_count, as merged_data_center_count,
array_agg(dc.id order by dc.id) as data_center_ids, count(*) filter (where dc.source = 'osm')::integer
array_agg(distinct dc.provider order by dc.provider) as providers as osm_only_data_center_count,
array_agg(dc.{POINT_ID_COL} order by dc.{POINT_ID_COL}) as data_center_ids,
array_agg(distinct dc.operator) filter (where dc.operator is not null)
as operators
from {BOUNDARY_STAGE_TABLE} t from {BOUNDARY_STAGE_TABLE} t
join {POINT_TABLE} dc join {POINT_TABLE} dc
on t.geom && dc.geom on t.geom && dc.geom
@@ -469,10 +533,11 @@ def create_final_table(conn):
'{ACS_SOURCE}'::text as acs_source, '{ACS_SOURCE}'::text as acs_source,
a.acs_name, a.acs_name,
d.data_center_count, d.data_center_count,
d.address_range_data_center_count, d.curated_only_data_center_count,
d.city_precision_data_center_count, d.merged_data_center_count,
d.osm_only_data_center_count,
d.data_center_ids, d.data_center_ids,
d.providers, d.operators,
a.population, a.population,
a.median_age, a.median_age,
a.households, a.households,
@@ -532,7 +597,7 @@ def create_final_table(conn):
cur.execute( cur.execute(
f""" f"""
comment on table {FINAL_TABLE} is comment on table {FINAL_TABLE} is
'Census tracts containing records from public.us_dc_sample_geocoded, enriched with ACS 2024 5-year profile demographics and derived primary industry fields.' 'Census tracts containing records from public.master_data_centers (curated + OSM merged), enriched with ACS 2024 5-year profile demographics and derived primary industry fields.'
""" """
) )
cur.execute(f"analyze {FINAL_TABLE}") cur.execute(f"analyze {FINAL_TABLE}")
@@ -550,7 +615,7 @@ def assign_point_geoids(conn):
set geoid = matched.geoid set geoid = matched.geoid
from ( from (
select select
dc_inner.id, dc_inner.{POINT_ID_COL} as point_id,
( (
select t.geoid select t.geoid
from {BOUNDARY_STAGE_TABLE} t from {BOUNDARY_STAGE_TABLE} t
@@ -561,11 +626,11 @@ def assign_point_geoids(conn):
) as geoid ) as geoid
from {POINT_TABLE} dc_inner from {POINT_TABLE} dc_inner
) matched ) matched
where dc.id = matched.id where dc.{POINT_ID_COL} = matched.point_id
""" """
) )
cur.execute( cur.execute(
f"create index if not exists us_dc_sample_geocoded_geoid_idx on {POINT_TABLE} (geoid)" f"create index if not exists master_data_centers_geoid_idx on {POINT_TABLE} (geoid)"
) )
cur.execute(f"analyze {POINT_TABLE}") cur.execute(f"analyze {POINT_TABLE}")
@@ -586,13 +651,21 @@ def validate(conn):
total_points = cur.fetchone()[0] total_points = cur.fetchone()[0]
cur.execute( cur.execute(
f""" f"""
select geocode_precision, count(*)::integer select source, count(*)::integer
from {POINT_TABLE} from {POINT_TABLE}
group by geocode_precision group by source
order by geocode_precision order by source
""" """
) )
point_precision = cur.fetchall() point_source_breakdown = cur.fetchall()
cur.execute(
f"""
select count(*)::integer
from {POINT_TABLE}
where geoid is null
"""
)
unassigned_points = cur.fetchone()[0]
cur.execute( cur.execute(
f""" f"""
select count(*)::integer select count(*)::integer
@@ -601,7 +674,7 @@ def validate(conn):
""" """
) )
missing_acs = cur.fetchone()[0] missing_acs = cur.fetchone()[0]
return summary, total_points, point_precision, missing_acs return summary, total_points, point_source_breakdown, unassigned_points, missing_acs
def main(): def main():
@@ -638,7 +711,7 @@ def main():
load_acs_stage(conn, acs_rows, acs_fieldnames) load_acs_stage(conn, acs_rows, acs_fieldnames)
create_final_table(conn) create_final_table(conn)
assign_point_geoids(conn) assign_point_geoids(conn)
summary, total_points, point_precision, missing_acs = validate(conn) summary, total_points, point_source_breakdown, unassigned_points, missing_acs = validate(conn)
finally: finally:
conn.close() conn.close()
@@ -649,7 +722,8 @@ def main():
summary[0], summary[1], summary[2], total_points summary[0], summary[1], summary[2], total_points
) )
) )
print("point_precision=" + ", ".join(f"{k}:{v}" for k, v in point_precision)) print("point_source=" + ", ".join(f"{k}:{v}" for k, v in point_source_breakdown))
print(f"points_unassigned_to_tract={unassigned_points}")
print(f"tracts_missing_acs_population={missing_acs}") print(f"tracts_missing_acs_population={missing_acs}")

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,376 @@
#!/usr/bin/env python3
"""
Fetch US data centers from OpenStreetMap (Overpass API) and load them into
public.osm_data_centers in the data_centers database. Also (re)creates a
unioned view public.data_centers_union combining OSM + curated rows from
public.us_dc_sample_geocoded.
Two Overpass passes are made because tagging is inconsistent:
1) telecom=data_center
2) building=data_center
Results are deduplicated by (osm_type, osm_id); the matched tag-pass is recorded
in matched_tags so we can see which query found each feature.
"""
import argparse
import json
import os
import sys
import time
from typing import Dict, List, Optional, Tuple
import psycopg2
import requests
from psycopg2.extras import Json, execute_values
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
TABLE = "public.osm_data_centers"
VIEW = "public.data_centers_union"
CURATED_TABLE = "public.us_dc_sample_geocoded"
DB_NAME = "data_centers"
# Tag passes: (key, value)
TAG_PASSES = [
("telecom", "data_center"),
("building", "data_center"),
]
def overpass_query(tag_key: str, tag_value: str, timeout: int = 180) -> str:
return f"""
[out:json][timeout:{timeout}];
area["ISO3166-1"="US"][admin_level=2]->.us;
(
node["{tag_key}"="{tag_value}"](area.us);
way["{tag_key}"="{tag_value}"](area.us);
relation["{tag_key}"="{tag_value}"](area.us);
);
out center tags;
""".strip()
def fetch_pass(tag_key: str, tag_value: str, cache_path: Optional[str]) -> List[dict]:
if cache_path and os.path.exists(cache_path):
print(f" using cached response: {cache_path}")
with open(cache_path, "r", encoding="utf-8") as fh:
payload = json.load(fh)
else:
query = overpass_query(tag_key, tag_value)
print(f" querying Overpass for {tag_key}={tag_value} ...")
headers = {
"User-Agent": "us-data-centers-inventory/1.0 (research; contact david@dadams.io)",
"Accept": "application/json",
}
resp = requests.post(
OVERPASS_URL,
data={"data": query},
headers=headers,
timeout=240,
)
if resp.status_code != 200:
print(f" Overpass returned {resp.status_code}: {resp.text[:500]}")
resp.raise_for_status()
payload = resp.json()
if cache_path:
with open(cache_path, "w", encoding="utf-8") as fh:
json.dump(payload, fh)
print(f" cached to {cache_path}")
elements = payload.get("elements", [])
print(f" pass returned {len(elements)} elements")
return elements
def element_coords(elem: dict) -> Tuple[Optional[float], Optional[float]]:
if elem.get("type") == "node":
return elem.get("lon"), elem.get("lat")
center = elem.get("center") or {}
return center.get("lon"), center.get("lat")
def normalize_element(elem: dict, matched_tag: str) -> Optional[dict]:
lon, lat = element_coords(elem)
if lon is None or lat is None:
return None
osm_type = elem.get("type")
osm_id = elem.get("id")
if osm_type is None or osm_id is None:
return None
tags = elem.get("tags") or {}
return {
"id": f"{osm_type}/{osm_id}",
"osm_type": osm_type,
"osm_id": int(osm_id),
"name": tags.get("name"),
"operator": tags.get("operator"),
"operator_type": tags.get("operator:type"),
"telecom": tags.get("telecom"),
"building": tags.get("building"),
"power": tags.get("power"),
"website": tags.get("website") or tags.get("contact:website"),
"phone": tags.get("phone") or tags.get("contact:phone"),
"street_address": " ".join(
part for part in (tags.get("addr:housenumber"), tags.get("addr:street")) if part
) or None,
"city": tags.get("addr:city"),
"state": tags.get("addr:state"),
"postal_code": tags.get("addr:postcode"),
"country": tags.get("addr:country") or "US",
"matched_tags": [matched_tag],
"tags": tags,
"longitude": float(lon),
"latitude": float(lat),
}
def merge_records(existing: Dict[str, dict], new_rows: List[dict]) -> None:
for row in new_rows:
key = row["id"]
prior = existing.get(key)
if prior is None:
existing[key] = row
continue
# merge matched_tags; keep first non-null values for other fields
merged_tags = list(dict.fromkeys(prior["matched_tags"] + row["matched_tags"]))
prior["matched_tags"] = merged_tags
for col, val in row.items():
if col == "matched_tags":
continue
if prior.get(col) in (None, "") and val not in (None, ""):
prior[col] = val
COLUMNS = [
"id",
"osm_type",
"osm_id",
"name",
"operator",
"operator_type",
"telecom",
"building",
"power",
"website",
"phone",
"street_address",
"city",
"state",
"postal_code",
"country",
"matched_tags",
"tags",
"longitude",
"latitude",
]
def row_to_tuple(row: dict) -> tuple:
return (
row["id"],
row["osm_type"],
row["osm_id"],
row.get("name"),
row.get("operator"),
row.get("operator_type"),
row.get("telecom"),
row.get("building"),
row.get("power"),
row.get("website"),
row.get("phone"),
row.get("street_address"),
row.get("city"),
row.get("state"),
row.get("postal_code"),
row.get("country"),
row.get("matched_tags", []),
Json(row.get("tags", {})),
row["longitude"],
row["latitude"],
)
def create_table(cur) -> None:
cur.execute(
f"""
create table {TABLE} (
id text primary key,
osm_type text not null,
osm_id bigint not null,
name text,
operator text,
operator_type text,
telecom text,
building text,
power text,
website text,
phone text,
street_address text,
city text,
state text,
postal_code text,
country text,
matched_tags text[] not null default '{{}}',
tags jsonb not null default '{{}}'::jsonb,
longitude double precision not null,
latitude double precision not null,
ingested_at timestamptz not null default now(),
geom geometry(Point, 4326) generated always as
(ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
)
"""
)
cur.execute(f"create index osm_data_centers_geom_gix on {TABLE} using gist (geom)")
cur.execute(f"create index osm_data_centers_state_idx on {TABLE} (state)")
cur.execute(f"create index osm_data_centers_tags_gin on {TABLE} using gin (tags)")
def insert_values(cur, rows: List[dict], upsert: bool) -> None:
sql = f"insert into {TABLE} ({', '.join(COLUMNS)}) values %s"
if upsert:
update_cols = [c for c in COLUMNS if c != "id"]
assignments = ", ".join(f"{c} = excluded.{c}" for c in update_cols)
sql += (
f" on conflict (id) do update set {assignments}, "
f"ingested_at = now()"
)
execute_values(cur, sql, [row_to_tuple(r) for r in rows], page_size=200)
def create_or_replace_view(cur) -> None:
cur.execute(
f"""
create or replace view {VIEW} as
select
'curated/' || id as id,
'curated'::text as source,
facility_name as name,
provider as operator,
street_address,
city,
state_code as state,
postal_code,
country,
url as website,
phone,
longitude,
latitude,
geom
from {CURATED_TABLE}
union all
select
id,
'osm'::text as source,
name,
operator,
street_address,
city,
state,
postal_code,
country,
website,
phone,
longitude,
latitude,
geom
from {TABLE}
"""
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--cache-dir",
default="output",
help="Directory to cache raw Overpass responses (default: output/).",
)
parser.add_argument(
"--no-cache",
action="store_true",
help="Do not read or write Overpass cache files; always hit the API.",
)
parser.add_argument(
"--recreate",
action="store_true",
help=f"Drop and recreate {TABLE} before loading.",
)
parser.add_argument(
"--upsert",
action="store_true",
default=True,
help="On id conflicts, update the existing row (default: on).",
)
parser.add_argument(
"--skip-view",
action="store_true",
help=f"Do not create/replace the unioned view {VIEW}.",
)
return parser.parse_args()
def main() -> int:
args = parse_args()
os.makedirs(args.cache_dir, exist_ok=True)
merged: Dict[str, dict] = {}
for tag_key, tag_value in TAG_PASSES:
cache_path = (
None
if args.no_cache
else os.path.join(args.cache_dir, f"overpass_{tag_key}_{tag_value}.json")
)
print(f"Pass: {tag_key}={tag_value}")
elements = fetch_pass(tag_key, tag_value, cache_path)
normalized = [
row for row in (normalize_element(e, f"{tag_key}={tag_value}") for e in elements)
if row is not None
]
print(f" normalized {len(normalized)} rows with coords")
merge_records(merged, normalized)
# be polite to Overpass between passes
time.sleep(2)
rows = list(merged.values())
print(f"Total deduped OSM data-center features: {len(rows)}")
if not rows:
print("No rows fetched; aborting DB load.", file=sys.stderr)
return 1
conn = psycopg2.connect(
host=os.environ["PGWEB_HOST"],
port=os.environ["PGWEB_PORT"],
user=os.environ["PGWEB_USER"],
password=os.environ["PGWEB_PASSWORD"],
dbname=DB_NAME,
)
try:
with conn:
with conn.cursor() as cur:
cur.execute("create extension if not exists postgis")
if args.recreate:
cur.execute(f"drop table if exists {TABLE} cascade")
cur.execute("select to_regclass(%s)", (TABLE,))
if cur.fetchone()[0] is None:
create_table(cur)
insert_values(cur, rows, upsert=args.upsert)
cur.execute(f"analyze {TABLE}")
if not args.skip_view:
cur.execute("select to_regclass(%s)", (CURATED_TABLE,))
if cur.fetchone()[0] is not None:
create_or_replace_view(cur)
print(f"View {VIEW} (re)created.")
else:
print(
f"Skipping view: {CURATED_TABLE} does not exist.",
file=sys.stderr,
)
cur.execute(f"select count(*) from {TABLE}")
total = cur.fetchone()[0]
finally:
conn.close()
print(f"Loaded {len(rows)} rows into {TABLE}; table now has {total} rows total.")
return 0
if __name__ == "__main__":
sys.exit(main())

View File

@@ -8,7 +8,7 @@ import psycopg2
DB_NAME = "data_centers" DB_NAME = "data_centers"
POINT_TABLE = "public.us_dc_sample_geocoded" POINT_TABLE = "public.master_data_centers"
def connect(): def connect():
@@ -26,15 +26,17 @@ def load_points(conn):
cur.execute( cur.execute(
f""" f"""
select select
id, master_id,
coalesce(provider, '') as provider, source,
coalesce(facility_name, '') as facility_name, coalesce(operator, '') as operator,
coalesce(name, '') as name,
coalesce(city, '') as city, coalesce(city, '') as city,
coalesce(state_code, '') as state_code, coalesce(state, '') as state,
longitude, longitude,
latitude, latitude,
coalesce(geocode_source, '') as geocode_source, coalesce(curated_id, '') as curated_id,
coalesce(geocode_precision, '') as geocode_precision, coalesce(osm_id, '') as osm_id,
coalesce(match_method, '') as match_method,
coalesce(geoid, '') as geoid coalesce(geoid, '') as geoid
from {POINT_TABLE} from {POINT_TABLE}
where longitude is not null and latitude is not null where longitude is not null and latitude is not null
@@ -47,15 +49,17 @@ def load_points(conn):
points.append( points.append(
{ {
"id": row[0], "id": row[0],
"provider": row[1], "source": row[1],
"facility_name": row[2], "operator": row[2],
"city": row[3], "name": row[3],
"state_code": row[4], "city": row[4],
"lon": float(row[5]), "state": row[5],
"lat": float(row[6]), "lon": float(row[6]),
"geocode_source": row[7], "lat": float(row[7]),
"geocode_precision": row[8], "curated_id": row[8],
"geoid": row[9], "osm_id": row[9],
"match_method": row[10],
"geoid": row[11],
} }
) )
return points return points
@@ -70,12 +74,12 @@ def compute_center(points):
def build_stats(points): def build_stats(points):
by_source = Counter(p["geocode_source"] or "(blank)" for p in points) by_source = Counter(p["source"] or "(blank)" for p in points)
by_precision = Counter(p["geocode_precision"] or "(blank)" for p in points) by_match = Counter(p["match_method"] or "(none)" for p in points)
return { return {
"total": len(points), "total": len(points),
"by_source": dict(sorted(by_source.items(), key=lambda x: x[0])), "by_source": dict(sorted(by_source.items(), key=lambda x: x[0])),
"by_precision": dict(sorted(by_precision.items(), key=lambda x: x[0])), "by_match_method": dict(sorted(by_match.items(), key=lambda x: x[0])),
} }
@@ -89,7 +93,7 @@ def render_html(points, center_lat, center_lon, output_path):
<head> <head>
<meta charset=\"utf-8\" /> <meta charset=\"utf-8\" />
<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" /> <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
<title>US Data Centers Map</title> <title>US Data Centers Master Map</title>
<link rel=\"stylesheet\" href=\"https://unpkg.com/leaflet@1.9.4/dist/leaflet.css\" /> <link rel=\"stylesheet\" href=\"https://unpkg.com/leaflet@1.9.4/dist/leaflet.css\" />
<style> <style>
html, body {{ height: 100%; margin: 0; font-family: system-ui, -apple-system, Segoe UI, sans-serif; }} html, body {{ height: 100%; margin: 0; font-family: system-ui, -apple-system, Segoe UI, sans-serif; }}
@@ -109,17 +113,17 @@ def render_html(points, center_lat, center_lon, output_path):
<body> <body>
<div id=\"layout\"> <div id=\"layout\">
<div id=\"panel\"> <div id=\"panel\">
<h1>US Data Centers</h1> <h1>US Data Centers (Master)</h1>
<div class=\"stat-row\"><span>Total points</span><strong id=\"total\"></strong></div> <div class=\"stat-row\"><span>Total points</span><strong id=\"total\"></strong></div>
<h2>Geocode Source</h2> <h2>Source</h2>
<div id=\"sourceStats\"></div> <div id=\"sourceStats\"></div>
<h2>Geocode Precision</h2> <h2>Match Method (merged rows)</h2>
<div id=\"precisionStats\"></div> <div id=\"matchStats\"></div>
<h2>Source Colors</h2> <h2>Source Colors</h2>
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#1f77b4\"></span>IM3_Existing_DataCenters</span></div> <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#2ca02c\"></span>merged (curated + OSM)</span></div>
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#2ca02c\"></span>US Census Geocoder</span></div> <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#1f77b4\"></span>curated only</span></div>
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#ff7f0e\"></span>Nominatim/OpenStreetMap</span></div> <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#ff7f0e\"></span>osm only</span></div>
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#7f7f7f\"></span>Other/Blank</span></div> <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#7f7f7f\"></span>other</span></div>
</div> </div>
<div id=\"map\"></div> <div id=\"map\"></div>
</div> </div>
@@ -130,9 +134,9 @@ def render_html(points, center_lat, center_lon, output_path):
const stats = {stats_json}; const stats = {stats_json};
function colorForSource(source) {{ function colorForSource(source) {{
if (source === 'IM3_Existing_DataCenters') return '#1f77b4'; if (source === 'merged') return '#2ca02c';
if (source === 'US Census Geocoder') return '#2ca02c'; if (source === 'curated') return '#1f77b4';
if (source === 'Nominatim/OpenStreetMap') return '#ff7f0e'; if (source === 'osm') return '#ff7f0e';
return '#7f7f7f'; return '#7f7f7f';
}} }}
@@ -156,22 +160,26 @@ def render_html(points, center_lat, center_lon, output_path):
for (const p of points) {{ for (const p of points) {{
const marker = L.circleMarker([p.lat, p.lon], {{ const marker = L.circleMarker([p.lat, p.lon], {{
radius: 4, radius: 4,
color: colorForSource(p.geocode_source), color: colorForSource(p.source),
fillColor: colorForSource(p.geocode_source), fillColor: colorForSource(p.source),
fillOpacity: 0.7, fillOpacity: 0.7,
weight: 1 weight: 1
}}); }});
const title = p.facility_name || p.id; const title = p.name || p.id;
const provider = p.provider || '(unknown provider)'; const operator = p.operator || '(unknown operator)';
const cityState = [p.city, p.state_code].filter(Boolean).join(', '); const cityState = [p.city, p.state].filter(Boolean).join(', ');
const provenance = [
p.curated_id ? 'curated_id=' + escapeHtml(p.curated_id) : null,
p.osm_id ? 'osm_id=' + escapeHtml(p.osm_id) : null,
p.match_method ? 'match=' + escapeHtml(p.match_method) : null,
].filter(Boolean).join('<br>');
marker.bindPopup(` marker.bindPopup(`
<strong>${{escapeHtml(title)}}</strong><br> <strong>${{escapeHtml(title)}}</strong><br>
Provider: ${{escapeHtml(provider)}}<br> Operator: ${{escapeHtml(operator)}}<br>
ID: ${{escapeHtml(p.id)}}<br>
Location: ${{escapeHtml(cityState)}}<br> Location: ${{escapeHtml(cityState)}}<br>
Source: ${{escapeHtml(p.geocode_source)}}<br> Source: ${{escapeHtml(p.source)}}<br>
Precision: ${{escapeHtml(p.geocode_precision)}}<br> ${{provenance ? provenance + '<br>' : ''}}
GEOID: ${{escapeHtml(p.geoid)}} GEOID: ${{escapeHtml(p.geoid)}}
`); `);
@@ -193,12 +201,12 @@ def render_html(points, center_lat, center_lon, output_path):
sourceStats.appendChild(div); sourceStats.appendChild(div);
}} }}
const precisionStats = document.getElementById('precisionStats'); const matchStats = document.getElementById('matchStats');
for (const [k, v] of Object.entries(stats.by_precision)) {{ for (const [k, v] of Object.entries(stats.by_match_method)) {{
const div = document.createElement('div'); const div = document.createElement('div');
div.className = 'stat-row'; div.className = 'stat-row';
div.innerHTML = `<span>${{escapeHtml(k)}}</span><strong>${{v}}</strong>`; div.innerHTML = `<span>${{escapeHtml(k)}}</span><strong>${{v}}</strong>`;
precisionStats.appendChild(div); matchStats.appendChild(div);
}} }}
</script> </script>
</body> </body>

View File

@@ -10,7 +10,7 @@ import psycopg2
DB_NAME = "data_centers" DB_NAME = "data_centers"
DC_TABLE = "public.us_dc_sample_geocoded" DC_TABLE = "public.master_data_centers"
CABLES_TABLE = "public.internet_cables" CABLES_TABLE = "public.internet_cables"
CITY_TABLE = "public.internet_city_dominance" CITY_TABLE = "public.internet_city_dominance"
@@ -30,14 +30,14 @@ def load_data_centers(conn):
cur.execute( cur.execute(
f""" f"""
select select
id, master_id,
coalesce(provider, ''), source,
coalesce(facility_name, ''), coalesce(operator, ''),
coalesce(name, ''),
coalesce(city, ''), coalesce(city, ''),
coalesce(state_code, ''), coalesce(state, ''),
longitude, longitude,
latitude, latitude
coalesce(geocode_source, '')
from {DC_TABLE} from {DC_TABLE}
where longitude is not null and latitude is not null where longitude is not null and latitude is not null
""" """
@@ -45,13 +45,13 @@ def load_data_centers(conn):
return [ return [
{ {
"id": r[0], "id": r[0],
"provider": r[1], "source": r[1],
"facility_name": r[2], "operator": r[2],
"city": r[3], "name": r[3],
"state_code": r[4], "city": r[4],
"lon": float(r[5]), "state": r[5],
"lat": float(r[6]), "lon": float(r[6]),
"geocode_source": r[7], "lat": float(r[7]),
} }
for r in cur.fetchall() for r in cur.fetchall()
] ]
@@ -181,10 +181,10 @@ def render_html(data_centers, cables_geojson, cities, output_path):
<label class="toggle"><input type="checkbox" id="tCities" checked> City dominance</label> <label class="toggle"><input type="checkbox" id="tCities" checked> City dominance</label>
<h2>Data center source</h2> <h2>Data center source</h2>
<div class="row"><span><span class="swatch" style="background:#1f77b4"></span>IM3_Existing_DataCenters</span></div> <div class="row"><span><span class="swatch" style="background:#2ca02c"></span>merged (curated + OSM)</span></div>
<div class="row"><span><span class="swatch" style="background:#2ca02c"></span>US Census Geocoder</span></div> <div class="row"><span><span class="swatch" style="background:#1f77b4"></span>curated only</span></div>
<div class="row"><span><span class="swatch" style="background:#ff7f0e"></span>Nominatim/OpenStreetMap</span></div> <div class="row"><span><span class="swatch" style="background:#ff7f0e"></span>osm only</span></div>
<div class="row"><span><span class="swatch" style="background:#7f7f7f"></span>Other</span></div> <div class="row"><span><span class="swatch" style="background:#7f7f7f"></span>other</span></div>
<h2>City dominance</h2> <h2>City dominance</h2>
<div class="row"><span><span class="swatch" style="background:#9b59b6;border-radius:50%"></span>Sized by physical Tbps</span></div> <div class="row"><span><span class="swatch" style="background:#9b59b6;border-radius:50%"></span>Sized by physical Tbps</span></div>
@@ -197,9 +197,9 @@ def render_html(data_centers, cables_geojson, cities, output_path):
const DATA = __PAYLOAD__; const DATA = __PAYLOAD__;
function colorForSource(source) { function colorForSource(source) {
if (source === 'IM3_Existing_DataCenters') return '#1f77b4'; if (source === 'merged') return '#2ca02c';
if (source === 'US Census Geocoder') return '#2ca02c'; if (source === 'curated') return '#1f77b4';
if (source === 'Nominatim/OpenStreetMap') return '#ff7f0e'; if (source === 'osm') return '#ff7f0e';
return '#7f7f7f'; return '#7f7f7f';
} }
@@ -262,19 +262,19 @@ def render_html(data_centers, cables_geojson, cities, output_path):
for (const p of DATA.data_centers) { for (const p of DATA.data_centers) {
const m = L.circleMarker([p.lat, p.lon], { const m = L.circleMarker([p.lat, p.lon], {
radius: 3, radius: 3,
color: colorForSource(p.geocode_source), color: colorForSource(p.source),
fillColor: colorForSource(p.geocode_source), fillColor: colorForSource(p.source),
fillOpacity: 0.85, fillOpacity: 0.85,
weight: 0.8, weight: 0.8,
}); });
const title = p.facility_name || p.id; const title = p.name || p.id;
const provider = p.provider || '(unknown provider)'; const operator = p.operator || '(unknown operator)';
const cityState = [p.city, p.state_code].filter(Boolean).join(', '); const cityState = [p.city, p.state].filter(Boolean).join(', ');
m.bindPopup(` m.bindPopup(`
<strong>${esc(title)}</strong><br> <strong>${esc(title)}</strong><br>
Provider: ${esc(provider)}<br> Operator: ${esc(operator)}<br>
Location: ${esc(cityState)}<br> Location: ${esc(cityState)}<br>
Source: ${esc(p.geocode_source)} Source: ${esc(p.source)}
`); `);
dcLayer.addLayer(m); dcLayer.addLayer(m);
dcBounds.push([p.lat, p.lon]); dcBounds.push([p.lat, p.lon]);

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long