Add master data center merge workflow
This commit is contained in:
258
build_master_data_centers.py
Normal file
258
build_master_data_centers.py
Normal file
@@ -0,0 +1,258 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Build (or refresh) public.master_data_centers by merging:
|
||||||
|
- public.us_dc_sample_geocoded (curated, attribute-rich)
|
||||||
|
- public.osm_data_centers (OpenStreetMap features)
|
||||||
|
|
||||||
|
Deduplication rule (curated row wins):
|
||||||
|
Step 1: for each curated row, find a matching OSM row by
|
||||||
|
curated.id = osm.osm_id::text OR
|
||||||
|
curated.nominatim_osm_id = osm.osm_id OR
|
||||||
|
ST_DWithin(curated.geom, osm.geom, 150 m, geography)
|
||||||
|
(closest match by sphere distance when multiple).
|
||||||
|
Step 2: insert every curated row into master, filling NULLs from the
|
||||||
|
matched OSM row when present. source = 'merged' if matched,
|
||||||
|
otherwise 'curated'.
|
||||||
|
Step 3: insert every OSM row whose osm_id was NOT matched in Step 1.
|
||||||
|
source = 'osm'.
|
||||||
|
|
||||||
|
Result: every curated row appears once; OSM-only rows appear once; no row is
|
||||||
|
emitted twice. The merge logic lives in a SQL function
|
||||||
|
public.refresh_master_data_centers() so subsequent refreshes are one call.
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
|
||||||
|
# Database that holds both input tables and the merged output.
DB_NAME = "data_centers"

# Fully qualified relation names used by the DDL and the refresh function.
MASTER_TABLE = "public.master_data_centers"
CURATED_TABLE = "public.us_dc_sample_geocoded"
OSM_TABLE = "public.osm_data_centers"

# Default spatial match radius in meters; used both as the SQL function's
# default argument and as the CLI default for --radius-m.
MATCH_RADIUS_M = 150


# Idempotent DDL for the master table and its indexes.
#   * master_id is 'curated/<id>' or 'osm/<id>', as built by the refresh function.
#   * source is constrained to 'curated', 'osm' or 'merged'.
#   * geom is a stored generated column derived from longitude/latitude, so
#     inserts only ever supply the two coordinates.
CREATE_TABLE_SQL = f"""
create table if not exists {MASTER_TABLE} (
    master_id text primary key,
    source text not null check (source in ('curated','osm','merged')),
    curated_id text,
    osm_id text,
    name text,
    operator text,
    street_address text,
    city text,
    state text,
    postal_code text,
    country text,
    website text,
    phone text,
    power_mw numeric,
    area_sqft integer,
    nearest_airport_miles numeric,
    has_bare_metal boolean,
    has_iaas boolean,
    has_internet_exchange boolean,
    has_colocation boolean,
    certifications text,
    content_summary text,
    osm_tags jsonb,
    matched_osm_tag_passes text[],
    match_method text,
    match_distance_m numeric,
    longitude double precision not null,
    latitude double precision not null,
    geom geometry(Point, 4326)
        generated always as (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
);
create index if not exists master_data_centers_geom_gix on {MASTER_TABLE} using gist (geom);
create index if not exists master_data_centers_source_idx on {MASTER_TABLE} (source);
create index if not exists master_data_centers_state_idx on {MASTER_TABLE} (state);
create index if not exists master_data_centers_curated_idx on {MASTER_TABLE} (curated_id);
create index if not exists master_data_centers_osm_idx on {MASTER_TABLE} (osm_id);
"""
|
||||||
|
|
||||||
|
|
||||||
|
# DDL for public.refresh_master_data_centers(match_radius_m).
#
# The function rebuilds the master table from scratch on every call:
#   1. truncate the master table;
#   2. build a temp table mapping each curated row to at most one OSM row,
#      preferring an id match, then a nominatim id match, then the nearest OSM
#      row within match_radius_m meters;
#   3. insert every curated row, filling NULL descriptive columns from the
#      matched OSM row (source = 'merged' when matched, else 'curated');
#   4. insert every OSM row that no curated row claimed (source = 'osm');
#   5. return one row of counts per source plus the total.
#
# Changes relative to the first version:
#   * The temp table is dropped explicitly before being created. "on commit
#     drop" only fires at transaction end, so a second call inside the same
#     transaction used to fail with "relation already exists". The drop is
#     qualified with pg_temp so it can never hit a permanent table of that name.
#   * o.id is the final ORDER BY key, so ties at equal distance resolve the
#     same way on every refresh.
#
# NOTE(review): osm_id is compared without osm_type, and OSM numeric ids are
# only unique per element type -- confirm the curated ids cannot collide
# across node/way/relation.
REFRESH_FUNCTION_SQL = f"""
create or replace function public.refresh_master_data_centers(match_radius_m double precision default {MATCH_RADIUS_M})
returns table (curated_rows bigint, merged_rows bigint, osm_only_rows bigint, total_rows bigint)
language plpgsql
as $$
begin
    truncate table {MASTER_TABLE};

    -- "on commit drop" only runs at transaction end; drop explicitly so the
    -- function can be called more than once inside a single transaction
    drop table if exists pg_temp._curated_to_osm;

    -- pick a single best OSM match for each curated row, prioritizing ID
    -- equality, then nominatim id, then closest within radius
    create temporary table _curated_to_osm on commit drop as
    with ranked as (
        select
            c.id as curated_id,
            o.id as osm_id,
            case
                when c.id = o.osm_id::text then 'id'
                when c.nominatim_osm_id = o.osm_id then 'nominatim_id'
                else 'spatial'
            end as method,
            ST_DistanceSphere(c.geom, o.geom) as dist_m,
            row_number() over (
                partition by c.id
                order by
                    case
                        when c.id = o.osm_id::text then 0
                        when c.nominatim_osm_id = o.osm_id then 1
                        else 2
                    end,
                    ST_DistanceSphere(c.geom, o.geom) asc,
                    o.id asc
            ) as rn
        from {CURATED_TABLE} c
        join {OSM_TABLE} o
            on c.id = o.osm_id::text
            or c.nominatim_osm_id = o.osm_id
            or ST_DWithin(c.geom::geography, o.geom::geography, match_radius_m)
    )
    select curated_id, osm_id, method, dist_m
    from ranked
    where rn = 1;

    -- Step 1+2: insert curated rows (with OSM nulls filled where matched)
    insert into {MASTER_TABLE} (
        master_id, source, curated_id, osm_id,
        name, operator, street_address, city, state, postal_code, country,
        website, phone, power_mw, area_sqft, nearest_airport_miles,
        has_bare_metal, has_iaas, has_internet_exchange, has_colocation,
        certifications, content_summary,
        osm_tags, matched_osm_tag_passes,
        match_method, match_distance_m,
        longitude, latitude
    )
    select
        'curated/' || c.id,
        case when m.osm_id is not null then 'merged' else 'curated' end,
        c.id,
        m.osm_id,
        coalesce(c.facility_name, o.name),
        coalesce(c.provider, o.operator),
        coalesce(c.street_address, o.street_address),
        coalesce(c.city, o.city),
        coalesce(c.state_code, o.state),
        coalesce(c.postal_code, o.postal_code),
        coalesce(c.country, o.country),
        coalesce(c.url, o.website),
        coalesce(c.phone, o.phone),
        c.power_mw,
        c.area_sqft,
        c.nearest_airport_miles,
        c.has_bare_metal,
        c.has_iaas,
        c.has_internet_exchange,
        c.has_colocation,
        c.certifications,
        c.content_summary,
        o.tags,
        o.matched_tags,
        m.method,
        round(m.dist_m::numeric, 2),
        c.longitude,
        c.latitude
    from {CURATED_TABLE} c
    left join _curated_to_osm m on m.curated_id = c.id
    left join {OSM_TABLE} o on o.id = m.osm_id;

    -- Step 3: insert OSM rows that no curated row claimed
    insert into {MASTER_TABLE} (
        master_id, source, curated_id, osm_id,
        name, operator, street_address, city, state, postal_code, country,
        website, phone,
        osm_tags, matched_osm_tag_passes,
        longitude, latitude
    )
    select
        'osm/' || o.id,
        'osm',
        null,
        o.id,
        o.name,
        o.operator,
        o.street_address,
        o.city,
        o.state,
        o.postal_code,
        o.country,
        o.website,
        o.phone,
        o.tags,
        o.matched_tags,
        o.longitude,
        o.latitude
    from {OSM_TABLE} o
    where not exists (
        select 1 from _curated_to_osm m where m.osm_id = o.id
    );

    analyze {MASTER_TABLE};

    return query
    select
        count(*) filter (where source = 'curated'),
        count(*) filter (where source = 'merged'),
        count(*) filter (where source = 'osm'),
        count(*)
    from {MASTER_TABLE};
end;
$$;
"""
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
    """Parse the command line for the master-table build.

    Options:
        --radius-m   Spatial match radius in meters; must be finite and
                     non-negative. Defaults to MATCH_RADIUS_M.
        --recreate   Drop and recreate the master table before building.

    Returns:
        The populated argparse namespace (attributes ``radius_m`` and
        ``recreate``).
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is a hand-formatted, multi-line description;
        # the default formatter would reflow it into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--radius-m",
        type=_radius_meters,
        default=MATCH_RADIUS_M,
        help=f"Spatial match radius in meters (default: {MATCH_RADIUS_M}).",
    )
    parser.add_argument(
        "--recreate",
        action="store_true",
        help=f"Drop and recreate {MASTER_TABLE} before building.",
    )
    return parser.parse_args()


def _radius_meters(text: str) -> float:
    """argparse ``type=`` callable: parse a finite, non-negative radius in meters."""
    try:
        radius = float(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid radius: {text!r}") from None
    # The chained comparison is False for NaN and +/-inf as well as for
    # negative numbers, so one test rejects all unusable values.
    if not 0 <= radius < float("inf"):
        raise argparse.ArgumentTypeError(
            f"radius must be a finite, non-negative number of meters, got {text!r}"
        )
    return radius
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Create the master table and refresh function, run the refresh, print counts.

    Connection settings are read from the PGWEB_HOST, PGWEB_PORT, PGWEB_USER
    and PGWEB_PASSWORD environment variables; the database name is DB_NAME.

    Returns:
        0 on success, 2 when a required environment variable is missing.
    """
    args = parse_args()

    # Report every missing variable at once instead of failing with a bare
    # KeyError on the first one.
    required_env = ("PGWEB_HOST", "PGWEB_PORT", "PGWEB_USER", "PGWEB_PASSWORD")
    missing = [name for name in required_env if name not in os.environ]
    if missing:
        print(
            f"error: missing required environment variable(s): {', '.join(missing)}",
            file=sys.stderr,
        )
        return 2

    conn = psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname=DB_NAME,
    )
    try:
        # "with conn" wraps the statements in one transaction (commit on
        # success, rollback on error); it does not close the connection,
        # hence the finally below.
        with conn:
            with conn.cursor() as cur:
                cur.execute("create extension if not exists postgis")
                if args.recreate:
                    cur.execute(f"drop table if exists {MASTER_TABLE} cascade")
                cur.execute(CREATE_TABLE_SQL)
                cur.execute(REFRESH_FUNCTION_SQL)
                cur.execute(
                    "select * from public.refresh_master_data_centers(%s)",
                    (args.radius_m,),
                )
                curated, merged, osm_only, total = cur.fetchone()
    finally:
        conn.close()

    print(f"master_data_centers refreshed (radius={args.radius_m} m):")
    print(f" curated-only rows: {curated}")
    print(f" merged rows (curated + OSM): {merged}")
    print(f" osm-only rows: {osm_only}")
    print(f" total: {total}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -14,7 +14,8 @@ from psycopg2.extras import execute_values
|
|||||||
|
|
||||||
|
|
||||||
DB_NAME = "data_centers"
|
DB_NAME = "data_centers"
|
||||||
POINT_TABLE = "public.us_dc_sample_geocoded"
|
POINT_TABLE = "public.master_data_centers"
|
||||||
|
POINT_ID_COL = "master_id"
|
||||||
BOUNDARY_STAGE_TABLE = "public._dc_census_tract_boundaries_2024"
|
BOUNDARY_STAGE_TABLE = "public._dc_census_tract_boundaries_2024"
|
||||||
ACS_STAGE_TABLE = "public._dc_census_tract_acs_2024"
|
ACS_STAGE_TABLE = "public._dc_census_tract_acs_2024"
|
||||||
FINAL_TABLE = "public.data_center_census_tracts_2024"
|
FINAL_TABLE = "public.data_center_census_tracts_2024"
|
||||||
@@ -27,6 +28,25 @@ TRACT_ZIP_URL = (
|
|||||||
)
|
)
|
||||||
ACS_AUDIT_CSV = Path("census_tract_acs_2024_selected_states.csv")
|
ACS_AUDIT_CSV = Path("census_tract_acs_2024_selected_states.csv")
|
||||||
|
|
||||||
|
# Full state / territory name -> two-letter postal code. Used to normalize
# free-text state values into the codes that key STATE_FIPS.
STATE_NAME_TO_CODE = {
    # 50 states + District of Columbia
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    # Territories (the Virgin Islands appear under three spellings)
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Virgin Islands": "VI",
    "U.S. Virgin Islands": "VI",
    "Virgin Islands": "VI",
}
|
||||||
|
|
||||||
STATE_FIPS = {
|
STATE_FIPS = {
|
||||||
"AL": "01",
|
"AL": "01",
|
||||||
"AK": "02",
|
"AK": "02",
|
||||||
@@ -198,16 +218,45 @@ def connect():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_state(value):
    """Map a raw state value to its two-letter code, or None if unrecognized.

    Accepts either a code that is a key of STATE_FIPS or a full name that is
    a key of STATE_NAME_TO_CODE. Matching ignores surrounding whitespace and
    letter case, so " TX", "tx" and "texas" all resolve to "TX".

    Args:
        value: Raw state text; None and blank strings mean "no state".

    Returns:
        The two-letter code, or None when the value is missing or does not
        match any known code or name.
    """
    if value is None:
        return None
    text = str(value).strip()
    if not text:
        return None

    # Codes in STATE_FIPS are upper-case, so compare on the upper-cased text.
    code = text.upper()
    if code in STATE_FIPS:
        return code

    # Exact name match first (cheap dict lookup), then a case-insensitive scan.
    exact = STATE_NAME_TO_CODE.get(text)
    if exact is not None:
        return exact
    folded = text.casefold()
    for name, mapped in STATE_NAME_TO_CODE.items():
        if name.casefold() == folded:
            return mapped
    return None
|
||||||
|
|
||||||
|
|
||||||
def get_state_fips(conn):
    """Return the sorted state FIPS codes whose tract data must be imported.

    Reads the distinct ``state`` values (with row counts) from POINT_TABLE and
    normalizes each through normalize_state().

    * If any row has a NULL or blank state, a warning is printed and the FIPS
      codes of every STATE_FIPS entry except AS, GU, MP and VI are returned, so
      the later spatial join can still place those rows.
    * Otherwise only the FIPS codes of the states actually present are returned.

    Args:
        conn: Open database connection providing ``cursor()``.

    Returns:
        Sorted list of FIPS code strings.

    Raises:
        RuntimeError: If a non-blank state value cannot be mapped to a code
            that has a STATE_FIPS entry.
    """
    with conn.cursor() as cur:
        cur.execute(
            f"select state, count(*) from {POINT_TABLE} group by state order by state nulls last"
        )
        rows = cur.fetchall()

    normalized_counts = {}
    null_state_count = 0
    unknown = []
    for raw, count in rows:
        # A blank string carries no more information than NULL; count both as
        # "missing" instead of reporting '' as an unrecognized state.
        if raw is None or not str(raw).strip():
            null_state_count += count
            continue
        code = normalize_state(raw)
        # A code without a FIPS entry is as unusable as no code at all; report
        # it with the other unknowns rather than failing later with a KeyError.
        if code is None or code not in STATE_FIPS:
            unknown.append((raw, count))
            continue
        normalized_counts[code] = normalized_counts.get(code, 0) + count

    if unknown:
        details = ", ".join(f"{repr(name)}({n})" for name, n in unknown)
        raise RuntimeError(f"Unrecognized state values in {POINT_TABLE}: {details}")

    if null_state_count:
        print(
            f"warning: {null_state_count} master_data_centers rows have NULL or blank state; "
            f"importing tract boundaries for all 50 states + DC + PR so spatial join can resolve them."
        )
        # Census ACS 5-year DP profile lacks coverage for the small island territories;
        # restrict to the 50 states + DC + PR which the ACS profile reliably serves.
        excluded_territories = {"AS", "GU", "MP", "VI"}
        return sorted(
            {fips for code, fips in STATE_FIPS.items() if code not in excluded_territories}
        )

    # NOTE(review): this path does not filter out the island territories, so a
    # row tagged e.g. "GU" still requests ACS data for it -- confirm intended.
    return sorted({STATE_FIPS[code] for code in normalized_counts})
|
||||||
|
|
||||||
|
|
||||||
def ensure_final_table_absent(conn):
|
def ensure_final_table_absent(conn):
|
||||||
@@ -290,8 +339,20 @@ def fetch_acs_for_state(state_fips):
|
|||||||
f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5/profile?"
|
f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5/profile?"
|
||||||
+ urllib.parse.urlencode(params)
|
+ urllib.parse.urlencode(params)
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
with urllib.request.urlopen(url, timeout=120) as response:
|
with urllib.request.urlopen(url, timeout=120) as response:
|
||||||
data = json.loads(response.read().decode("utf-8"))
|
body = response.read().decode("utf-8")
|
||||||
|
except urllib.error.HTTPError as exc:
|
||||||
|
body = exc.read().decode("utf-8", errors="replace")
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Census ACS request failed for state {state_fips}: HTTP {exc.code} — {body[:300]}"
|
||||||
|
) from exc
|
||||||
|
try:
|
||||||
|
data = json.loads(body)
|
||||||
|
except json.JSONDecodeError as exc:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Census ACS returned non-JSON for state {state_fips}: {body[:300]}"
|
||||||
|
) from exc
|
||||||
|
|
||||||
header = data[0]
|
header = data[0]
|
||||||
rows = []
|
rows = []
|
||||||
@@ -444,12 +505,15 @@ def create_final_table(conn):
|
|||||||
select
|
select
|
||||||
t.geoid,
|
t.geoid,
|
||||||
count(*)::integer as data_center_count,
|
count(*)::integer as data_center_count,
|
||||||
count(*) filter (where dc.geocode_precision = 'address_range')::integer
|
count(*) filter (where dc.source = 'curated')::integer
|
||||||
as address_range_data_center_count,
|
as curated_only_data_center_count,
|
||||||
count(*) filter (where dc.geocode_precision = 'city')::integer
|
count(*) filter (where dc.source = 'merged')::integer
|
||||||
as city_precision_data_center_count,
|
as merged_data_center_count,
|
||||||
array_agg(dc.id order by dc.id) as data_center_ids,
|
count(*) filter (where dc.source = 'osm')::integer
|
||||||
array_agg(distinct dc.provider order by dc.provider) as providers
|
as osm_only_data_center_count,
|
||||||
|
array_agg(dc.{POINT_ID_COL} order by dc.{POINT_ID_COL}) as data_center_ids,
|
||||||
|
array_agg(distinct dc.operator) filter (where dc.operator is not null)
|
||||||
|
as operators
|
||||||
from {BOUNDARY_STAGE_TABLE} t
|
from {BOUNDARY_STAGE_TABLE} t
|
||||||
join {POINT_TABLE} dc
|
join {POINT_TABLE} dc
|
||||||
on t.geom && dc.geom
|
on t.geom && dc.geom
|
||||||
@@ -469,10 +533,11 @@ def create_final_table(conn):
|
|||||||
'{ACS_SOURCE}'::text as acs_source,
|
'{ACS_SOURCE}'::text as acs_source,
|
||||||
a.acs_name,
|
a.acs_name,
|
||||||
d.data_center_count,
|
d.data_center_count,
|
||||||
d.address_range_data_center_count,
|
d.curated_only_data_center_count,
|
||||||
d.city_precision_data_center_count,
|
d.merged_data_center_count,
|
||||||
|
d.osm_only_data_center_count,
|
||||||
d.data_center_ids,
|
d.data_center_ids,
|
||||||
d.providers,
|
d.operators,
|
||||||
a.population,
|
a.population,
|
||||||
a.median_age,
|
a.median_age,
|
||||||
a.households,
|
a.households,
|
||||||
@@ -532,7 +597,7 @@ def create_final_table(conn):
|
|||||||
cur.execute(
|
cur.execute(
|
||||||
f"""
|
f"""
|
||||||
comment on table {FINAL_TABLE} is
|
comment on table {FINAL_TABLE} is
|
||||||
'Census tracts containing records from public.us_dc_sample_geocoded, enriched with ACS 2024 5-year profile demographics and derived primary industry fields.'
|
'Census tracts containing records from public.master_data_centers (curated + OSM merged), enriched with ACS 2024 5-year profile demographics and derived primary industry fields.'
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
cur.execute(f"analyze {FINAL_TABLE}")
|
cur.execute(f"analyze {FINAL_TABLE}")
|
||||||
@@ -550,7 +615,7 @@ def assign_point_geoids(conn):
|
|||||||
set geoid = matched.geoid
|
set geoid = matched.geoid
|
||||||
from (
|
from (
|
||||||
select
|
select
|
||||||
dc_inner.id,
|
dc_inner.{POINT_ID_COL} as point_id,
|
||||||
(
|
(
|
||||||
select t.geoid
|
select t.geoid
|
||||||
from {BOUNDARY_STAGE_TABLE} t
|
from {BOUNDARY_STAGE_TABLE} t
|
||||||
@@ -561,11 +626,11 @@ def assign_point_geoids(conn):
|
|||||||
) as geoid
|
) as geoid
|
||||||
from {POINT_TABLE} dc_inner
|
from {POINT_TABLE} dc_inner
|
||||||
) matched
|
) matched
|
||||||
where dc.id = matched.id
|
where dc.{POINT_ID_COL} = matched.point_id
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
cur.execute(
|
cur.execute(
|
||||||
f"create index if not exists us_dc_sample_geocoded_geoid_idx on {POINT_TABLE} (geoid)"
|
f"create index if not exists master_data_centers_geoid_idx on {POINT_TABLE} (geoid)"
|
||||||
)
|
)
|
||||||
cur.execute(f"analyze {POINT_TABLE}")
|
cur.execute(f"analyze {POINT_TABLE}")
|
||||||
|
|
||||||
@@ -586,13 +651,21 @@ def validate(conn):
|
|||||||
total_points = cur.fetchone()[0]
|
total_points = cur.fetchone()[0]
|
||||||
cur.execute(
|
cur.execute(
|
||||||
f"""
|
f"""
|
||||||
select geocode_precision, count(*)::integer
|
select source, count(*)::integer
|
||||||
from {POINT_TABLE}
|
from {POINT_TABLE}
|
||||||
group by geocode_precision
|
group by source
|
||||||
order by geocode_precision
|
order by source
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
point_precision = cur.fetchall()
|
point_source_breakdown = cur.fetchall()
|
||||||
|
cur.execute(
|
||||||
|
f"""
|
||||||
|
select count(*)::integer
|
||||||
|
from {POINT_TABLE}
|
||||||
|
where geoid is null
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
unassigned_points = cur.fetchone()[0]
|
||||||
cur.execute(
|
cur.execute(
|
||||||
f"""
|
f"""
|
||||||
select count(*)::integer
|
select count(*)::integer
|
||||||
@@ -601,7 +674,7 @@ def validate(conn):
|
|||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
missing_acs = cur.fetchone()[0]
|
missing_acs = cur.fetchone()[0]
|
||||||
return summary, total_points, point_precision, missing_acs
|
return summary, total_points, point_source_breakdown, unassigned_points, missing_acs
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
@@ -638,7 +711,7 @@ def main():
|
|||||||
load_acs_stage(conn, acs_rows, acs_fieldnames)
|
load_acs_stage(conn, acs_rows, acs_fieldnames)
|
||||||
create_final_table(conn)
|
create_final_table(conn)
|
||||||
assign_point_geoids(conn)
|
assign_point_geoids(conn)
|
||||||
summary, total_points, point_precision, missing_acs = validate(conn)
|
summary, total_points, point_source_breakdown, unassigned_points, missing_acs = validate(conn)
|
||||||
finally:
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
@@ -649,7 +722,8 @@ def main():
|
|||||||
summary[0], summary[1], summary[2], total_points
|
summary[0], summary[1], summary[2], total_points
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
print("point_precision=" + ", ".join(f"{k}:{v}" for k, v in point_precision))
|
print("point_source=" + ", ".join(f"{k}:{v}" for k, v in point_source_breakdown))
|
||||||
|
print(f"points_unassigned_to_tract={unassigned_points}")
|
||||||
print(f"tracts_missing_acs_population={missing_acs}")
|
print(f"tracts_missing_acs_population={missing_acs}")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
376
load_postgis_osm_data_centers.py
Normal file
376
load_postgis_osm_data_centers.py
Normal file
@@ -0,0 +1,376 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Fetch US data centers from OpenStreetMap (Overpass API) and load them into
|
||||||
|
public.osm_data_centers in the data_centers database. Also (re)creates a
|
||||||
|
unioned view public.data_centers_union combining OSM + curated rows from
|
||||||
|
public.us_dc_sample_geocoded.
|
||||||
|
|
||||||
|
Two Overpass passes are made because tagging is inconsistent:
|
||||||
|
1) telecom=data_center
|
||||||
|
2) building=data_center
|
||||||
|
|
||||||
|
Results are deduplicated by (osm_type, osm_id); the matched tag-pass is recorded
|
||||||
|
in matched_tags so we can see which query found each feature.
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
import requests
|
||||||
|
from psycopg2.extras import Json, execute_values
|
||||||
|
|
||||||
|
# Public Overpass API endpoint that receives the POSTed queries.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"

# Relations read or written by this loader.
TABLE = "public.osm_data_centers"
VIEW = "public.data_centers_union"
CURATED_TABLE = "public.us_dc_sample_geocoded"
DB_NAME = "data_centers"

# One Overpass query is issued per (key, value) pair; the pair that found a
# feature is recorded in that feature's matched_tags.
TAG_PASSES = [
    ("telecom", "data_center"),
    ("building", "data_center"),
]
|
||||||
|
|
||||||
|
|
||||||
|
def overpass_query(tag_key: str, tag_value: str, timeout: int = 180) -> str:
    """Build the Overpass QL query for one tag pass over the United States.

    The query selects nodes, ways and relations carrying ``tag_key=tag_value``
    inside the area tagged ISO3166-1=US / admin_level=2 and asks for JSON
    output with tags and a center point.

    Args:
        tag_key: OSM tag key to match.
        tag_value: OSM tag value to match.
        timeout: Value placed in the query's ``[timeout:...]`` setting.

    Returns:
        The query text with no leading or trailing whitespace.
    """
    tag_filter = f'["{tag_key}"="{tag_value}"](area.us);'
    statements = [
        f"[out:json][timeout:{timeout}];",
        'area["ISO3166-1"="US"][admin_level=2]->.us;',
        "(",
        *(f"{element_kind}{tag_filter}" for element_kind in ("node", "way", "relation")),
        ");",
        "out center tags;",
    ]
    return "\n".join(statements)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_pass(tag_key: str, tag_value: str, cache_path: Optional[str]) -> List[dict]:
    """Return the Overpass elements for one tag pass, using a file cache if given.

    When ``cache_path`` names an existing file, its JSON content is used and no
    request is made. Otherwise the query is POSTed to OVERPASS_URL and, if
    ``cache_path`` is set, the response is written there.

    Overpass reports runtime failures (for example a query timeout) in a
    ``remark`` field of an otherwise successful response. Such a payload is
    rejected, and never cached, because its ``elements`` list may be incomplete.

    Args:
        tag_key: OSM tag key of the pass.
        tag_value: OSM tag value of the pass.
        cache_path: File used to read/write the raw response, or None to
            always query the API and store nothing.

    Returns:
        The payload's ``elements`` list (empty when the key is absent).

    Raises:
        RuntimeError: If the payload is not a JSON object or carries a remark.
        requests.HTTPError: If Overpass answers with an HTTP error status.
    """
    if cache_path and os.path.exists(cache_path):
        print(f" using cached response: {cache_path}")
        with open(cache_path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
        _reject_failed_overpass_payload(payload, f"cache file {cache_path}")
    else:
        query = overpass_query(tag_key, tag_value)
        print(f" querying Overpass for {tag_key}={tag_value} ...")
        headers = {
            "User-Agent": "us-data-centers-inventory/1.0 (research; contact david@dadams.io)",
            "Accept": "application/json",
        }
        resp = requests.post(
            OVERPASS_URL,
            data={"data": query},
            headers=headers,
            timeout=240,
        )
        if resp.status_code != 200:
            # Show the start of the body first; Overpass explains rate limits
            # and syntax errors there.
            print(f" Overpass returned {resp.status_code}: {resp.text[:500]}")
            resp.raise_for_status()
        payload = resp.json()
        # Validate before caching so a failed run cannot poison later runs.
        _reject_failed_overpass_payload(payload, f"{tag_key}={tag_value}")
        if cache_path:
            with open(cache_path, "w", encoding="utf-8") as fh:
                json.dump(payload, fh)
            print(f" cached to {cache_path}")
    elements = payload.get("elements", [])
    print(f" pass returned {len(elements)} elements")
    return elements


def _reject_failed_overpass_payload(payload, origin: str) -> None:
    """Raise RuntimeError when an Overpass payload is malformed or reports a failure."""
    if not isinstance(payload, dict):
        raise RuntimeError(f"Overpass payload from {origin} is not a JSON object")
    remark = payload.get("remark")
    if remark:
        raise RuntimeError(f"Overpass reported a problem for {origin}: {remark}")
|
||||||
|
|
||||||
|
|
||||||
|
def element_coords(elem: dict) -> Tuple[Optional[float], Optional[float]]:
    """Return ``(lon, lat)`` for an Overpass element.

    Nodes carry their coordinates directly; every other element type is read
    from its ``center`` object. Either value is None when it is not present.
    """
    if elem.get("type") == "node":
        holder = elem
    else:
        # A missing or null "center" is treated as an empty mapping.
        holder = elem.get("center") or {}
    return holder.get("lon"), holder.get("lat")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_element(elem: dict, matched_tag: str) -> Optional[dict]:
    """Convert one Overpass element into a row dict for the OSM table.

    Args:
        elem: Raw element from the Overpass ``elements`` list.
        matched_tag: Label of the tag pass that returned the element; stored
            as the single entry of ``matched_tags``.

    Returns:
        A dict keyed by the table's column names, or None when the element has
        no coordinates, no ``type`` or no ``id``.
    """
    lon, lat = element_coords(elem)
    osm_type = elem.get("type")
    osm_id = elem.get("id")
    if None in (lon, lat, osm_type, osm_id):
        return None

    tags = elem.get("tags") or {}

    # "<housenumber> <street>" from whichever parts exist; None when both are absent.
    address_parts = [tags.get("addr:housenumber"), tags.get("addr:street")]
    street_address = " ".join(part for part in address_parts if part) or None

    row = {
        "id": f"{osm_type}/{osm_id}",
        "osm_type": osm_type,
        "osm_id": int(osm_id),
    }
    # Columns copied straight from the tag of the given name.
    row["name"] = tags.get("name")
    row["operator"] = tags.get("operator")
    row["operator_type"] = tags.get("operator:type")
    row["telecom"] = tags.get("telecom")
    row["building"] = tags.get("building")
    row["power"] = tags.get("power")
    # Contact details fall back to the "contact:" namespace.
    row["website"] = tags.get("website") or tags.get("contact:website")
    row["phone"] = tags.get("phone") or tags.get("contact:phone")
    row["street_address"] = street_address
    row["city"] = tags.get("addr:city")
    row["state"] = tags.get("addr:state")
    row["postal_code"] = tags.get("addr:postcode")
    # The queries are restricted to the US area, so default the country to it.
    row["country"] = tags.get("addr:country") or "US"
    row["matched_tags"] = [matched_tag]
    row["tags"] = tags
    row["longitude"] = float(lon)
    row["latitude"] = float(lat)
    return row
|
||||||
|
|
||||||
|
|
||||||
|
def merge_records(existing: Dict[str, dict], new_rows: List[dict]) -> None:
    """Fold ``new_rows`` into ``existing`` in place, keyed by each row's ``id``.

    A row with an unseen id is stored as-is. For an id already present, the
    ``matched_tags`` lists are concatenated with duplicates removed (first
    occurrence wins), and every other field of the stored row that is None or
    "" is filled from the new row when the new value is neither.
    """
    blank = (None, "")
    for incoming in new_rows:
        record_id = incoming["id"]
        current = existing.get(record_id)
        if current is None:
            existing[record_id] = incoming
            continue

        # dict.fromkeys de-duplicates while keeping first-seen order.
        combined = current["matched_tags"] + incoming["matched_tags"]
        current["matched_tags"] = list(dict.fromkeys(combined))

        # First non-blank value wins: only fill gaps, never overwrite.
        gap_fillers = {
            field: value
            for field, value in incoming.items()
            if field != "matched_tags"
            and value not in blank
            and current.get(field) in blank
        }
        current.update(gap_fillers)
|
||||||
|
|
||||||
|
|
||||||
|
# Insert column order for the OSM table. row_to_tuple() must yield values in
# exactly this order, and insert_values() builds its column list from it.
COLUMNS = [
    # identity
    "id", "osm_type", "osm_id",
    # descriptive tags
    "name", "operator", "operator_type", "telecom", "building", "power",
    # contact
    "website", "phone",
    # address
    "street_address", "city", "state", "postal_code", "country",
    # provenance and raw tags
    "matched_tags", "tags",
    # location
    "longitude", "latitude",
]
|
||||||
|
|
||||||
|
|
||||||
|
def row_to_tuple(row: dict) -> tuple:
    """Return the values of ``row`` as a tuple ordered like COLUMNS.

    ``id``, ``osm_type``, ``osm_id``, ``longitude`` and ``latitude`` are
    required (a missing key raises KeyError). ``matched_tags`` defaults to an
    empty list, ``tags`` is wrapped in ``Json`` (default empty dict), and all
    remaining columns default to None.
    """
    required = {"id", "osm_type", "osm_id", "longitude", "latitude"}
    values = []
    for column in COLUMNS:
        if column in required:
            values.append(row[column])
        elif column == "matched_tags":
            values.append(row.get(column, []))
        elif column == "tags":
            # Wrapped so psycopg2 adapts the dict as a JSON value.
            values.append(Json(row.get(column, {})))
        else:
            values.append(row.get(column))
    return tuple(values)
|
||||||
|
|
||||||
|
|
||||||
|
def create_table(cur) -> None:
    """Create the OSM table and its indexes using cursor ``cur``.

    The statements are plain ``create`` (no ``if not exists``), so the table
    and indexes must not already exist. ``geom`` is a stored generated column
    built from longitude/latitude.
    """
    table_ddl = f"""
        create table {TABLE} (
            id text primary key,
            osm_type text not null,
            osm_id bigint not null,
            name text,
            operator text,
            operator_type text,
            telecom text,
            building text,
            power text,
            website text,
            phone text,
            street_address text,
            city text,
            state text,
            postal_code text,
            country text,
            matched_tags text[] not null default '{{}}',
            tags jsonb not null default '{{}}'::jsonb,
            longitude double precision not null,
            latitude double precision not null,
            ingested_at timestamptz not null default now(),
            geom geometry(Point, 4326) generated always as
                (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
        )
        """
    index_ddl = (
        f"create index osm_data_centers_geom_gix on {TABLE} using gist (geom)",
        f"create index osm_data_centers_state_idx on {TABLE} (state)",
        f"create index osm_data_centers_tags_gin on {TABLE} using gin (tags)",
    )

    cur.execute(table_ddl)
    for statement in index_ddl:
        cur.execute(statement)
|
||||||
|
|
||||||
|
|
||||||
|
def insert_values(cur, rows: List[dict], upsert: bool) -> None:
    """Bulk-insert ``rows`` into the OSM table in pages of 200.

    Args:
        cur: Open database cursor.
        rows: Row dicts accepted by row_to_tuple().
        upsert: When true, a conflict on ``id`` overwrites every other column
            from the new row and resets ``ingested_at`` to now(); when false,
            a conflict is left to fail.
    """
    column_list = ", ".join(COLUMNS)
    statement = f"insert into {TABLE} ({column_list}) values %s"
    if upsert:
        set_clause = ", ".join(
            f"{column} = excluded.{column}" for column in COLUMNS if column != "id"
        )
        statement = (
            f"{statement} on conflict (id) do update set {set_clause}, ingested_at = now()"
        )
    payload = [row_to_tuple(row) for row in rows]
    execute_values(cur, statement, payload, page_size=200)
|
||||||
|
|
||||||
|
|
||||||
|
def create_or_replace_view(cur) -> None:
    """Create or replace the view that stacks curated rows on top of OSM rows.

    Curated rows get ids prefixed with 'curated/' and source 'curated'; their
    facility_name, provider, state_code and url columns are exposed as name,
    operator, state and website so that both halves share one column list.
    The halves are combined with ``union all``; no de-duplication is done.
    """
    view_ddl = f"""
        create or replace view {VIEW} as
        select
            'curated/' || id as id,
            'curated'::text as source,
            facility_name as name,
            provider as operator,
            street_address,
            city,
            state_code as state,
            postal_code,
            country,
            url as website,
            phone,
            longitude,
            latitude,
            geom
        from {CURATED_TABLE}
        union all
        select
            id,
            'osm'::text as source,
            name,
            operator,
            street_address,
            city,
            state,
            postal_code,
            country,
            website,
            phone,
            longitude,
            latitude,
            geom
        from {TABLE}
        """
    cur.execute(view_ddl)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of this loader script.

    Returns:
        Namespace with the attributes ``cache_dir`` (str), ``no_cache``
        (bool), ``recreate`` (bool), ``upsert`` (bool, True unless
        ``--no-upsert`` is given) and ``skip_view`` (bool).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--cache-dir",
        default="output",
        help="Directory to cache raw Overpass responses (default: output/).",
    )
    parser.add_argument(
        "--no-cache",
        action="store_true",
        help="Do not read or write Overpass cache files; always hit the API.",
    )
    parser.add_argument(
        "--recreate",
        action="store_true",
        help=f"Drop and recreate {TABLE} before loading.",
    )
    # ``--upsert`` alone (store_true with default=True) could never be
    # switched off. The flag is kept so existing invocations still parse,
    # and ``--no-upsert`` writes False to the same destination.
    parser.add_argument(
        "--upsert",
        dest="upsert",
        action="store_true",
        default=True,
        help="On id conflicts, update the existing row (default: on).",
    )
    parser.add_argument(
        "--no-upsert",
        dest="upsert",
        action="store_false",
        help="Insert without ON CONFLICT handling (disables --upsert).",
    )
    parser.add_argument(
        "--skip-view",
        action="store_true",
        help=f"Do not create/replace the unioned view {VIEW}.",
    )
    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Fetch OSM data-center features, load them into Postgres, refresh the view.

    Runs one Overpass pass per entry of ``TAG_PASSES``, normalizes and
    merges the results, then (in a single transaction) creates the table if
    it is missing, inserts the rows, analyzes the table and, unless
    ``--skip-view`` is given, (re)creates the unioned view when the curated
    table exists.

    Returns:
        0 on success; 1 when no rows were fetched, in which case the
        database is not touched.

    Raises:
        KeyError: If PGWEB_HOST, PGWEB_PORT, PGWEB_USER or PGWEB_PASSWORD
            is missing from the environment.
    """
    args = parse_args()

    # --no-cache promises not to write cache files, so do not create the
    # cache directory either.
    if not args.no_cache:
        os.makedirs(args.cache_dir, exist_ok=True)

    merged: Dict[str, dict] = {}
    for pass_index, (tag_key, tag_value) in enumerate(TAG_PASSES):
        if pass_index:
            # be polite to Overpass between passes; sleeping before every
            # pass but the first avoids a pointless wait after the last one
            time.sleep(2)
        cache_path = (
            None
            if args.no_cache
            else os.path.join(args.cache_dir, f"overpass_{tag_key}_{tag_value}.json")
        )
        print(f"Pass: {tag_key}={tag_value}")
        elements = fetch_pass(tag_key, tag_value, cache_path)
        normalized = [
            row
            for row in (normalize_element(e, f"{tag_key}={tag_value}") for e in elements)
            if row is not None
        ]
        print(f"  normalized {len(normalized)} rows with coords")
        merge_records(merged, normalized)

    rows = list(merged.values())
    print(f"Total deduped OSM data-center features: {len(rows)}")
    if not rows:
        print("No rows fetched; aborting DB load.", file=sys.stderr)
        return 1

    conn = psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname=DB_NAME,
    )
    try:
        # ``with conn`` commits on success and rolls back on error; it does
        # not close the connection, hence the ``finally`` below.
        with conn:
            with conn.cursor() as cur:
                cur.execute("create extension if not exists postgis")
                if args.recreate:
                    cur.execute(f"drop table if exists {TABLE} cascade")
                # to_regclass returns NULL when the relation does not exist.
                cur.execute("select to_regclass(%s)", (TABLE,))
                if cur.fetchone()[0] is None:
                    create_table(cur)
                insert_values(cur, rows, upsert=args.upsert)
                cur.execute(f"analyze {TABLE}")
                if not args.skip_view:
                    # The view selects from the curated table, so it can
                    # only be built when that table is present.
                    cur.execute("select to_regclass(%s)", (CURATED_TABLE,))
                    if cur.fetchone()[0] is not None:
                        create_or_replace_view(cur)
                        print(f"View {VIEW} (re)created.")
                    else:
                        print(
                            f"Skipping view: {CURATED_TABLE} does not exist.",
                            file=sys.stderr,
                        )
                cur.execute(f"select count(*) from {TABLE}")
                total = cur.fetchone()[0]
    finally:
        conn.close()

    print(f"Loaded {len(rows)} rows into {TABLE}; table now has {total} rows total.")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
@@ -8,7 +8,7 @@ import psycopg2
|
|||||||
|
|
||||||
|
|
||||||
DB_NAME = "data_centers"
|
DB_NAME = "data_centers"
|
||||||
POINT_TABLE = "public.us_dc_sample_geocoded"
|
POINT_TABLE = "public.master_data_centers"
|
||||||
|
|
||||||
|
|
||||||
def connect():
|
def connect():
|
||||||
@@ -26,15 +26,17 @@ def load_points(conn):
|
|||||||
cur.execute(
|
cur.execute(
|
||||||
f"""
|
f"""
|
||||||
select
|
select
|
||||||
id,
|
master_id,
|
||||||
coalesce(provider, '') as provider,
|
source,
|
||||||
coalesce(facility_name, '') as facility_name,
|
coalesce(operator, '') as operator,
|
||||||
|
coalesce(name, '') as name,
|
||||||
coalesce(city, '') as city,
|
coalesce(city, '') as city,
|
||||||
coalesce(state_code, '') as state_code,
|
coalesce(state, '') as state,
|
||||||
longitude,
|
longitude,
|
||||||
latitude,
|
latitude,
|
||||||
coalesce(geocode_source, '') as geocode_source,
|
coalesce(curated_id, '') as curated_id,
|
||||||
coalesce(geocode_precision, '') as geocode_precision,
|
coalesce(osm_id, '') as osm_id,
|
||||||
|
coalesce(match_method, '') as match_method,
|
||||||
coalesce(geoid, '') as geoid
|
coalesce(geoid, '') as geoid
|
||||||
from {POINT_TABLE}
|
from {POINT_TABLE}
|
||||||
where longitude is not null and latitude is not null
|
where longitude is not null and latitude is not null
|
||||||
@@ -47,15 +49,17 @@ def load_points(conn):
|
|||||||
points.append(
|
points.append(
|
||||||
{
|
{
|
||||||
"id": row[0],
|
"id": row[0],
|
||||||
"provider": row[1],
|
"source": row[1],
|
||||||
"facility_name": row[2],
|
"operator": row[2],
|
||||||
"city": row[3],
|
"name": row[3],
|
||||||
"state_code": row[4],
|
"city": row[4],
|
||||||
"lon": float(row[5]),
|
"state": row[5],
|
||||||
"lat": float(row[6]),
|
"lon": float(row[6]),
|
||||||
"geocode_source": row[7],
|
"lat": float(row[7]),
|
||||||
"geocode_precision": row[8],
|
"curated_id": row[8],
|
||||||
"geoid": row[9],
|
"osm_id": row[9],
|
||||||
|
"match_method": row[10],
|
||||||
|
"geoid": row[11],
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
return points
|
return points
|
||||||
@@ -70,12 +74,12 @@ def compute_center(points):
|
|||||||
|
|
||||||
|
|
||||||
def build_stats(points):
|
def build_stats(points):
|
||||||
by_source = Counter(p["geocode_source"] or "(blank)" for p in points)
|
by_source = Counter(p["source"] or "(blank)" for p in points)
|
||||||
by_precision = Counter(p["geocode_precision"] or "(blank)" for p in points)
|
by_match = Counter(p["match_method"] or "(none)" for p in points)
|
||||||
return {
|
return {
|
||||||
"total": len(points),
|
"total": len(points),
|
||||||
"by_source": dict(sorted(by_source.items(), key=lambda x: x[0])),
|
"by_source": dict(sorted(by_source.items(), key=lambda x: x[0])),
|
||||||
"by_precision": dict(sorted(by_precision.items(), key=lambda x: x[0])),
|
"by_match_method": dict(sorted(by_match.items(), key=lambda x: x[0])),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -89,7 +93,7 @@ def render_html(points, center_lat, center_lon, output_path):
|
|||||||
<head>
|
<head>
|
||||||
<meta charset=\"utf-8\" />
|
<meta charset=\"utf-8\" />
|
||||||
<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
|
<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
|
||||||
<title>US Data Centers Map</title>
|
<title>US Data Centers Master Map</title>
|
||||||
<link rel=\"stylesheet\" href=\"https://unpkg.com/leaflet@1.9.4/dist/leaflet.css\" />
|
<link rel=\"stylesheet\" href=\"https://unpkg.com/leaflet@1.9.4/dist/leaflet.css\" />
|
||||||
<style>
|
<style>
|
||||||
html, body {{ height: 100%; margin: 0; font-family: system-ui, -apple-system, Segoe UI, sans-serif; }}
|
html, body {{ height: 100%; margin: 0; font-family: system-ui, -apple-system, Segoe UI, sans-serif; }}
|
||||||
@@ -109,17 +113,17 @@ def render_html(points, center_lat, center_lon, output_path):
|
|||||||
<body>
|
<body>
|
||||||
<div id=\"layout\">
|
<div id=\"layout\">
|
||||||
<div id=\"panel\">
|
<div id=\"panel\">
|
||||||
<h1>US Data Centers</h1>
|
<h1>US Data Centers (Master)</h1>
|
||||||
<div class=\"stat-row\"><span>Total points</span><strong id=\"total\"></strong></div>
|
<div class=\"stat-row\"><span>Total points</span><strong id=\"total\"></strong></div>
|
||||||
<h2>Geocode Source</h2>
|
<h2>Source</h2>
|
||||||
<div id=\"sourceStats\"></div>
|
<div id=\"sourceStats\"></div>
|
||||||
<h2>Geocode Precision</h2>
|
<h2>Match Method (merged rows)</h2>
|
||||||
<div id=\"precisionStats\"></div>
|
<div id=\"matchStats\"></div>
|
||||||
<h2>Source Colors</h2>
|
<h2>Source Colors</h2>
|
||||||
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#1f77b4\"></span>IM3_Existing_DataCenters</span></div>
|
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#2ca02c\"></span>merged (curated + OSM)</span></div>
|
||||||
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#2ca02c\"></span>US Census Geocoder</span></div>
|
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#1f77b4\"></span>curated only</span></div>
|
||||||
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#ff7f0e\"></span>Nominatim/OpenStreetMap</span></div>
|
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#ff7f0e\"></span>osm only</span></div>
|
||||||
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#7f7f7f\"></span>Other/Blank</span></div>
|
<div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#7f7f7f\"></span>other</span></div>
|
||||||
</div>
|
</div>
|
||||||
<div id=\"map\"></div>
|
<div id=\"map\"></div>
|
||||||
</div>
|
</div>
|
||||||
@@ -130,9 +134,9 @@ def render_html(points, center_lat, center_lon, output_path):
|
|||||||
const stats = {stats_json};
|
const stats = {stats_json};
|
||||||
|
|
||||||
function colorForSource(source) {{
|
function colorForSource(source) {{
|
||||||
if (source === 'IM3_Existing_DataCenters') return '#1f77b4';
|
if (source === 'merged') return '#2ca02c';
|
||||||
if (source === 'US Census Geocoder') return '#2ca02c';
|
if (source === 'curated') return '#1f77b4';
|
||||||
if (source === 'Nominatim/OpenStreetMap') return '#ff7f0e';
|
if (source === 'osm') return '#ff7f0e';
|
||||||
return '#7f7f7f';
|
return '#7f7f7f';
|
||||||
}}
|
}}
|
||||||
|
|
||||||
@@ -156,22 +160,26 @@ def render_html(points, center_lat, center_lon, output_path):
|
|||||||
for (const p of points) {{
|
for (const p of points) {{
|
||||||
const marker = L.circleMarker([p.lat, p.lon], {{
|
const marker = L.circleMarker([p.lat, p.lon], {{
|
||||||
radius: 4,
|
radius: 4,
|
||||||
color: colorForSource(p.geocode_source),
|
color: colorForSource(p.source),
|
||||||
fillColor: colorForSource(p.geocode_source),
|
fillColor: colorForSource(p.source),
|
||||||
fillOpacity: 0.7,
|
fillOpacity: 0.7,
|
||||||
weight: 1
|
weight: 1
|
||||||
}});
|
}});
|
||||||
|
|
||||||
const title = p.facility_name || p.id;
|
const title = p.name || p.id;
|
||||||
const provider = p.provider || '(unknown provider)';
|
const operator = p.operator || '(unknown operator)';
|
||||||
const cityState = [p.city, p.state_code].filter(Boolean).join(', ');
|
const cityState = [p.city, p.state].filter(Boolean).join(', ');
|
||||||
|
const provenance = [
|
||||||
|
p.curated_id ? 'curated_id=' + escapeHtml(p.curated_id) : null,
|
||||||
|
p.osm_id ? 'osm_id=' + escapeHtml(p.osm_id) : null,
|
||||||
|
p.match_method ? 'match=' + escapeHtml(p.match_method) : null,
|
||||||
|
].filter(Boolean).join('<br>');
|
||||||
marker.bindPopup(`
|
marker.bindPopup(`
|
||||||
<strong>${{escapeHtml(title)}}</strong><br>
|
<strong>${{escapeHtml(title)}}</strong><br>
|
||||||
Provider: ${{escapeHtml(provider)}}<br>
|
Operator: ${{escapeHtml(operator)}}<br>
|
||||||
ID: ${{escapeHtml(p.id)}}<br>
|
|
||||||
Location: ${{escapeHtml(cityState)}}<br>
|
Location: ${{escapeHtml(cityState)}}<br>
|
||||||
Source: ${{escapeHtml(p.geocode_source)}}<br>
|
Source: ${{escapeHtml(p.source)}}<br>
|
||||||
Precision: ${{escapeHtml(p.geocode_precision)}}<br>
|
${{provenance ? provenance + '<br>' : ''}}
|
||||||
GEOID: ${{escapeHtml(p.geoid)}}
|
GEOID: ${{escapeHtml(p.geoid)}}
|
||||||
`);
|
`);
|
||||||
|
|
||||||
@@ -193,12 +201,12 @@ def render_html(points, center_lat, center_lon, output_path):
|
|||||||
sourceStats.appendChild(div);
|
sourceStats.appendChild(div);
|
||||||
}}
|
}}
|
||||||
|
|
||||||
const precisionStats = document.getElementById('precisionStats');
|
const matchStats = document.getElementById('matchStats');
|
||||||
for (const [k, v] of Object.entries(stats.by_precision)) {{
|
for (const [k, v] of Object.entries(stats.by_match_method)) {{
|
||||||
const div = document.createElement('div');
|
const div = document.createElement('div');
|
||||||
div.className = 'stat-row';
|
div.className = 'stat-row';
|
||||||
div.innerHTML = `<span>${{escapeHtml(k)}}</span><strong>${{v}}</strong>`;
|
div.innerHTML = `<span>${{escapeHtml(k)}}</span><strong>${{v}}</strong>`;
|
||||||
precisionStats.appendChild(div);
|
matchStats.appendChild(div);
|
||||||
}}
|
}}
|
||||||
</script>
|
</script>
|
||||||
</body>
|
</body>
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import psycopg2
|
|||||||
|
|
||||||
|
|
||||||
DB_NAME = "data_centers"
|
DB_NAME = "data_centers"
|
||||||
DC_TABLE = "public.us_dc_sample_geocoded"
|
DC_TABLE = "public.master_data_centers"
|
||||||
CABLES_TABLE = "public.internet_cables"
|
CABLES_TABLE = "public.internet_cables"
|
||||||
CITY_TABLE = "public.internet_city_dominance"
|
CITY_TABLE = "public.internet_city_dominance"
|
||||||
|
|
||||||
@@ -30,14 +30,14 @@ def load_data_centers(conn):
|
|||||||
cur.execute(
|
cur.execute(
|
||||||
f"""
|
f"""
|
||||||
select
|
select
|
||||||
id,
|
master_id,
|
||||||
coalesce(provider, ''),
|
source,
|
||||||
coalesce(facility_name, ''),
|
coalesce(operator, ''),
|
||||||
|
coalesce(name, ''),
|
||||||
coalesce(city, ''),
|
coalesce(city, ''),
|
||||||
coalesce(state_code, ''),
|
coalesce(state, ''),
|
||||||
longitude,
|
longitude,
|
||||||
latitude,
|
latitude
|
||||||
coalesce(geocode_source, '')
|
|
||||||
from {DC_TABLE}
|
from {DC_TABLE}
|
||||||
where longitude is not null and latitude is not null
|
where longitude is not null and latitude is not null
|
||||||
"""
|
"""
|
||||||
@@ -45,13 +45,13 @@ def load_data_centers(conn):
|
|||||||
return [
|
return [
|
||||||
{
|
{
|
||||||
"id": r[0],
|
"id": r[0],
|
||||||
"provider": r[1],
|
"source": r[1],
|
||||||
"facility_name": r[2],
|
"operator": r[2],
|
||||||
"city": r[3],
|
"name": r[3],
|
||||||
"state_code": r[4],
|
"city": r[4],
|
||||||
"lon": float(r[5]),
|
"state": r[5],
|
||||||
"lat": float(r[6]),
|
"lon": float(r[6]),
|
||||||
"geocode_source": r[7],
|
"lat": float(r[7]),
|
||||||
}
|
}
|
||||||
for r in cur.fetchall()
|
for r in cur.fetchall()
|
||||||
]
|
]
|
||||||
@@ -181,10 +181,10 @@ def render_html(data_centers, cables_geojson, cities, output_path):
|
|||||||
<label class="toggle"><input type="checkbox" id="tCities" checked> City dominance</label>
|
<label class="toggle"><input type="checkbox" id="tCities" checked> City dominance</label>
|
||||||
|
|
||||||
<h2>Data center source</h2>
|
<h2>Data center source</h2>
|
||||||
<div class="row"><span><span class="swatch" style="background:#1f77b4"></span>IM3_Existing_DataCenters</span></div>
|
<div class="row"><span><span class="swatch" style="background:#2ca02c"></span>merged (curated + OSM)</span></div>
|
||||||
<div class="row"><span><span class="swatch" style="background:#2ca02c"></span>US Census Geocoder</span></div>
|
<div class="row"><span><span class="swatch" style="background:#1f77b4"></span>curated only</span></div>
|
||||||
<div class="row"><span><span class="swatch" style="background:#ff7f0e"></span>Nominatim/OpenStreetMap</span></div>
|
<div class="row"><span><span class="swatch" style="background:#ff7f0e"></span>osm only</span></div>
|
||||||
<div class="row"><span><span class="swatch" style="background:#7f7f7f"></span>Other</span></div>
|
<div class="row"><span><span class="swatch" style="background:#7f7f7f"></span>other</span></div>
|
||||||
|
|
||||||
<h2>City dominance</h2>
|
<h2>City dominance</h2>
|
||||||
<div class="row"><span><span class="swatch" style="background:#9b59b6;border-radius:50%"></span>Sized by physical Tbps</span></div>
|
<div class="row"><span><span class="swatch" style="background:#9b59b6;border-radius:50%"></span>Sized by physical Tbps</span></div>
|
||||||
@@ -197,9 +197,9 @@ def render_html(data_centers, cables_geojson, cities, output_path):
|
|||||||
const DATA = __PAYLOAD__;
|
const DATA = __PAYLOAD__;
|
||||||
|
|
||||||
function colorForSource(source) {
|
function colorForSource(source) {
|
||||||
if (source === 'IM3_Existing_DataCenters') return '#1f77b4';
|
if (source === 'merged') return '#2ca02c';
|
||||||
if (source === 'US Census Geocoder') return '#2ca02c';
|
if (source === 'curated') return '#1f77b4';
|
||||||
if (source === 'Nominatim/OpenStreetMap') return '#ff7f0e';
|
if (source === 'osm') return '#ff7f0e';
|
||||||
return '#7f7f7f';
|
return '#7f7f7f';
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -262,19 +262,19 @@ def render_html(data_centers, cables_geojson, cities, output_path):
|
|||||||
for (const p of DATA.data_centers) {
|
for (const p of DATA.data_centers) {
|
||||||
const m = L.circleMarker([p.lat, p.lon], {
|
const m = L.circleMarker([p.lat, p.lon], {
|
||||||
radius: 3,
|
radius: 3,
|
||||||
color: colorForSource(p.geocode_source),
|
color: colorForSource(p.source),
|
||||||
fillColor: colorForSource(p.geocode_source),
|
fillColor: colorForSource(p.source),
|
||||||
fillOpacity: 0.85,
|
fillOpacity: 0.85,
|
||||||
weight: 0.8,
|
weight: 0.8,
|
||||||
});
|
});
|
||||||
const title = p.facility_name || p.id;
|
const title = p.name || p.id;
|
||||||
const provider = p.provider || '(unknown provider)';
|
const operator = p.operator || '(unknown operator)';
|
||||||
const cityState = [p.city, p.state_code].filter(Boolean).join(', ');
|
const cityState = [p.city, p.state].filter(Boolean).join(', ');
|
||||||
m.bindPopup(`
|
m.bindPopup(`
|
||||||
<strong>${esc(title)}</strong><br>
|
<strong>${esc(title)}</strong><br>
|
||||||
Provider: ${esc(provider)}<br>
|
Operator: ${esc(operator)}<br>
|
||||||
Location: ${esc(cityState)}<br>
|
Location: ${esc(cityState)}<br>
|
||||||
Source: ${esc(p.geocode_source)}
|
Source: ${esc(p.source)}
|
||||||
`);
|
`);
|
||||||
dcLayer.addLayer(m);
|
dcLayer.addLayer(m);
|
||||||
dcBounds.push([p.lat, p.lon]);
|
dcBounds.push([p.lat, p.lon]);
|
||||||
|
|||||||
1
output/overpass_building_data_center.json
Normal file
1
output/overpass_building_data_center.json
Normal file
File diff suppressed because one or more lines are too long
1
output/overpass_telecom_data_center.json
Normal file
1
output/overpass_telecom_data_center.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user