Add master data center merge workflow

2026-05-17 18:53:16 -07:00
parent 8fcbb18e37
commit 90e8b21423
10 changed files with 11892 additions and 9599 deletions
--- a/build_master_data_centers.py
+++ b/build_master_data_centers.py
@@ -0,0 +1,258 @@
 #!/usr/bin/env python3
 """
 Build (or refresh) public.master_data_centers by merging:
  - public.us_dc_sample_geocoded   (curated, attribute-rich)
  - public.osm_data_centers        (OpenStreetMap features)
 Deduplication rule (curated row wins):
  Step 1: for each curated row, find a matching OSM row by
            curated.id = osm.osm_id::text                                OR
            curated.nominatim_osm_id = osm.osm_id                        OR
            ST_DWithin(curated.geom, osm.geom, 150 m, geography)
          (when several OSM rows match: id equality first, then nominatim id,
          then the closest by sphere distance).
  Step 2: insert every curated row into master, filling NULLs from the
          matched OSM row when present. source = 'merged' if matched,
          otherwise 'curated'.
  Step 3: insert every OSM row whose osm_id was NOT matched in Step 1.
          source = 'osm'.
 Result: every curated row appears once; OSM-only rows appear once; no row is
 emitted twice. The merge logic lives in a SQL function
 public.refresh_master_data_centers() so subsequent refreshes are one call.
 """
 import argparse
 import os
 import sys
 import psycopg2
 # Database that main() connects to; host, port, user and password come from
 # the PGWEB_* environment variables instead.
 DB_NAME = "data_centers"
 # Schema-qualified table names interpolated into the SQL templates below.
 MASTER_TABLE = "public.master_data_centers"
 CURATED_TABLE = "public.us_dc_sample_geocoded"
 OSM_TABLE = "public.osm_data_centers"
 # Default spatial match radius in meters: used both as the --radius-m default
 # and as the default argument of the SQL refresh function.
 MATCH_RADIUS_M = 150
 # DDL for the merged output table and its lookup indexes. Every statement uses
 # "if not exists", so running it against an already-built table changes nothing.
 # geom is a stored generated column computed from longitude/latitude (SRID 4326),
 # which is why the inserts in the refresh function never write it directly.
 CREATE_TABLE_SQL = f"""
 create table if not exists {MASTER_TABLE} (
    master_id              text primary key,
    source                 text not null check (source in ('curated','osm','merged')),
    curated_id             text,
    osm_id                 text,
    name                   text,
    operator               text,
    street_address         text,
    city                   text,
    state                  text,
    postal_code            text,
    country                text,
    website                text,
    phone                  text,
    power_mw               numeric,
    area_sqft              integer,
    nearest_airport_miles  numeric,
    has_bare_metal         boolean,
    has_iaas               boolean,
    has_internet_exchange  boolean,
    has_colocation         boolean,
    certifications         text,
    content_summary        text,
    osm_tags               jsonb,
    matched_osm_tag_passes text[],
    match_method           text,
    match_distance_m       numeric,
    longitude              double precision not null,
    latitude               double precision not null,
    geom                   geometry(Point, 4326)
        generated always as (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
 );
 create index if not exists master_data_centers_geom_gix on {MASTER_TABLE} using gist (geom);
 create index if not exists master_data_centers_source_idx on {MASTER_TABLE} (source);
 create index if not exists master_data_centers_state_idx on {MASTER_TABLE} (state);
 create index if not exists master_data_centers_curated_idx on {MASTER_TABLE} (curated_id);
 create index if not exists master_data_centers_osm_idx on {MASTER_TABLE} (osm_id);
 """
 # Definition of public.refresh_master_data_centers(match_radius_m). The function
 # truncates the master table, picks one best OSM match per curated row into a
 # temp table, inserts every curated row (NULLs filled from its match), inserts
 # the OSM rows nobody claimed, and returns per-source row counts.
 # The temp table is ON COMMIT DROP, so it survives until the end of the calling
 # transaction; it is dropped up front so the function can be called more than
 # once inside a single transaction without "relation already exists".
 REFRESH_FUNCTION_SQL = f"""
 create or replace function public.refresh_master_data_centers(match_radius_m double precision default {MATCH_RADIUS_M})
 returns table (curated_rows bigint, merged_rows bigint, osm_only_rows bigint, total_rows bigint)
 language plpgsql
 as $$
 begin
    truncate table {MASTER_TABLE};
    -- a previous call in the same transaction leaves its ON COMMIT DROP temp
    -- table behind; remove it so the create below cannot collide
    drop table if exists _curated_to_osm;
    -- pick a single best OSM match for each curated row, prioritizing ID
    -- equality, then nominatim id, then closest within radius
    create temporary table _curated_to_osm on commit drop as
    with ranked as (
        select
            c.id                                  as curated_id,
            o.id                                  as osm_id,
            case
                when c.id = o.osm_id::text       then 'id'
                when c.nominatim_osm_id = o.osm_id then 'nominatim_id'
                else 'spatial'
            end                                  as method,
            ST_DistanceSphere(c.geom, o.geom)    as dist_m,
            row_number() over (
                partition by c.id
                order by
                    case
                        when c.id = o.osm_id::text       then 0
                        when c.nominatim_osm_id = o.osm_id then 1
                        else 2
                    end,
                    ST_DistanceSphere(c.geom, o.geom) asc
            )                                    as rn
        from {CURATED_TABLE} c
        join {OSM_TABLE} o
          on c.id = o.osm_id::text
          or c.nominatim_osm_id = o.osm_id
          or ST_DWithin(c.geom::geography, o.geom::geography, match_radius_m)
    )
    select curated_id, osm_id, method, dist_m
    from ranked
    where rn = 1;
    -- Step 1+2: insert curated rows (with OSM nulls filled where matched)
    insert into {MASTER_TABLE} (
        master_id, source, curated_id, osm_id,
        name, operator, street_address, city, state, postal_code, country,
        website, phone, power_mw, area_sqft, nearest_airport_miles,
        has_bare_metal, has_iaas, has_internet_exchange, has_colocation,
        certifications, content_summary,
        osm_tags, matched_osm_tag_passes,
        match_method, match_distance_m,
        longitude, latitude
    )
    select
        'curated/' || c.id,
        case when m.osm_id is not null then 'merged' else 'curated' end,
        c.id,
        m.osm_id,
        coalesce(c.facility_name, o.name),
        coalesce(c.provider, o.operator),
        coalesce(c.street_address, o.street_address),
        coalesce(c.city, o.city),
        coalesce(c.state_code, o.state),
        coalesce(c.postal_code, o.postal_code),
        coalesce(c.country, o.country),
        coalesce(c.url, o.website),
        coalesce(c.phone, o.phone),
        c.power_mw,
        c.area_sqft,
        c.nearest_airport_miles,
        c.has_bare_metal,
        c.has_iaas,
        c.has_internet_exchange,
        c.has_colocation,
        c.certifications,
        c.content_summary,
        o.tags,
        o.matched_tags,
        m.method,
        round(m.dist_m::numeric, 2),
        c.longitude,
        c.latitude
    from {CURATED_TABLE} c
    left join _curated_to_osm m on m.curated_id = c.id
    left join {OSM_TABLE} o on o.id = m.osm_id;
    -- Step 3: insert OSM rows that no curated row claimed
    insert into {MASTER_TABLE} (
        master_id, source, curated_id, osm_id,
        name, operator, street_address, city, state, postal_code, country,
        website, phone,
        osm_tags, matched_osm_tag_passes,
        longitude, latitude
    )
    select
        'osm/' || o.id,
        'osm',
        null,
        o.id,
        o.name,
        o.operator,
        o.street_address,
        o.city,
        o.state,
        o.postal_code,
        o.country,
        o.website,
        o.phone,
        o.tags,
        o.matched_tags,
        o.longitude,
        o.latitude
    from {OSM_TABLE} o
    where not exists (
        select 1 from _curated_to_osm m where m.osm_id = o.id
    );
    analyze {MASTER_TABLE};
    return query
    select
        count(*) filter (where source = 'curated'),
        count(*) filter (where source = 'merged'),
        count(*) filter (where source = 'osm'),
        count(*)
    from {MASTER_TABLE};
 end;
 $$;
 """
 def parse_args() -> argparse.Namespace:
    """Parse the command line: ``--radius-m`` (float) and ``--recreate`` (flag)."""
    radius_help = f"Spatial match radius in meters (default: {MATCH_RADIUS_M})."
    recreate_help = f"Drop and recreate {MASTER_TABLE} before building."
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--radius-m", type=float, default=MATCH_RADIUS_M, help=radius_help)
    cli.add_argument("--recreate", action="store_true", help=recreate_help)
    return cli.parse_args()
 def main() -> int:
    """Build the master table, install the refresh function, run it, and report.

    Connection settings are read from PGWEB_HOST, PGWEB_PORT, PGWEB_USER and
    PGWEB_PASSWORD; a missing variable raises KeyError. Returns 0 on success.
    """
    options = parse_args()
    connection = psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname=DB_NAME,
    )
    try:
        with connection, connection.cursor() as cursor:
            # schema setup runs in order: extension, optional drop, table, function
            setup_statements = ["create extension if not exists postgis"]
            if options.recreate:
                setup_statements.append(f"drop table if exists {MASTER_TABLE} cascade")
            setup_statements.append(CREATE_TABLE_SQL)
            setup_statements.append(REFRESH_FUNCTION_SQL)
            for statement in setup_statements:
                cursor.execute(statement)
            cursor.execute(
                "select * from public.refresh_master_data_centers(%s)",
                (options.radius_m,),
            )
            curated, merged, osm_only, total = cursor.fetchone()
    finally:
        connection.close()
    report = (
        f"master_data_centers refreshed (radius={options.radius_m} m):",
        f"  curated-only rows: {curated}",
        f"  merged rows (curated + OSM):  {merged}",
        f"  osm-only rows:     {osm_only}",
        f"  total:             {total}",
    )
    for line in report:
        print(line)
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/census_tract_acs_2024_selected_states.csv
+++ b/census_tract_acs_2024_selected_states.csv
--- a/create_data_center_census_tract_table.py
+++ b/create_data_center_census_tract_table.py
@@ -14,7 +14,8 @@ from psycopg2.extras import execute_values
 DB_NAME = "data_centers"
-POINT_TABLE = "public.us_dc_sample_geocoded"
+POINT_TABLE = "public.master_data_centers"
 POINT_ID_COL = "master_id"
 BOUNDARY_STAGE_TABLE = "public._dc_census_tract_boundaries_2024"
 ACS_STAGE_TABLE = "public._dc_census_tract_acs_2024"
 FINAL_TABLE = "public.data_center_census_tracts_2024"
@@ -27,6 +28,25 @@ TRACT_ZIP_URL = (
 )
 ACS_AUDIT_CSV = Path("census_tract_acs_2024_selected_states.csv")
 # Full state / territory names mapped to their two-letter codes.
 # normalize_state() falls back to this table for values that are not already
 # a STATE_FIPS key. The Virgin Islands are listed under three spellings that
 # all map to "VI".
 STATE_NAME_TO_CODE = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "District of Columbia": "DC", "Florida": "FL", "Georgia": "GA", "Hawaii": "HI",
    "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
    "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME",
    "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN",
    "Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE",
    "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM",
    "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH",
    "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI",
    "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX",
    "Utah": "UT", "Vermont": "VT", "Virginia": "VA", "Washington": "WA",
    "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
    "American Samoa": "AS", "Guam": "GU", "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR", "United States Virgin Islands": "VI",
    "U.S. Virgin Islands": "VI", "Virgin Islands": "VI",
 }
 STATE_FIPS = {
    "AL": "01",
    "AK": "02",
@@ -198,16 +218,45 @@ def connect():
    )
 def normalize_state(value):
    """Return the two-letter code for a raw state value, or None if unrecognized.

    Accepts either a code that is a STATE_FIPS key or a full name found in
    STATE_NAME_TO_CODE. Surrounding whitespace is ignored and matching is
    case-insensitive, so values such as " CA", "ca" or "texas" resolve
    instead of being reported as unknown. None, empty and whitespace-only
    inputs return None.
    """
    if value is None:
        return None
    text = value.strip()
    if not text:
        return None
    if text in STATE_FIPS:
        return text
    # presumably every STATE_FIPS key is upper-case (the visible ones are)
    upper = text.upper()
    if upper in STATE_FIPS:
        return upper
    exact = STATE_NAME_TO_CODE.get(text)
    if exact is not None:
        return exact
    folded = text.casefold()
    for name, code in STATE_NAME_TO_CODE.items():
        if name.casefold() == folded:
            return code
    return None
 def get_state_fips(conn):
    with conn.cursor() as cur:
        cur.execute(
-            f"select distinct state_code from {POINT_TABLE} order by state_code"
+            f"select state, count(*) from {POINT_TABLE} group by state order by state nulls last"
        )
-        state_codes = [row[0] for row in cur.fetchall()]
+        rows = cur.fetchall()
-    missing = [code for code in state_codes if code not in STATE_FIPS]
+    normalized_counts = {}
-    if missing:
+    null_state_count = 0
-        raise RuntimeError(f"Missing state FIPS mappings for: {', '.join(missing)}")
+    unknown = []
-    return [STATE_FIPS[code] for code in state_codes]
+    for raw, count in rows:
        if raw is None:
            null_state_count += count
            continue
        code = normalize_state(raw)
        if code is None:
            unknown.append((raw, count))
            continue
        normalized_counts[code] = normalized_counts.get(code, 0) + count
    if unknown:
        details = ", ".join(f"{repr(name)}({n})" for name, n in unknown)
        raise RuntimeError(f"Unrecognized state values in {POINT_TABLE}: {details}")
    if null_state_count:
        print(
            f"warning: {null_state_count} master_data_centers rows have NULL state; "
            f"importing tract boundaries for all 50 states + DC + PR so spatial join can resolve them."
        )
        # Census ACS 5-year DP profile lacks coverage for the small island territories;
        # restrict to the 50 states + DC + PR which the ACS profile reliably serves.
        allowed = {"AS", "GU", "MP", "VI"}
        return sorted({fips for code, fips in STATE_FIPS.items() if code not in allowed})
    return sorted({STATE_FIPS[code] for code in normalized_counts})
 def ensure_final_table_absent(conn):
@@ -290,8 +339,20 @@ def fetch_acs_for_state(state_fips):
        f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5/profile?"
        + urllib.parse.urlencode(params)
    )
-    with urllib.request.urlopen(url, timeout=120) as response:
+    try:
-        data = json.loads(response.read().decode("utf-8"))
+        with urllib.request.urlopen(url, timeout=120) as response:
            body = response.read().decode("utf-8")
    except urllib.error.HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(
            f"Census ACS request failed for state {state_fips}: HTTP {exc.code} — {body[:300]}"
        ) from exc
    try:
        data = json.loads(body)
    except json.JSONDecodeError as exc:
        raise RuntimeError(
            f"Census ACS returned non-JSON for state {state_fips}: {body[:300]}"
        ) from exc
    header = data[0]
    rows = []
@@ -444,12 +505,15 @@ def create_final_table(conn):
                    select
                        t.geoid,
                        count(*)::integer as data_center_count,
-                        count(*) filter (where dc.geocode_precision = 'address_range')::integer
+                        count(*) filter (where dc.source = 'curated')::integer
-                            as address_range_data_center_count,
+                            as curated_only_data_center_count,
-                        count(*) filter (where dc.geocode_precision = 'city')::integer
+                        count(*) filter (where dc.source = 'merged')::integer
-                            as city_precision_data_center_count,
+                            as merged_data_center_count,
-                        array_agg(dc.id order by dc.id) as data_center_ids,
+                        count(*) filter (where dc.source = 'osm')::integer
-                        array_agg(distinct dc.provider order by dc.provider) as providers
+                            as osm_only_data_center_count,
                        array_agg(dc.{POINT_ID_COL} order by dc.{POINT_ID_COL}) as data_center_ids,
                        array_agg(distinct dc.operator) filter (where dc.operator is not null)
                            as operators
                    from {BOUNDARY_STAGE_TABLE} t
                    join {POINT_TABLE} dc
                        on t.geom && dc.geom
@@ -469,10 +533,11 @@ def create_final_table(conn):
                    '{ACS_SOURCE}'::text as acs_source,
                    a.acs_name,
                    d.data_center_count,
-                    d.address_range_data_center_count,
+                    d.curated_only_data_center_count,
-                    d.city_precision_data_center_count,
+                    d.merged_data_center_count,
                    d.osm_only_data_center_count,
                    d.data_center_ids,
-                    d.providers,
+                    d.operators,
                    a.population,
                    a.median_age,
                    a.households,
@@ -532,7 +597,7 @@ def create_final_table(conn):
            cur.execute(
                f"""
                comment on table {FINAL_TABLE} is
-                'Census tracts containing records from public.us_dc_sample_geocoded, enriched with ACS 2024 5-year profile demographics and derived primary industry fields.'
+                'Census tracts containing records from public.master_data_centers (curated + OSM merged), enriched with ACS 2024 5-year profile demographics and derived primary industry fields.'
                """
            )
            cur.execute(f"analyze {FINAL_TABLE}")
@@ -550,7 +615,7 @@ def assign_point_geoids(conn):
                set geoid = matched.geoid
                from (
                    select
-                        dc_inner.id,
+                        dc_inner.{POINT_ID_COL} as point_id,
                        (
                            select t.geoid
                            from {BOUNDARY_STAGE_TABLE} t
@@ -561,11 +626,11 @@ def assign_point_geoids(conn):
                        ) as geoid
                    from {POINT_TABLE} dc_inner
                ) matched
-                where dc.id = matched.id
+                where dc.{POINT_ID_COL} = matched.point_id
                """
            )
            cur.execute(
-                f"create index if not exists us_dc_sample_geocoded_geoid_idx on {POINT_TABLE} (geoid)"
+                f"create index if not exists master_data_centers_geoid_idx on {POINT_TABLE} (geoid)"
            )
            cur.execute(f"analyze {POINT_TABLE}")
@@ -586,13 +651,21 @@ def validate(conn):
        total_points = cur.fetchone()[0]
        cur.execute(
            f"""
-            select geocode_precision, count(*)::integer
+            select source, count(*)::integer
            from {POINT_TABLE}
-            group by geocode_precision
+            group by source
-            order by geocode_precision
+            order by source
            """
        )
-        point_precision = cur.fetchall()
+        point_source_breakdown = cur.fetchall()
        cur.execute(
            f"""
            select count(*)::integer
            from {POINT_TABLE}
            where geoid is null
            """
        )
        unassigned_points = cur.fetchone()[0]
        cur.execute(
            f"""
            select count(*)::integer
@@ -601,7 +674,7 @@ def validate(conn):
            """
        )
        missing_acs = cur.fetchone()[0]
-    return summary, total_points, point_precision, missing_acs
+    return summary, total_points, point_source_breakdown, unassigned_points, missing_acs
 def main():
@@ -638,7 +711,7 @@ def main():
        load_acs_stage(conn, acs_rows, acs_fieldnames)
        create_final_table(conn)
        assign_point_geoids(conn)
-        summary, total_points, point_precision, missing_acs = validate(conn)
+        summary, total_points, point_source_breakdown, unassigned_points, missing_acs = validate(conn)
    finally:
        conn.close()
@@ -649,7 +722,8 @@ def main():
            summary[0], summary[1], summary[2], total_points
        )
    )
-    print("point_precision=" + ", ".join(f"{k}:{v}" for k, v in point_precision))
+    print("point_source=" + ", ".join(f"{k}:{v}" for k, v in point_source_breakdown))
    print(f"points_unassigned_to_tract={unassigned_points}")
    print(f"tracts_missing_acs_population={missing_acs}")
--- a/data_center_map.html
+++ b/data_center_map.html
--- a/data_centers_cables_map.html
+++ b/data_centers_cables_map.html
--- a/load_postgis_osm_data_centers.py
+++ b/load_postgis_osm_data_centers.py
@@ -0,0 +1,376 @@
 #!/usr/bin/env python3
 """
 Fetch US data centers from OpenStreetMap (Overpass API) and load them into
 public.osm_data_centers in the data_centers database. Also (re)creates a
 unioned view public.data_centers_union combining OSM + curated rows from
 public.us_dc_sample_geocoded.
 Two Overpass passes are made because tagging is inconsistent:
  1) telecom=data_center
  2) building=data_center
 Results are deduplicated by (osm_type, osm_id); the matched tag-pass is recorded
 in matched_tags so we can see which query found each feature.
 """
 import argparse
 import json
 import os
 import sys
 import time
 from typing import Dict, List, Optional, Tuple
 import psycopg2
 import requests
 from psycopg2.extras import Json, execute_values
 # Overpass endpoint that fetch_pass() POSTs each query to.
 OVERPASS_URL = "https://overpass-api.de/api/interpreter"
 # Destination table, the unioned view built on top of it, and the curated
 # table that the view also reads from.
 TABLE = "public.osm_data_centers"
 VIEW = "public.data_centers_union"
 CURATED_TABLE = "public.us_dc_sample_geocoded"
 DB_NAME = "data_centers"
 # Tag passes: (key, value) pairs; main() issues one Overpass query per pair
 # and records "key=value" in matched_tags for every element that pass returns.
 TAG_PASSES = [
    ("telecom", "data_center"),
    ("building", "data_center"),
 ]
 def overpass_query(tag_key: str, tag_value: str, timeout: int = 180) -> str:
    """Return Overpass QL selecting US nodes, ways and relations tagged key=value.

    ``timeout`` is written into the query's ``[timeout:...]`` setting. The
    result asks for tags plus a center point (``out center tags``).
    """
    tag_filter = f'["{tag_key}"="{tag_value}"](area.us)'
    query_text = f"""
 [out:json][timeout:{timeout}];
 area["ISO3166-1"="US"][admin_level=2]->.us;
 (
  node{tag_filter};
  way{tag_filter};
  relation{tag_filter};
 );
 out center tags;
 """
    return query_text.strip()
 def fetch_pass(tag_key: str, tag_value: str, cache_path: Optional[str]) -> List[dict]:
    """Return the Overpass elements for one tag pass, using the cache if present.

    When ``cache_path`` names an existing file its JSON is used and no request
    is made. Otherwise the query is POSTed to OVERPASS_URL; a non-200 status
    is printed and then raised by ``raise_for_status()``.

    Overpass can answer HTTP 200 with a ``remark`` field describing a runtime
    error (for example a timeout), in which case the element list may be
    incomplete. Such a payload is reported and is NOT written to the cache,
    so a truncated result is not silently reused by later runs.

    Returns the payload's ``elements`` list (empty if the key is absent).
    """
    use_cache = bool(cache_path) and os.path.exists(cache_path)
    if use_cache:
        print(f"  using cached response: {cache_path}")
        with open(cache_path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
    else:
        query = overpass_query(tag_key, tag_value)
        print(f"  querying Overpass for {tag_key}={tag_value} ...")
        headers = {
            "User-Agent": "us-data-centers-inventory/1.0 (research; contact david@dadams.io)",
            "Accept": "application/json",
        }
        resp = requests.post(
            OVERPASS_URL,
            data={"data": query},
            headers=headers,
            timeout=240,
        )
        if resp.status_code != 200:
            print(f"  Overpass returned {resp.status_code}: {resp.text[:500]}")
        resp.raise_for_status()
        payload = resp.json()
    remark = payload.get("remark")
    if remark:
        print(f"  warning: Overpass remark (results may be incomplete): {remark}")
    if cache_path and not use_cache:
        if remark:
            # caching a truncated response would hide the problem on every rerun
            print(f"  not caching incomplete response to {cache_path}")
        else:
            with open(cache_path, "w", encoding="utf-8") as fh:
                json.dump(payload, fh)
            print(f"  cached to {cache_path}")
    elements = payload.get("elements", [])
    print(f"  pass returned {len(elements)} elements")
    return elements
 def element_coords(elem: dict) -> Tuple[Optional[float], Optional[float]]:
    """Return ``(lon, lat)`` for an Overpass element; either may be None.

    Nodes carry their coordinates directly; any other element type is read
    from its ``center`` object, treating a missing or null center as empty.
    """
    is_node = elem.get("type") == "node"
    holder = elem if is_node else (elem.get("center") or {})
    return holder.get("lon"), holder.get("lat")
 def normalize_element(elem: dict, matched_tag: str) -> Optional[dict]:
    """Flatten one Overpass element into a row dict keyed by column name.

    Returns None when the element has no longitude/latitude or lacks a type
    or id. ``matched_tag`` becomes the only entry of ``matched_tags``; the raw
    tag dict is kept under ``tags``. Country falls back to "US".
    """
    lon, lat = element_coords(elem)
    if lon is None or lat is None:
        return None
    osm_type, osm_id = elem.get("type"), elem.get("id")
    if osm_type is None or osm_id is None:
        return None
    tags = elem.get("tags") or {}
    record = {
        "id": f"{osm_type}/{osm_id}",
        "osm_type": osm_type,
        "osm_id": int(osm_id),
    }
    # columns copied straight from a single OSM tag: (column, tag key)
    direct_columns = (
        ("name", "name"),
        ("operator", "operator"),
        ("operator_type", "operator:type"),
        ("telecom", "telecom"),
        ("building", "building"),
        ("power", "power"),
    )
    for column, tag_name in direct_columns:
        record[column] = tags.get(tag_name)
    # plain tag first, contact:* variant as fallback
    record["website"] = tags.get("website") or tags.get("contact:website")
    record["phone"] = tags.get("phone") or tags.get("contact:phone")
    house_and_street = (tags.get("addr:housenumber"), tags.get("addr:street"))
    record["street_address"] = " ".join(filter(None, house_and_street)) or None
    record["city"] = tags.get("addr:city")
    record["state"] = tags.get("addr:state")
    record["postal_code"] = tags.get("addr:postcode")
    record["country"] = tags.get("addr:country") or "US"
    record["matched_tags"] = [matched_tag]
    record["tags"] = tags
    record["longitude"] = float(lon)
    record["latitude"] = float(lat)
    return record
 def merge_records(existing: Dict[str, dict], new_rows: List[dict]) -> None:
    """Fold ``new_rows`` into ``existing`` (keyed by row id), mutating it in place.

    Unseen ids are stored as-is. For an id already present, ``matched_tags``
    becomes the order-preserving union of both lists, and every other field
    that is None or "" on the stored row is filled from the new row when the
    new value is neither None nor "". Non-blank stored values are never
    overwritten.
    """
    blank = (None, "")
    for incoming in new_rows:
        row_id = incoming["id"]
        stored = existing.get(row_id)
        if stored is None:
            existing[row_id] = incoming
            continue
        # dict.fromkeys de-duplicates while keeping first-seen order
        combined = stored["matched_tags"] + incoming["matched_tags"]
        stored["matched_tags"] = list(dict.fromkeys(combined))
        for field, value in incoming.items():
            if field == "matched_tags" or value in blank:
                continue
            if stored.get(field) in blank:
                stored[field] = value
 # Column list for inserts into the OSM table, in insert order.
 # insert_values() joins these names into the INSERT statement and pairs them
 # with the tuples from row_to_tuple(), so the two must stay in the same order.
 COLUMNS = [
    "id",
    "osm_type",
    "osm_id",
    "name",
    "operator",
    "operator_type",
    "telecom",
    "building",
    "power",
    "website",
    "phone",
    "street_address",
    "city",
    "state",
    "postal_code",
    "country",
    "matched_tags",
    "tags",
    "longitude",
    "latitude",
 ]
 def row_to_tuple(row: dict) -> tuple:
    """Convert a normalized row dict into a value tuple ordered like COLUMNS.

    ``id``, ``osm_type``, ``osm_id``, ``longitude`` and ``latitude`` are
    required (KeyError if absent); the remaining fields default to None,
    ``matched_tags`` to an empty list, and ``tags`` is wrapped in ``Json``.
    """
    nullable_fields = (
        "name",
        "operator",
        "operator_type",
        "telecom",
        "building",
        "power",
        "website",
        "phone",
        "street_address",
        "city",
        "state",
        "postal_code",
        "country",
    )
    identity = (row["id"], row["osm_type"], row["osm_id"])
    attributes = tuple(row.get(field) for field in nullable_fields)
    trailer = (
        row.get("matched_tags", []),
        Json(row.get("tags", {})),
        row["longitude"],
        row["latitude"],
    )
    return identity + attributes + trailer
 def create_table(cur) -> None:
    """Create the OSM table and its three indexes using cursor ``cur``.

    The statements are plain ``create`` (no ``if not exists``), so the caller
    is expected to invoke this only when the table is absent.
    """
    table_ddl = f"""
        create table {TABLE} (
            id text primary key,
            osm_type text not null,
            osm_id bigint not null,
            name text,
            operator text,
            operator_type text,
            telecom text,
            building text,
            power text,
            website text,
            phone text,
            street_address text,
            city text,
            state text,
            postal_code text,
            country text,
            matched_tags text[] not null default '{{}}',
            tags jsonb not null default '{{}}'::jsonb,
            longitude double precision not null,
            latitude double precision not null,
            ingested_at timestamptz not null default now(),
            geom geometry(Point, 4326) generated always as
                (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
        )
        """
    cur.execute(table_ddl)
    # spatial (gist), state (btree) and tag (gin) indexes, created in that order
    index_statements = (
        f"create index osm_data_centers_geom_gix on {TABLE} using gist (geom)",
        f"create index osm_data_centers_state_idx on {TABLE} (state)",
        f"create index osm_data_centers_tags_gin on {TABLE} using gin (tags)",
    )
    for statement in index_statements:
        cur.execute(statement)
 def insert_values(cur, rows: List[dict], upsert: bool) -> None:
    """Bulk-insert ``rows`` into the OSM table in pages of 200.

    With ``upsert`` true, a conflict on ``id`` overwrites every other column
    with the incoming value and resets ``ingested_at`` to now(); otherwise the
    statement is a plain insert.
    """
    column_list = ", ".join(COLUMNS)
    statement = f"insert into {TABLE} ({column_list}) values %s"
    if upsert:
        set_clause = ", ".join(
            f"{name} = excluded.{name}" for name in COLUMNS if name != "id"
        )
        statement = (
            f"{statement} on conflict (id) do update set {set_clause}, "
            "ingested_at = now()"
        )
    values = [row_to_tuple(item) for item in rows]
    execute_values(cur, statement, values, page_size=200)
 def create_or_replace_view(cur) -> None:
    """(Re)create the unioned view over the curated and OSM tables.

    Curated rows get an id prefixed with ``curated/`` and source 'curated',
    with their columns renamed to the OSM table's names; OSM rows pass
    through with source 'osm'. The two halves are combined with UNION ALL,
    so no de-duplication happens here.
    """
    view_sql = f"""
        create or replace view {VIEW} as
        select
            'curated/' || id as id,
            'curated'::text as source,
            facility_name as name,
            provider as operator,
            street_address,
            city,
            state_code as state,
            postal_code,
            country,
            url as website,
            phone,
            longitude,
            latitude,
            geom
        from {CURATED_TABLE}
        union all
        select
            id,
            'osm'::text as source,
            name,
            operator,
            street_address,
            city,
            state,
            postal_code,
            country,
            website,
            phone,
            longitude,
            latitude,
            geom
        from {TABLE}
        """
    cur.execute(view_sql)
 def parse_args() -> argparse.Namespace:
    """Parse the loader's command-line options.

    ``--upsert`` stays accepted and is the default. Because it is a
    ``store_true`` flag that already defaults to True, it could never be
    switched off on its own; ``--no-upsert`` writes False to the same
    destination so a plain insert can actually be requested.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--cache-dir",
        default="output",
        help="Directory to cache raw Overpass responses (default: output/).",
    )
    parser.add_argument(
        "--no-cache",
        action="store_true",
        help="Do not read or write Overpass cache files; always hit the API.",
    )
    parser.add_argument(
        "--recreate",
        action="store_true",
        help=f"Drop and recreate {TABLE} before loading.",
    )
    parser.add_argument(
        "--upsert",
        dest="upsert",
        action="store_true",
        default=True,
        help="On id conflicts, update the existing row (default: on).",
    )
    parser.add_argument(
        "--no-upsert",
        dest="upsert",
        action="store_false",
        help="Use a plain insert; an id conflict raises instead of updating.",
    )
    parser.add_argument(
        "--skip-view",
        action="store_true",
        help=f"Do not create/replace the unioned view {VIEW}.",
    )
    return parser.parse_args()
 def main() -> int:
    """Fetch every tag pass, merge the results, and load them into Postgres.

    Steps: run one Overpass query (or cache read) per entry in TAG_PASSES,
    normalize and de-duplicate the elements, then in a single transaction
    create the table if needed, insert/upsert the rows, and (unless
    ``--skip-view``) rebuild the unioned view when the curated table exists.

    The cache directory is only created when caching is enabled, and the
    politeness pause is taken between passes only, not after the last one.

    Returns 0 on success, 1 when no rows were fetched.
    """
    args = parse_args()
    if not args.no_cache:
        # --no-cache promises not to touch cache files, so skip the directory too
        os.makedirs(args.cache_dir, exist_ok=True)
    merged: Dict[str, dict] = {}
    last_index = len(TAG_PASSES) - 1
    for index, (tag_key, tag_value) in enumerate(TAG_PASSES):
        cache_path = (
            None
            if args.no_cache
            else os.path.join(args.cache_dir, f"overpass_{tag_key}_{tag_value}.json")
        )
        print(f"Pass: {tag_key}={tag_value}")
        elements = fetch_pass(tag_key, tag_value, cache_path)
        normalized = [
            row for row in (normalize_element(e, f"{tag_key}={tag_value}") for e in elements)
            if row is not None
        ]
        print(f"  normalized {len(normalized)} rows with coords")
        merge_records(merged, normalized)
        if index < last_index:
            # be polite to Overpass between passes
            time.sleep(2)
    rows = list(merged.values())
    print(f"Total deduped OSM data-center features: {len(rows)}")
    if not rows:
        print("No rows fetched; aborting DB load.", file=sys.stderr)
        return 1
    conn = psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname=DB_NAME,
    )
    try:
        with conn:
            with conn.cursor() as cur:
                cur.execute("create extension if not exists postgis")
                if args.recreate:
                    cur.execute(f"drop table if exists {TABLE} cascade")
                # to_regclass returns NULL when the relation does not exist
                cur.execute("select to_regclass(%s)", (TABLE,))
                if cur.fetchone()[0] is None:
                    create_table(cur)
                insert_values(cur, rows, upsert=args.upsert)
                cur.execute(f"analyze {TABLE}")
                if not args.skip_view:
                    cur.execute("select to_regclass(%s)", (CURATED_TABLE,))
                    if cur.fetchone()[0] is not None:
                        create_or_replace_view(cur)
                        print(f"View {VIEW} (re)created.")
                    else:
                        print(
                            f"Skipping view: {CURATED_TABLE} does not exist.",
                            file=sys.stderr,
                        )
                cur.execute(f"select count(*) from {TABLE}")
                total = cur.fetchone()[0]
    finally:
        conn.close()
    print(f"Loaded {len(rows)} rows into {TABLE}; table now has {total} rows total.")
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/make_data_center_map.py
+++ b/make_data_center_map.py
@@ -8,7 +8,7 @@ import psycopg2
 DB_NAME = "data_centers"
-POINT_TABLE = "public.us_dc_sample_geocoded"
+POINT_TABLE = "public.master_data_centers"
 def connect():
@@ -26,15 +26,17 @@ def load_points(conn):
        cur.execute(
            f"""
            select
-                id,
+                master_id,
-                coalesce(provider, '') as provider,
+                source,
-                coalesce(facility_name, '') as facility_name,
+                coalesce(operator, '') as operator,
                coalesce(name, '') as name,
                coalesce(city, '') as city,
-                coalesce(state_code, '') as state_code,
+                coalesce(state, '') as state,
                longitude,
                latitude,
-                coalesce(geocode_source, '') as geocode_source,
+                coalesce(curated_id, '') as curated_id,
-                coalesce(geocode_precision, '') as geocode_precision,
+                coalesce(osm_id, '') as osm_id,
                coalesce(match_method, '') as match_method,
                coalesce(geoid, '') as geoid
            from {POINT_TABLE}
            where longitude is not null and latitude is not null
@@ -47,15 +49,17 @@ def load_points(conn):
        points.append(
            {
                "id": row[0],
-                "provider": row[1],
+                "source": row[1],
-                "facility_name": row[2],
+                "operator": row[2],
-                "city": row[3],
+                "name": row[3],
-                "state_code": row[4],
+                "city": row[4],
-                "lon": float(row[5]),
+                "state": row[5],
-                "lat": float(row[6]),
+                "lon": float(row[6]),
-                "geocode_source": row[7],
+                "lat": float(row[7]),
-                "geocode_precision": row[8],
+                "curated_id": row[8],
-                "geoid": row[9],
+                "osm_id": row[9],
                "match_method": row[10],
                "geoid": row[11],
            }
        )
    return points
@@ -70,12 +74,12 @@ def compute_center(points):
 def build_stats(points):
-    by_source = Counter(p["geocode_source"] or "(blank)" for p in points)
+    by_source = Counter(p["source"] or "(blank)" for p in points)
-    by_precision = Counter(p["geocode_precision"] or "(blank)" for p in points)
+    by_match = Counter(p["match_method"] or "(none)" for p in points)
    return {
        "total": len(points),
        "by_source": dict(sorted(by_source.items(), key=lambda x: x[0])),
-        "by_precision": dict(sorted(by_precision.items(), key=lambda x: x[0])),
+        "by_match_method": dict(sorted(by_match.items(), key=lambda x: x[0])),
    }
@@ -89,7 +93,7 @@ def render_html(points, center_lat, center_lon, output_path):
 <head>
  <meta charset=\"utf-8\" />
  <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
-  <title>US Data Centers Map</title>
+  <title>US Data Centers Master Map</title>
  <link rel=\"stylesheet\" href=\"https://unpkg.com/leaflet@1.9.4/dist/leaflet.css\" />
  <style>
    html, body {{ height: 100%; margin: 0; font-family: system-ui, -apple-system, Segoe UI, sans-serif; }}
@@ -109,17 +113,17 @@ def render_html(points, center_lat, center_lon, output_path):
 <body>
  <div id=\"layout\">
    <div id=\"panel\">
-      <h1>US Data Centers</h1>
+      <h1>US Data Centers (Master)</h1>
      <div class=\"stat-row\"><span>Total points</span><strong id=\"total\"></strong></div>
-      <h2>Geocode Source</h2>
+      <h2>Source</h2>
      <div id=\"sourceStats\"></div>
-      <h2>Geocode Precision</h2>
+      <h2>Match Method (merged rows)</h2>
-      <div id=\"precisionStats\"></div>
+      <div id=\"matchStats\"></div>
      <h2>Source Colors</h2>
-      <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#1f77b4\"></span>IM3_Existing_DataCenters</span></div>
+      <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#2ca02c\"></span>merged (curated + OSM)</span></div>
-      <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#2ca02c\"></span>US Census Geocoder</span></div>
+      <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#1f77b4\"></span>curated only</span></div>
-      <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#ff7f0e\"></span>Nominatim/OpenStreetMap</span></div>
+      <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#ff7f0e\"></span>osm only</span></div>
-      <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#7f7f7f\"></span>Other/Blank</span></div>
+      <div class=\"stat-row\"><span><span class=\"dot\" style=\"background:#7f7f7f\"></span>other</span></div>
    </div>
    <div id=\"map\"></div>
  </div>
@@ -130,9 +134,9 @@ def render_html(points, center_lat, center_lon, output_path):
    const stats = {stats_json};
    function colorForSource(source) {{
-      if (source === 'IM3_Existing_DataCenters') return '#1f77b4';
+      if (source === 'merged') return '#2ca02c';
-      if (source === 'US Census Geocoder') return '#2ca02c';
+      if (source === 'curated') return '#1f77b4';
-      if (source === 'Nominatim/OpenStreetMap') return '#ff7f0e';
+      if (source === 'osm') return '#ff7f0e';
      return '#7f7f7f';
    }}
@@ -156,22 +160,26 @@ def render_html(points, center_lat, center_lon, output_path):
    for (const p of points) {{
      const marker = L.circleMarker([p.lat, p.lon], {{
        radius: 4,
-        color: colorForSource(p.geocode_source),
+        color: colorForSource(p.source),
-        fillColor: colorForSource(p.geocode_source),
+        fillColor: colorForSource(p.source),
        fillOpacity: 0.7,
        weight: 1
      }});
-      const title = p.facility_name || p.id;
+      const title = p.name || p.id;
-      const provider = p.provider || '(unknown provider)';
+      const operator = p.operator || '(unknown operator)';
-      const cityState = [p.city, p.state_code].filter(Boolean).join(', ');
+      const cityState = [p.city, p.state].filter(Boolean).join(', ');
      const provenance = [
        p.curated_id ? 'curated_id=' + escapeHtml(p.curated_id) : null,
        p.osm_id ? 'osm_id=' + escapeHtml(p.osm_id) : null,
        p.match_method ? 'match=' + escapeHtml(p.match_method) : null,
      ].filter(Boolean).join('<br>');
      marker.bindPopup(`
        <strong>${{escapeHtml(title)}}</strong><br>
-        Provider: ${{escapeHtml(provider)}}<br>
+        Operator: ${{escapeHtml(operator)}}<br>
        ID: ${{escapeHtml(p.id)}}<br>
        Location: ${{escapeHtml(cityState)}}<br>
-        Source: ${{escapeHtml(p.geocode_source)}}<br>
+        Source: ${{escapeHtml(p.source)}}<br>
-        Precision: ${{escapeHtml(p.geocode_precision)}}<br>
+        ${{provenance ? provenance + '<br>' : ''}}
        GEOID: ${{escapeHtml(p.geoid)}}
      `);
@@ -193,12 +201,12 @@ def render_html(points, center_lat, center_lon, output_path):
      sourceStats.appendChild(div);
    }}
-    const precisionStats = document.getElementById('precisionStats');
+    const matchStats = document.getElementById('matchStats');
-    for (const [k, v] of Object.entries(stats.by_precision)) {{
+    for (const [k, v] of Object.entries(stats.by_match_method)) {{
      const div = document.createElement('div');
      div.className = 'stat-row';
      div.innerHTML = `<span>${{escapeHtml(k)}}</span><strong>${{v}}</strong>`;
-      precisionStats.appendChild(div);
+      matchStats.appendChild(div);
    }}
  </script>
 </body>
--- a/make_internet_cables_map.py
+++ b/make_internet_cables_map.py
@@ -10,7 +10,7 @@ import psycopg2
 DB_NAME = "data_centers"
-DC_TABLE = "public.us_dc_sample_geocoded"
+DC_TABLE = "public.master_data_centers"
 CABLES_TABLE = "public.internet_cables"
 CITY_TABLE = "public.internet_city_dominance"
@@ -30,14 +30,14 @@ def load_data_centers(conn):
        cur.execute(
            f"""
            select
-                id,
+                master_id,
-                coalesce(provider, ''),
+                source,
-                coalesce(facility_name, ''),
+                coalesce(operator, ''),
+                coalesce(name, ''),
                 coalesce(city, ''),
-                coalesce(state_code, ''),
+                coalesce(state, ''),
                 longitude,
-                latitude,
+                latitude
-                coalesce(geocode_source, '')
            from {DC_TABLE}
            where longitude is not null and latitude is not null
            """
@@ -45,13 +45,13 @@ def load_data_centers(conn):
        return [
            {
                "id": r[0],
-                "provider": r[1],
+                "source": r[1],
-                "facility_name": r[2],
+                "operator": r[2],
-                "city": r[3],
+                "name": r[3],
-                "state_code": r[4],
+                "city": r[4],
-                "lon": float(r[5]),
+                "state": r[5],
-                "lat": float(r[6]),
+                "lon": float(r[6]),
-                "geocode_source": r[7],
+                "lat": float(r[7]),
            }
            for r in cur.fetchall()
        ]
@@ -181,10 +181,10 @@ def render_html(data_centers, cables_geojson, cities, output_path):
      <label class="toggle"><input type="checkbox" id="tCities" checked> City dominance</label>
      <h2>Data center source</h2>
-      <div class="row"><span><span class="swatch" style="background:#1f77b4"></span>IM3_Existing_DataCenters</span></div>
+      <div class="row"><span><span class="swatch" style="background:#2ca02c"></span>merged (curated + OSM)</span></div>
-      <div class="row"><span><span class="swatch" style="background:#2ca02c"></span>US Census Geocoder</span></div>
+      <div class="row"><span><span class="swatch" style="background:#1f77b4"></span>curated only</span></div>
-      <div class="row"><span><span class="swatch" style="background:#ff7f0e"></span>Nominatim/OpenStreetMap</span></div>
+      <div class="row"><span><span class="swatch" style="background:#ff7f0e"></span>osm only</span></div>
-      <div class="row"><span><span class="swatch" style="background:#7f7f7f"></span>Other</span></div>
+      <div class="row"><span><span class="swatch" style="background:#7f7f7f"></span>other</span></div>
      <h2>City dominance</h2>
      <div class="row"><span><span class="swatch" style="background:#9b59b6;border-radius:50%"></span>Sized by physical Tbps</span></div>
@@ -197,9 +197,9 @@ def render_html(data_centers, cables_geojson, cities, output_path):
    const DATA = __PAYLOAD__;
    function colorForSource(source) {
-      if (source === 'IM3_Existing_DataCenters') return '#1f77b4';
+      if (source === 'merged') return '#2ca02c';
-      if (source === 'US Census Geocoder') return '#2ca02c';
+      if (source === 'curated') return '#1f77b4';
-      if (source === 'Nominatim/OpenStreetMap') return '#ff7f0e';
+      if (source === 'osm') return '#ff7f0e';
      return '#7f7f7f';
    }
@@ -262,19 +262,19 @@ def render_html(data_centers, cables_geojson, cities, output_path):
    for (const p of DATA.data_centers) {
      const m = L.circleMarker([p.lat, p.lon], {
        radius: 3,
-        color: colorForSource(p.geocode_source),
+        color: colorForSource(p.source),
-        fillColor: colorForSource(p.geocode_source),
+        fillColor: colorForSource(p.source),
        fillOpacity: 0.85,
        weight: 0.8,
      });
-      const title = p.facility_name || p.id;
+      const title = p.name || p.id;
-      const provider = p.provider || '(unknown provider)';
+      const operator = p.operator || '(unknown operator)';
-      const cityState = [p.city, p.state_code].filter(Boolean).join(', ');
+      const cityState = [p.city, p.state].filter(Boolean).join(', ');
      m.bindPopup(`
        <strong>${esc(title)}</strong><br>
-        Provider: ${esc(provider)}<br>
+        Operator: ${esc(operator)}<br>
        Location: ${esc(cityState)}<br>
-        Source: ${esc(p.geocode_source)}
+        Source: ${esc(p.source)}
      `);
      dcLayer.addLayer(m);
      dcBounds.push([p.lat, p.lon]);
--- a/output/overpass_building_data_center.json
+++ b/output/overpass_building_data_center.json
--- a/output/overpass_telecom_data_center.json
+++ b/output/overpass_telecom_data_center.json