Reorganize project into scripts/, docs/, data/, output/ directories

Move all Python scripts to scripts/, documentation to docs/, raw input data to data/, and generated HTML/CSV outputs to output/. Update path references in 8 scripts to use Path(__file__).parent.parent as project root so they work correctly from the new location. Update README links and quick-start commands accordingly. Notebooks remain at root. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 21:57:22 -07:00
parent a2e295d95b
commit ee5856661a
40 changed files with 31 additions and 30 deletions
--- a/scripts/load_postgis_osm_data_centers.py
+++ b/scripts/load_postgis_osm_data_centers.py
@@ -0,0 +1,376 @@
+#!/usr/bin/env python3
+"""
+Fetch US data centers from OpenStreetMap (Overpass API) and load them into
+public.osm_data_centers in the data_centers database. Also (re)creates a
+unioned view public.data_centers_union combining OSM + curated rows from
+public.us_dc_sample_geocoded.
+
+Two Overpass passes are made because tagging is inconsistent:
+  1) telecom=data_center
+  2) building=data_center
+
+Results are deduplicated by (osm_type, osm_id); the matched tag-pass is recorded
+in match_tags so we can see which query found each feature.
+"""
+import argparse
+import json
+import os
+import sys
+import time
+from typing import Dict, List, Optional, Tuple
+
+import psycopg2
+import requests
+from psycopg2.extras import Json, execute_values
+
+OVERPASS_URL = "https://overpass-api.de/api/interpreter"
+TABLE = "public.osm_data_centers"
+VIEW = "public.data_centers_union"
+CURATED_TABLE = "public.us_dc_sample_geocoded"
+DB_NAME = "data_centers"
+
+# Tag passes: (key, value)
+TAG_PASSES = [
+    ("telecom", "data_center"),
+    ("building", "data_center"),
+]
+
+
+def overpass_query(tag_key: str, tag_value: str, timeout: int = 180) -> str:
+    return f"""
+[out:json][timeout:{timeout}];
+area["ISO3166-1"="US"][admin_level=2]->.us;
+(
+  node["{tag_key}"="{tag_value}"](area.us);
+  way["{tag_key}"="{tag_value}"](area.us);
+  relation["{tag_key}"="{tag_value}"](area.us);
+);
+out center tags;
+""".strip()
+
+
+def fetch_pass(tag_key: str, tag_value: str, cache_path: Optional[str]) -> List[dict]:
+    if cache_path and os.path.exists(cache_path):
+        print(f"  using cached response: {cache_path}")
+        with open(cache_path, "r", encoding="utf-8") as fh:
+            payload = json.load(fh)
+    else:
+        query = overpass_query(tag_key, tag_value)
+        print(f"  querying Overpass for {tag_key}={tag_value} ...")
+        headers = {
+            "User-Agent": "us-data-centers-inventory/1.0 (research; contact david@dadams.io)",
+            "Accept": "application/json",
+        }
+        resp = requests.post(
+            OVERPASS_URL,
+            data={"data": query},
+            headers=headers,
+            timeout=240,
+        )
+        if resp.status_code != 200:
+            print(f"  Overpass returned {resp.status_code}: {resp.text[:500]}")
+        resp.raise_for_status()
+        payload = resp.json()
+        if cache_path:
+            with open(cache_path, "w", encoding="utf-8") as fh:
+                json.dump(payload, fh)
+            print(f"  cached to {cache_path}")
+    elements = payload.get("elements", [])
+    print(f"  pass returned {len(elements)} elements")
+    return elements
+
+
+def element_coords(elem: dict) -> Tuple[Optional[float], Optional[float]]:
+    if elem.get("type") == "node":
+        return elem.get("lon"), elem.get("lat")
+    center = elem.get("center") or {}
+    return center.get("lon"), center.get("lat")
+
+
+def normalize_element(elem: dict, matched_tag: str) -> Optional[dict]:
+    lon, lat = element_coords(elem)
+    if lon is None or lat is None:
+        return None
+    osm_type = elem.get("type")
+    osm_id = elem.get("id")
+    if osm_type is None or osm_id is None:
+        return None
+    tags = elem.get("tags") or {}
+    return {
+        "id": f"{osm_type}/{osm_id}",
+        "osm_type": osm_type,
+        "osm_id": int(osm_id),
+        "name": tags.get("name"),
+        "operator": tags.get("operator"),
+        "operator_type": tags.get("operator:type"),
+        "telecom": tags.get("telecom"),
+        "building": tags.get("building"),
+        "power": tags.get("power"),
+        "website": tags.get("website") or tags.get("contact:website"),
+        "phone": tags.get("phone") or tags.get("contact:phone"),
+        "street_address": " ".join(
+            part for part in (tags.get("addr:housenumber"), tags.get("addr:street")) if part
+        ) or None,
+        "city": tags.get("addr:city"),
+        "state": tags.get("addr:state"),
+        "postal_code": tags.get("addr:postcode"),
+        "country": tags.get("addr:country") or "US",
+        "matched_tags": [matched_tag],
+        "tags": tags,
+        "longitude": float(lon),
+        "latitude": float(lat),
+    }
+
+
+def merge_records(existing: Dict[str, dict], new_rows: List[dict]) -> None:
+    for row in new_rows:
+        key = row["id"]
+        prior = existing.get(key)
+        if prior is None:
+            existing[key] = row
+            continue
+        # merge matched_tags; keep first non-null values for other fields
+        merged_tags = list(dict.fromkeys(prior["matched_tags"] + row["matched_tags"]))
+        prior["matched_tags"] = merged_tags
+        for col, val in row.items():
+            if col == "matched_tags":
+                continue
+            if prior.get(col) in (None, "") and val not in (None, ""):
+                prior[col] = val
+
+
+COLUMNS = [
+    "id",
+    "osm_type",
+    "osm_id",
+    "name",
+    "operator",
+    "operator_type",
+    "telecom",
+    "building",
+    "power",
+    "website",
+    "phone",
+    "street_address",
+    "city",
+    "state",
+    "postal_code",
+    "country",
+    "matched_tags",
+    "tags",
+    "longitude",
+    "latitude",
+]
+
+
+def row_to_tuple(row: dict) -> tuple:
+    return (
+        row["id"],
+        row["osm_type"],
+        row["osm_id"],
+        row.get("name"),
+        row.get("operator"),
+        row.get("operator_type"),
+        row.get("telecom"),
+        row.get("building"),
+        row.get("power"),
+        row.get("website"),
+        row.get("phone"),
+        row.get("street_address"),
+        row.get("city"),
+        row.get("state"),
+        row.get("postal_code"),
+        row.get("country"),
+        row.get("matched_tags", []),
+        Json(row.get("tags", {})),
+        row["longitude"],
+        row["latitude"],
+    )
+
+
+def create_table(cur) -> None:
+    cur.execute(
+        f"""
+        create table {TABLE} (
+            id text primary key,
+            osm_type text not null,
+            osm_id bigint not null,
+            name text,
+            operator text,
+            operator_type text,
+            telecom text,
+            building text,
+            power text,
+            website text,
+            phone text,
+            street_address text,
+            city text,
+            state text,
+            postal_code text,
+            country text,
+            matched_tags text[] not null default '{{}}',
+            tags jsonb not null default '{{}}'::jsonb,
+            longitude double precision not null,
+            latitude double precision not null,
+            ingested_at timestamptz not null default now(),
+            geom geometry(Point, 4326) generated always as
+                (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)) stored
+        )
+        """
+    )
+    cur.execute(f"create index osm_data_centers_geom_gix on {TABLE} using gist (geom)")
+    cur.execute(f"create index osm_data_centers_state_idx on {TABLE} (state)")
+    cur.execute(f"create index osm_data_centers_tags_gin on {TABLE} using gin (tags)")
+
+
+def insert_values(cur, rows: List[dict], upsert: bool) -> None:
+    sql = f"insert into {TABLE} ({', '.join(COLUMNS)}) values %s"
+    if upsert:
+        update_cols = [c for c in COLUMNS if c != "id"]
+        assignments = ", ".join(f"{c} = excluded.{c}" for c in update_cols)
+        sql += (
+            f" on conflict (id) do update set {assignments}, "
+            f"ingested_at = now()"
+        )
+    execute_values(cur, sql, [row_to_tuple(r) for r in rows], page_size=200)
+
+
+def create_or_replace_view(cur) -> None:
+    cur.execute(
+        f"""
+        create or replace view {VIEW} as
+        select
+            'curated/' || id as id,
+            'curated'::text as source,
+            facility_name as name,
+            provider as operator,
+            street_address,
+            city,
+            state_code as state,
+            postal_code,
+            country,
+            url as website,
+            phone,
+            longitude,
+            latitude,
+            geom
+        from {CURATED_TABLE}
+        union all
+        select
+            id,
+            'osm'::text as source,
+            name,
+            operator,
+            street_address,
+            city,
+            state,
+            postal_code,
+            country,
+            website,
+            phone,
+            longitude,
+            latitude,
+            geom
+        from {TABLE}
+        """
+    )
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--cache-dir",
+        default="output",
+        help="Directory to cache raw Overpass responses (default: output/).",
+    )
+    parser.add_argument(
+        "--no-cache",
+        action="store_true",
+        help="Do not read or write Overpass cache files; always hit the API.",
+    )
+    parser.add_argument(
+        "--recreate",
+        action="store_true",
+        help=f"Drop and recreate {TABLE} before loading.",
+    )
+    parser.add_argument(
+        "--upsert",
+        action="store_true",
+        default=True,
+        help="On id conflicts, update the existing row (default: on).",
+    )
+    parser.add_argument(
+        "--skip-view",
+        action="store_true",
+        help=f"Do not create/replace the unioned view {VIEW}.",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+
+    os.makedirs(args.cache_dir, exist_ok=True)
+    merged: Dict[str, dict] = {}
+    for tag_key, tag_value in TAG_PASSES:
+        cache_path = (
+            None
+            if args.no_cache
+            else os.path.join(args.cache_dir, f"overpass_{tag_key}_{tag_value}.json")
+        )
+        print(f"Pass: {tag_key}={tag_value}")
+        elements = fetch_pass(tag_key, tag_value, cache_path)
+        normalized = [
+            row for row in (normalize_element(e, f"{tag_key}={tag_value}") for e in elements)
+            if row is not None
+        ]
+        print(f"  normalized {len(normalized)} rows with coords")
+        merge_records(merged, normalized)
+        # be polite to Overpass between passes
+        time.sleep(2)
+
+    rows = list(merged.values())
+    print(f"Total deduped OSM data-center features: {len(rows)}")
+    if not rows:
+        print("No rows fetched; aborting DB load.", file=sys.stderr)
+        return 1
+
+    conn = psycopg2.connect(
+        host=os.environ["PGWEB_HOST"],
+        port=os.environ["PGWEB_PORT"],
+        user=os.environ["PGWEB_USER"],
+        password=os.environ["PGWEB_PASSWORD"],
+        dbname=DB_NAME,
+    )
+    try:
+        with conn:
+            with conn.cursor() as cur:
+                cur.execute("create extension if not exists postgis")
+                if args.recreate:
+                    cur.execute(f"drop table if exists {TABLE} cascade")
+                cur.execute("select to_regclass(%s)", (TABLE,))
+                if cur.fetchone()[0] is None:
+                    create_table(cur)
+                insert_values(cur, rows, upsert=args.upsert)
+                cur.execute(f"analyze {TABLE}")
+                if not args.skip_view:
+                    cur.execute("select to_regclass(%s)", (CURATED_TABLE,))
+                    if cur.fetchone()[0] is not None:
+                        create_or_replace_view(cur)
+                        print(f"View {VIEW} (re)created.")
+                    else:
+                        print(
+                            f"Skipping view: {CURATED_TABLE} does not exist.",
+                            file=sys.stderr,
+                        )
+                cur.execute(f"select count(*) from {TABLE}")
+                total = cur.fetchone()[0]
+    finally:
+        conn.close()
+
+    print(f"Loaded {len(rows)} rows into {TABLE}; table now has {total} rows total.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())