data-centers/spatial_clustering_master_data_centers.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {},
   "source": [
    "# Spatial Clustering of Master Data Centers\n",
    "\n",
    "This notebook analyzes spatial clustering in `public.master_data_centers`, the merged curated + OSM table built by `build_master_data_centers.py`.\n",
    "\n",
    "Main workflow:\n",
    "- Load data-center points from PostGIS.\n",
    "- Diagnose nearest-neighbor distances and spatial concentration.\n",
    "- Run DBSCAN with haversine distances so distances are in kilometers rather than degrees.\n",
    "- Sweep clustering parameters to check sensitivity.\n",
    "- Summarize clusters by geography, source, operator, and scale.\n",
    "- Optionally write cluster labels back to PostGIS for downstream mapping or joins.\n",
    "\n",
    "Expected environment variables: `PGWEB_HOST`, `PGWEB_PORT`, `PGWEB_USER`, `PGWEB_PASSWORD`. Optional: `PGDATABASE`, which defaults to `data_centers`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "os.environ.setdefault('MPLCONFIGDIR', '/tmp/matplotlib')\n",
    "Path(os.environ['MPLCONFIGDIR']).mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import psycopg2\n",
    "from psycopg2 import sql\n",
    "from psycopg2.extras import execute_values\n",
    "\n",
    "try:\n",
    "    import matplotlib.pyplot as plt\n",
    "    HAS_MATPLOTLIB = True\n",
    "except ImportError:\n",
    "    plt = None\n",
    "    HAS_MATPLOTLIB = False\n",
    "\n",
    "pd.set_option('display.max_columns', 80)\n",
    "pd.set_option('display.max_rows', 120)\n",
    "if HAS_MATPLOTLIB:\n",
    "    plt.rcParams['figure.figsize'] = (10, 6)\n",
    "\n",
    "EARTH_RADIUS_KM = 6371.0088\n",
    "\n",
    "print('pandas:', pd.__version__)\n",
    "print('numpy:', np.__version__)\n",
    "print('matplotlib:', 'ok' if HAS_MATPLOTLIB else 'not installed; plot cells will be skipped')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_env_file(env_path: str = '.env') -> None:\n",
    "    p = Path(env_path)\n",
    "    if not p.exists():\n",
    "        print(f'No {env_path} file found in {Path.cwd()}')\n",
    "        return\n",
    "\n",
    "    loaded = 0\n",
    "    for raw_line in p.read_text(encoding='utf-8').splitlines():\n",
    "        line = raw_line.strip()\n",
    "        if not line or line.startswith('#') or '=' not in line:\n",
    "            continue\n",
    "        key, value = line.split('=', 1)\n",
    "        key = key.strip()\n",
    "        value = value.strip().strip('\"').strip(\"'\")\n",
    "        if key and key not in os.environ:\n",
    "            os.environ[key] = value\n",
    "            loaded += 1\n",
    "    print(f'Loaded {loaded} env var(s) from {env_path}')\n",
    "\n",
    "\n",
    "def require_env(keys):\n",
    "    missing = [k for k in keys if not os.getenv(k)]\n",
    "    if missing:\n",
    "        raise EnvironmentError(\n",
    "            'Missing required env vars in notebook kernel: ' + ', '.join(missing) +\n",
    "            '.\\nSet them in this notebook, or add them to a .env file in this folder.'\n",
    "        )\n",
    "\n",
    "\n",
    "load_env_file('.env')\n",
    "print('PGWEB_HOST:', os.getenv('PGWEB_HOST', '<not set>'))\n",
    "print('PGWEB_PORT:', os.getenv('PGWEB_PORT', '<not set>'))\n",
    "print('PGWEB_USER:', os.getenv('PGWEB_USER', '<not set>'))\n",
    "\n",
    "# Connection setup: mirrors postgis_table_loader.ipynb and the existing scripts in this repository.\n",
    "required_keys = ['PGWEB_HOST', 'PGWEB_PORT', 'PGWEB_USER', 'PGWEB_PASSWORD']\n",
    "require_env(required_keys)\n",
    "\n",
    "DB_NAME = os.getenv('PGDATABASE', 'data_centers')\n",
    "MASTER_TABLE = 'public.master_data_centers'\n",
    "CLUSTER_OUTPUT_TABLE = 'public.master_data_center_spatial_clusters'\n",
    "\n",
    "\n",
    "def get_conn():\n",
    "    return psycopg2.connect(\n",
    "        host=os.environ['PGWEB_HOST'],\n",
    "        port=os.environ['PGWEB_PORT'],\n",
    "        user=os.environ['PGWEB_USER'],\n",
    "        password=os.environ['PGWEB_PASSWORD'],\n",
    "        dbname=\"data_centers\",\n",
    "    )\n",
    "\n",
    "\n",
    "with get_conn() as conn:\n",
    "    with conn.cursor() as cur:\n",
    "        cur.execute('select current_database(), current_user, version()')\n",
    "        db, usr, ver = cur.fetchone()\n",
    "        print('Connected to DB:', db)\n",
    "        print('As user:', usr)\n",
    "        print('Postgres:', ver.split(',')[0])\n",
    "        cur.execute('create extension if not exists postgis')\n",
    "        print('PostGIS extension is available')\n",
    "        cur.execute('select to_regclass(%s)', (MASTER_TABLE,))\n",
    "        if cur.fetchone()[0] is None:\n",
    "            raise RuntimeError(f'{MASTER_TABLE} does not exist. Run build_master_data_centers.py first.')\n",
    "        cur.execute('select count(*) from public.master_data_centers')\n",
    "        print(f'{MASTER_TABLE} rows:', f'{cur.fetchone()[0]:,}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3",
   "metadata": {},
   "source": [
    "## Analysis Parameters\n",
    "\n",
    "`EPS_KM` is the neighborhood radius for DBSCAN. `MIN_SAMPLES` is the minimum number of data centers needed in that radius to form a dense cluster. Treat these as research choices: the parameter sweep below helps show how stable the cluster structure is."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4",
   "metadata": {},
   "outputs": [],
   "source": [
    "EPS_KM = 25\n",
    "MIN_SAMPLES = 5\n",
    "PROJECTION_SRID = 5070  # NAD83 / Conus Albers; meters, good for contiguous US clustering.\n",
    "\n",
    "# Lower 48-ish bounding box. Set to False if you want Alaska, Hawaii, PR, etc. included.\n",
    "FILTER_CONTIGUOUS_US = True\n",
    "\n",
    "SWEEP_EPS_KM = [2, 5, 10, 15, 25, 50, 100]\n",
    "SWEEP_MIN_SAMPLES = [3, 5, 10]\n",
    "\n",
    "print(f'DBSCAN eps={EPS_KM} km, min_samples={MIN_SAMPLES}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def master_points_where(alias: str = '') -> str:\n",
    "    prefix = f'{alias}.' if alias else ''\n",
    "    clauses = [\n",
    "        f'{prefix}longitude is not null',\n",
    "        f'{prefix}latitude is not null',\n",
    "        f'{prefix}geom is not null',\n",
    "    ]\n",
    "    if FILTER_CONTIGUOUS_US:\n",
    "        clauses.extend([\n",
    "            f'{prefix}longitude between -125 and -66',\n",
    "            f'{prefix}latitude between 24 and 50',\n",
    "        ])\n",
    "    return ' and '.join(clauses)\n",
    "\n",
    "\n",
    "sql_query = f\"\"\"\n",
    "select\n",
    "    master_id,\n",
    "    source,\n",
    "    curated_id,\n",
    "    osm_id,\n",
    "    name,\n",
    "    operator,\n",
    "    city,\n",
    "    state,\n",
    "    country,\n",
    "    power_mw,\n",
    "    area_sqft,\n",
    "    has_bare_metal,\n",
    "    has_iaas,\n",
    "    has_internet_exchange,\n",
    "    has_colocation,\n",
    "    longitude,\n",
    "    latitude,\n",
    "    ST_AsText(geom) as geom_wkt\n",
    "from public.master_data_centers\n",
    "where {master_points_where()}\n",
    "\"\"\"\n",
    "\n",
    "with get_conn() as conn:\n",
    "    dc = pd.read_sql_query(sql_query, conn)\n",
    "\n",
    "dc['longitude'] = pd.to_numeric(dc['longitude'], errors='coerce')\n",
    "dc['latitude'] = pd.to_numeric(dc['latitude'], errors='coerce')\n",
    "dc = dc.dropna(subset=['longitude', 'latitude']).copy()\n",
    "\n",
    "dc['state'] = dc['state'].fillna('UNK').replace('', 'UNK')\n",
    "dc['operator'] = dc['operator'].fillna('Unknown').replace('', 'Unknown')\n",
    "dc['city'] = dc['city'].fillna('Unknown').replace('', 'Unknown')\n",
    "\n",
    "print(f'Loaded {len(dc):,} points for clustering')\n",
    "display(dc.head())\n",
    "display(dc['source'].value_counts(dropna=False).rename_axis('source').reset_index(name='rows'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6",
   "metadata": {},
   "source": [
    "## Baseline Geography\n",
    "\n",
    "Before clustering, inspect where the observations are and how concentrated they already are by state/city/source."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {},
   "outputs": [],
   "source": [
    "state_counts = (\n",
    "    dc.groupby('state', dropna=False)\n",
    "      .size()\n",
    "      .sort_values(ascending=False)\n",
    "      .rename('data_centers')\n",
    "      .reset_index()\n",
    ")\n",
    "display(state_counts.head(25))\n",
    "\n",
    "city_counts = (\n",
    "    dc.groupby(['state', 'city'], dropna=False)\n",
    "      .size()\n",
    "      .sort_values(ascending=False)\n",
    "      .rename('data_centers')\n",
    "      .reset_index()\n",
    ")\n",
    "display(city_counts.head(25))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8",
   "metadata": {},
   "outputs": [],
   "source": [
    "if HAS_MATPLOTLIB:\n",
    "    fig, ax = plt.subplots(figsize=(11, 7))\n",
    "    for source, group in dc.groupby('source'):\n",
    "        ax.scatter(group['longitude'], group['latitude'], s=12, alpha=0.65, label=source)\n",
    "    ax.set_title('Master Data Centers by Source')\n",
    "    ax.set_xlabel('Longitude')\n",
    "    ax.set_ylabel('Latitude')\n",
    "    ax.legend(title='source')\n",
    "    ax.grid(True, linewidth=0.3, alpha=0.4)\n",
    "    plt.show()\n",
    "else:\n",
    "    print('matplotlib is not installed; skipping static source scatter plot.')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9",
   "metadata": {},
   "source": [
    "## Nearest-Neighbor Distance Diagnostics\n",
    "\n",
    "Nearest-neighbor distances are useful for choosing DBSCAN parameters. If many points have a nearest neighbor inside 5-25 km, those radii will detect metro-scale clusters; larger radii begin to merge nearby metros and corridors. Distances are computed in PostGIS using geography meters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10",
   "metadata": {},
   "outputs": [],
   "source": [
    "nearest_neighbor_sql = f\"\"\"\n",
    "with points as (\n",
    "    select\n",
    "        master_id,\n",
    "        geom,\n",
    "        ST_Transform(geom, %s) as geom_m\n",
    "    from public.master_data_centers\n",
    "    where {master_points_where()}\n",
    ")\n",
    "select\n",
    "    p.master_id,\n",
    "    q.master_id as nearest_neighbor_id,\n",
    "    ST_Distance(p.geom::geography, q.geom::geography) / 1000.0 as nearest_neighbor_km\n",
    "from points p\n",
    "left join lateral (\n",
    "    select q.master_id, q.geom\n",
    "    from points q\n",
    "    where q.master_id <> p.master_id\n",
    "    order by p.geom_m <-> q.geom_m\n",
    "    limit 1\n",
    ") q on true\n",
    "\"\"\"\n",
    "\n",
    "with get_conn() as conn:\n",
    "    nn = pd.read_sql_query(nearest_neighbor_sql, conn, params=(PROJECTION_SRID,))\n",
    "\n",
    "dc = dc.drop(columns=[c for c in ['nearest_neighbor_id', 'nearest_neighbor_km'] if c in dc.columns])\n",
    "dc = dc.merge(nn, on='master_id', how='left')\n",
    "\n",
    "display(dc['nearest_neighbor_km'].describe(percentiles=[.05, .10, .25, .50, .75, .90, .95, .99]).to_frame())\n",
    "\n",
    "thresholds = [1, 2, 5, 10, 25, 50, 100]\n",
    "nn_thresholds = pd.DataFrame({\n",
    "    'within_km': thresholds,\n",
    "    'share_of_points': [(dc['nearest_neighbor_km'] <= km).mean() for km in thresholds],\n",
    "    'point_count': [(dc['nearest_neighbor_km'] <= km).sum() for km in thresholds],\n",
    "})\n",
    "display(nn_thresholds)\n",
    "\n",
    "if HAS_MATPLOTLIB:\n",
    "    fig, ax = plt.subplots(figsize=(9, 5))\n",
    "    dc['nearest_neighbor_km'].clip(upper=200).hist(bins=50, ax=ax)\n",
    "    ax.axvline(EPS_KM, color='crimson', linewidth=2, label=f'EPS_KM={EPS_KM}')\n",
    "    ax.set_title('Nearest-Neighbor Distance Distribution (clipped at 200 km)')\n",
    "    ax.set_xlabel('Nearest neighbor distance, km')\n",
    "    ax.set_ylabel('Data centers')\n",
    "    ax.legend()\n",
    "    plt.show()\n",
    "else:\n",
    "    print('matplotlib is not installed; skipping nearest-neighbor histogram.')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "11",
   "metadata": {},
   "source": [
    "## DBSCAN Clustering\n",
    "\n",
    "DBSCAN is a good first-pass method here because it finds irregularly shaped dense groups and labels isolated points as noise. The implementation below uses PostGIS `ST_ClusterDBSCAN` on `ST_Transform(geom, 5070)`, so `eps` is specified in meters after projecting to a continental-US equal-area CRS."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dbscan_labels(eps_km: float, min_samples: int) -> pd.DataFrame:\n",
    "    cluster_sql = f\"\"\"\n",
    "    with points as (\n",
    "        select\n",
    "            master_id,\n",
    "            ST_Transform(geom, %s) as geom_m\n",
    "        from public.master_data_centers\n",
    "        where {master_points_where()}\n",
    "    ), clustered as (\n",
    "        select\n",
    "            master_id,\n",
    "            ST_ClusterDBSCAN(geom_m, %s, %s) over (order by master_id) as raw_cluster_id\n",
    "        from points\n",
    "    )\n",
    "    select\n",
    "        master_id,\n",
    "        coalesce(raw_cluster_id, -1) as cluster_id\n",
    "    from clustered\n",
    "    \"\"\"\n",
    "    with get_conn() as conn:\n",
    "        labels = pd.read_sql_query(\n",
    "            cluster_sql,\n",
    "            conn,\n",
    "            params=(PROJECTION_SRID, eps_km * 1000.0, min_samples),\n",
    "        )\n",
    "    labels['cluster_id'] = labels['cluster_id'].astype(int)\n",
    "    return labels\n",
    "\n",
    "\n",
    "labels = load_dbscan_labels(EPS_KM, MIN_SAMPLES)\n",
    "dc = dc.drop(columns=[c for c in ['cluster_id', 'is_noise'] if c in dc.columns])\n",
    "dc = dc.merge(labels, on='master_id', how='left')\n",
    "dc['cluster_id'] = dc['cluster_id'].fillna(-1).astype(int)\n",
    "dc['is_noise'] = dc['cluster_id'].eq(-1)\n",
    "\n",
    "n_clusters = dc.loc[~dc['is_noise'], 'cluster_id'].nunique()\n",
    "n_noise = int(dc['is_noise'].sum())\n",
    "print(f'Clusters found: {n_clusters:,}')\n",
    "print(f'Noise / isolated points: {n_noise:,} ({n_noise / len(dc):.1%})')\n",
    "display(dc['cluster_id'].value_counts().head(20).rename_axis('cluster_id').reset_index(name='points'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13",
   "metadata": {},
   "outputs": [],
   "source": [
    "def haversine_km(lat1, lon1, lat2, lon2):\n",
    "    lat1 = np.radians(lat1)\n",
    "    lon1 = np.radians(lon1)\n",
    "    lat2 = np.radians(lat2)\n",
    "    lon2 = np.radians(lon2)\n",
    "    dlat = lat2 - lat1\n",
    "    dlon = lon2 - lon1\n",
    "    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2\n",
    "    return 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(a))\n",
    "\n",
    "\n",
    "def mode_list(values, n=5):\n",
    "    s = pd.Series(values).dropna().astype(str)\n",
    "    s = s[s.ne('')]\n",
    "    if s.empty:\n",
    "        return ''\n",
    "    return '; '.join(f'{k} ({v})' for k, v in s.value_counts().head(n).items())\n",
    "\n",
    "\n",
    "cluster_summary_columns = [\n",
    "    'cluster_id', 'point_count', 'centroid_latitude', 'centroid_longitude',\n",
    "    'radius_km_p50', 'radius_km_p90', 'radius_km_max', 'states', 'cities',\n",
    "    'sources', 'operators', 'total_power_mw_known', 'known_power_rows',\n",
    "    'total_area_sqft_known', 'known_area_rows',\n",
    "]\n",
    "\n",
    "cluster_rows = []\n",
    "for cid, group in dc.loc[~dc['is_noise']].groupby('cluster_id'):\n",
    "    centroid_lat = group['latitude'].mean()\n",
    "    centroid_lon = group['longitude'].mean()\n",
    "    distances = haversine_km(group['latitude'], group['longitude'], centroid_lat, centroid_lon)\n",
    "    cluster_rows.append({\n",
    "        'cluster_id': int(cid),\n",
    "        'point_count': len(group),\n",
    "        'centroid_latitude': centroid_lat,\n",
    "        'centroid_longitude': centroid_lon,\n",
    "        'radius_km_p50': float(np.percentile(distances, 50)),\n",
    "        'radius_km_p90': float(np.percentile(distances, 90)),\n",
    "        'radius_km_max': float(np.max(distances)),\n",
    "        'states': mode_list(group['state'], n=8),\n",
    "        'cities': mode_list(group['city'], n=8),\n",
    "        'sources': mode_list(group['source'], n=4),\n",
    "        'operators': mode_list(group['operator'], n=8),\n",
    "        'total_power_mw_known': group['power_mw'].dropna().sum(),\n",
    "        'known_power_rows': group['power_mw'].notna().sum(),\n",
    "        'total_area_sqft_known': group['area_sqft'].dropna().sum(),\n",
    "        'known_area_rows': group['area_sqft'].notna().sum(),\n",
    "    })\n",
    "\n",
    "if cluster_rows:\n",
    "    clusters = pd.DataFrame(cluster_rows).sort_values(['point_count', 'radius_km_p90'], ascending=[False, True])\n",
    "else:\n",
    "    clusters = pd.DataFrame(columns=cluster_summary_columns)\n",
    "\n",
    "display(clusters.head(30))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14",
   "metadata": {},
   "outputs": [],
   "source": [
    "if HAS_MATPLOTLIB:\n",
    "    fig, ax = plt.subplots(figsize=(12, 8))\n",
    "    noise = dc['is_noise']\n",
    "    ax.scatter(dc.loc[noise, 'longitude'], dc.loc[noise, 'latitude'], s=8, c='lightgray', alpha=0.55, label='noise')\n",
    "\n",
    "    clustered = dc.loc[~noise].copy()\n",
    "    scatter = ax.scatter(\n",
    "        clustered['longitude'],\n",
    "        clustered['latitude'],\n",
    "        c=clustered['cluster_id'],\n",
    "        cmap='tab20',\n",
    "        s=18,\n",
    "        alpha=0.85,\n",
    "        label='clustered',\n",
    "    )\n",
    "    ax.set_title(f'DBSCAN Data-Center Clusters (eps={EPS_KM} km, min_samples={MIN_SAMPLES})')\n",
    "    ax.set_xlabel('Longitude')\n",
    "    ax.set_ylabel('Latitude')\n",
    "    ax.grid(True, linewidth=0.3, alpha=0.4)\n",
    "    plt.colorbar(scatter, ax=ax, label='cluster_id')\n",
    "    plt.show()\n",
    "else:\n",
    "    print('matplotlib is not installed; skipping static cluster scatter plot.')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15",
   "metadata": {},
   "source": [
    "## Parameter Sensitivity\n",
    "\n",
    "The sweep below asks whether the cluster story depends heavily on a single `eps` or `min_samples` choice."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16",
   "metadata": {},
   "outputs": [],
   "source": [
    "def dbscan_summary(eps_km: float, min_samples: int) -> dict:\n",
    "    summary_sql = f\"\"\"\n",
    "    with points as (\n",
    "        select master_id, ST_Transform(geom, %s) as geom_m\n",
    "        from public.master_data_centers\n",
    "        where {master_points_where()}\n",
    "    ), labels as (\n",
    "        select ST_ClusterDBSCAN(geom_m, %s, %s) over (order by master_id) as cid\n",
    "        from points\n",
    "    ), cluster_sizes as (\n",
    "        select cid, count(*)::integer as cluster_size\n",
    "        from labels\n",
    "        where cid is not null\n",
    "        group by cid\n",
    "    )\n",
    "    select\n",
    "        (select count(*)::integer from labels) as points,\n",
    "        (select count(distinct cid)::integer from labels where cid is not null) as clusters,\n",
    "        (select count(*)::integer from labels where cid is not null) as clustered_points,\n",
    "        (select count(*)::integer from labels where cid is null) as noise_points,\n",
    "        coalesce((select max(cluster_size)::integer from cluster_sizes), 0) as largest_cluster,\n",
    "        coalesce((select percentile_cont(0.5) within group (order by cluster_size) from cluster_sizes), 0) as median_cluster_size\n",
    "    \"\"\"\n",
    "    with get_conn() as conn:\n",
    "        row = pd.read_sql_query(\n",
    "            summary_sql,\n",
    "            conn,\n",
    "            params=(PROJECTION_SRID, eps_km * 1000.0, min_samples),\n",
    "        ).iloc[0].to_dict()\n",
    "    row['eps_km'] = eps_km\n",
    "    row['min_samples'] = min_samples\n",
    "    row['clustered_share'] = row['clustered_points'] / row['points'] if row['points'] else 0\n",
    "    return row\n",
    "\n",
    "\n",
    "sweep_rows = []\n",
    "for eps in SWEEP_EPS_KM:\n",
    "    for min_samples in SWEEP_MIN_SAMPLES:\n",
    "        sweep_rows.append(dbscan_summary(eps, min_samples))\n",
    "\n",
    "sweep = pd.DataFrame(sweep_rows)\n",
    "display(sweep)\n",
    "\n",
    "if HAS_MATPLOTLIB:\n",
    "    fig, axes = plt.subplots(1, 2, figsize=(13, 5), sharex=True)\n",
    "    for min_samples, group in sweep.groupby('min_samples'):\n",
    "        axes[0].plot(group['eps_km'], group['clusters'], marker='o', label=f'min={min_samples}')\n",
    "        axes[1].plot(group['eps_km'], group['clustered_share'], marker='o', label=f'min={min_samples}')\n",
    "    axes[0].set_title('Cluster Count by eps')\n",
    "    axes[0].set_ylabel('clusters')\n",
    "    axes[1].set_title('Share of Points Clustered by eps')\n",
    "    axes[1].set_ylabel('clustered share')\n",
    "    for ax in axes:\n",
    "        ax.set_xlabel('eps, km')\n",
    "        ax.grid(True, linewidth=0.3, alpha=0.4)\n",
    "        ax.legend()\n",
    "    plt.show()\n",
    "else:\n",
    "    print('matplotlib is not installed; skipping parameter-sweep line charts.')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17",
   "metadata": {},
   "source": [
    "## Cluster Profiles\n",
    "\n",
    "Use these tables to interpret clusters as local agglomerations, metro regions, corridors, or source-specific artifacts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18",
   "metadata": {},
   "outputs": [],
   "source": [
    "if clusters.empty:\n",
    "    cluster_source = pd.DataFrame(columns=['cluster_id', 'point_count'])\n",
    "else:\n",
    "    cluster_source = (\n",
    "        dc.loc[~dc['is_noise']]\n",
    "          .pivot_table(index='cluster_id', columns='source', values='master_id', aggfunc='count', fill_value=0)\n",
    "          .reset_index()\n",
    "    )\n",
    "    cluster_source = clusters[['cluster_id', 'point_count']].merge(cluster_source, on='cluster_id', how='left')\n",
    "display(cluster_source.head(30))\n",
    "\n",
    "cluster_state = (\n",
    "    dc.loc[~dc['is_noise']]\n",
    "      .groupby(['cluster_id', 'state'])\n",
    "      .size()\n",
    "      .rename('points')\n",
    "      .reset_index()\n",
    "      .sort_values(['cluster_id', 'points'], ascending=[True, False])\n",
    ")\n",
    "display(cluster_state.head(80))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the largest cluster in detail. Change CLUSTER_TO_INSPECT as needed.\n",
    "CLUSTER_TO_INSPECT = int(clusters.iloc[0]['cluster_id']) if len(clusters) else None\n",
    "\n",
    "if CLUSTER_TO_INSPECT is not None:\n",
    "    cols = [\n",
    "        'master_id', 'name', 'operator', 'city', 'state', 'source',\n",
    "        'longitude', 'latitude', 'power_mw', 'area_sqft', 'nearest_neighbor_km'\n",
    "    ]\n",
    "    detail = (\n",
    "        dc.loc[dc['cluster_id'].eq(CLUSTER_TO_INSPECT), cols]\n",
    "          .sort_values(['state', 'city', 'operator', 'name'], na_position='last')\n",
    "    )\n",
    "    print(f'Cluster {CLUSTER_TO_INSPECT}: {len(detail):,} points')\n",
    "    display(detail)\n",
    "else:\n",
    "    print('No clusters found with current parameters.')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20",
   "metadata": {},
   "source": [
    "## Statistical Analysis\n",
    "\n",
    "This section treats the DBSCAN result as an analytical object: how concentrated are data centers across clusters, how different are clustered points from isolated points, and are some sources/states disproportionately represented in dense clusters?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Helper functions for concentration metrics and lightweight statistical tests.\n",
    "def gini(values):\n",
    "    x = pd.Series(values).dropna().astype(float)\n",
    "    x = x[x >= 0].sort_values().to_numpy()\n",
    "    if len(x) == 0 or x.sum() == 0:\n",
    "        return np.nan\n",
    "    n = len(x)\n",
    "    index = np.arange(1, n + 1)\n",
    "    return float((2 * np.sum(index * x)) / (n * np.sum(x)) - (n + 1) / n)\n",
    "\n",
    "\n",
    "def hhi_from_counts(values):\n",
    "    x = pd.Series(values).dropna().astype(float)\n",
    "    total = x.sum()\n",
    "    if total == 0:\n",
    "        return np.nan\n",
    "    shares = x / total\n",
    "    return float(np.sum(shares ** 2))\n",
    "\n",
    "\n",
    "def normalized_entropy_from_counts(values):\n",
    "    x = pd.Series(values).dropna().astype(float)\n",
    "    x = x[x > 0]\n",
    "    if len(x) <= 1:\n",
    "        return 0.0 if len(x) == 1 else np.nan\n",
    "    shares = x / x.sum()\n",
    "    return float(-(shares * np.log(shares)).sum() / np.log(len(shares)))\n",
    "\n",
    "\n",
    "def numeric_summary_by_group(frame, group_col, value_cols):\n",
    "    rows = []\n",
    "    for col in value_cols:\n",
    "        for group, vals in frame.groupby(group_col)[col]:\n",
    "            vals = pd.to_numeric(vals, errors='coerce').dropna()\n",
    "            rows.append({\n",
    "                'variable': col,\n",
    "                group_col: group,\n",
    "                'n': int(vals.size),\n",
    "                'mean': float(vals.mean()) if vals.size else np.nan,\n",
    "                'median': float(vals.median()) if vals.size else np.nan,\n",
    "                'p25': float(vals.quantile(0.25)) if vals.size else np.nan,\n",
    "                'p75': float(vals.quantile(0.75)) if vals.size else np.nan,\n",
    "                'p90': float(vals.quantile(0.90)) if vals.size else np.nan,\n",
    "            })\n",
    "    return pd.DataFrame(rows)\n",
    "\n",
    "\n",
    "def permutation_mean_diff(frame, value_col, group_col='is_clustered', reps=5000, seed=42):\n",
    "    work = frame[[value_col, group_col]].copy()\n",
    "    work[value_col] = pd.to_numeric(work[value_col], errors='coerce')\n",
    "    work = work.dropna()\n",
    "    if work[group_col].nunique() != 2:\n",
    "        return {'variable': value_col, 'observed_mean_diff': np.nan, 'p_two_sided': np.nan, 'reps': reps, 'n': len(work)}\n",
    "\n",
    "    y = work[value_col].to_numpy(dtype=float)\n",
    "    g = work[group_col].to_numpy(dtype=bool)\n",
    "    observed = y[g].mean() - y[~g].mean()\n",
    "    rng = np.random.default_rng(seed)\n",
    "    more_extreme = 0\n",
    "    for _ in range(reps):\n",
    "        gp = rng.permutation(g)\n",
    "        diff = y[gp].mean() - y[~gp].mean()\n",
    "        if abs(diff) >= abs(observed):\n",
    "            more_extreme += 1\n",
    "    p = (more_extreme + 1) / (reps + 1)\n",
    "    return {'variable': value_col, 'observed_mean_diff': float(observed), 'p_two_sided': float(p), 'reps': reps, 'n': len(work)}\n",
    "\n",
    "\n",
    "def chi_square_independence(table):\n",
    "    observed = table.to_numpy(dtype=float)\n",
    "    total = observed.sum()\n",
    "    if observed.size == 0 or total == 0:\n",
    "        return {'chi2': np.nan, 'dof': np.nan, 'p_value': np.nan, 'cramers_v': np.nan}\n",
    "    row_totals = observed.sum(axis=1, keepdims=True)\n",
    "    col_totals = observed.sum(axis=0, keepdims=True)\n",
    "    expected = row_totals @ col_totals / total\n",
    "    mask = expected > 0\n",
    "    chi2 = float(((observed - expected) ** 2 / np.where(mask, expected, 1)).sum())\n",
    "    dof = int((observed.shape[0] - 1) * (observed.shape[1] - 1))\n",
    "    denom = total * max(1, min(observed.shape[0] - 1, observed.shape[1] - 1))\n",
    "    cramers_v = float(np.sqrt(chi2 / denom)) if denom else np.nan\n",
    "\n",
    "    p_value = np.nan\n",
    "    try:\n",
    "        from scipy.stats import chi2 as scipy_chi2\n",
    "        p_value = float(scipy_chi2.sf(chi2, dof))\n",
    "    except ImportError:\n",
    "        pass\n",
    "    return {'chi2': chi2, 'dof': dof, 'p_value': p_value, 'cramers_v': cramers_v}\n",
    "\n",
    "\n",
    "def categorical_permutation_chi_square(frame, category_col, group_col='is_clustered', reps=5000, seed=42):\n",
    "    work = frame[[category_col, group_col]].dropna().copy()\n",
    "    if work.empty or work[category_col].nunique() < 2 or work[group_col].nunique() < 2:\n",
    "        return {'permutation_p_value': np.nan, 'permutation_reps': reps}\n",
    "\n",
    "    categories = pd.Index(sorted(work[category_col].astype(str).unique()))\n",
    "    groups = pd.Index(sorted(work[group_col].unique()))\n",
    "    observed_table = pd.crosstab(work[category_col].astype(str), work[group_col]).reindex(\n",
    "        index=categories, columns=groups, fill_value=0\n",
    "    )\n",
    "    observed_chi2 = chi_square_independence(observed_table)['chi2']\n",
    "\n",
    "    rng = np.random.default_rng(seed)\n",
    "    labels = work[group_col].to_numpy()\n",
    "    cats = work[category_col].astype(str).to_numpy()\n",
    "    more_extreme = 0\n",
    "    for _ in range(reps):\n",
    "        permuted = rng.permutation(labels)\n",
    "        table = pd.crosstab(cats, permuted).reindex(index=categories, columns=groups, fill_value=0)\n",
    "        stat = chi_square_independence(table)['chi2']\n",
    "        if stat >= observed_chi2:\n",
    "            more_extreme += 1\n",
    "    return {'permutation_p_value': float((more_extreme + 1) / (reps + 1)), 'permutation_reps': reps}\n",
    "\n",
    "\n",
    "stats_dir = Path('output')\n",
    "stats_dir.mkdir(exist_ok=True)\n",
    "dc['is_clustered'] = ~dc['is_noise']\n",
    "print(f'Clustered points: {dc[\"is_clustered\"].sum():,} / {len(dc):,} ({dc[\"is_clustered\"].mean():.1%})')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overall concentration of data centers across DBSCAN clusters.\n",
    "cluster_sizes = dc.loc[dc['is_clustered'], 'cluster_id'].value_counts().sort_values(ascending=False)\n",
    "total_points = len(dc)\n",
    "total_clustered = int(cluster_sizes.sum())\n",
    "\n",
    "stat_rows = [\n",
    "    {'metric': 'total_points', 'value': total_points},\n",
    "    {'metric': 'clustered_points', 'value': total_clustered},\n",
    "    {'metric': 'noise_points', 'value': int((~dc['is_clustered']).sum())},\n",
    "    {'metric': 'clustered_share', 'value': dc['is_clustered'].mean()},\n",
    "    {'metric': 'cluster_count', 'value': int(cluster_sizes.size)},\n",
    "    {'metric': 'largest_cluster_points', 'value': int(cluster_sizes.iloc[0]) if len(cluster_sizes) else 0},\n",
    "    {'metric': 'largest_cluster_share_of_all_points', 'value': float(cluster_sizes.iloc[0] / total_points) if len(cluster_sizes) else 0},\n",
    "    {'metric': 'largest_cluster_share_of_clustered_points', 'value': float(cluster_sizes.iloc[0] / total_clustered) if total_clustered else 0},\n",
    "    {'metric': 'top_5_clusters_share_of_clustered_points', 'value': float(cluster_sizes.head(5).sum() / total_clustered) if total_clustered else 0},\n",
    "    {'metric': 'cluster_size_gini', 'value': gini(cluster_sizes)},\n",
    "    {'metric': 'cluster_size_hhi', 'value': hhi_from_counts(cluster_sizes)},\n",
    "    {'metric': 'cluster_size_normalized_entropy', 'value': normalized_entropy_from_counts(cluster_sizes)},\n",
    "]\n",
    "cluster_concentration_stats = pd.DataFrame(stat_rows)\n",
    "display(cluster_concentration_stats)\n",
    "\n",
    "cluster_concentration_stats.to_csv(stats_dir / 'master_data_center_cluster_concentration_stats.csv', index=False)\n",
    "print('Wrote:', stats_dir / 'master_data_center_cluster_concentration_stats.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare clustered vs. isolated/noise points.\n",
    "numeric_cols = ['nearest_neighbor_km', 'area_sqft', 'power_mw']\n",
    "clustered_vs_noise_numeric = numeric_summary_by_group(dc, 'is_clustered', numeric_cols)\n",
    "display(clustered_vs_noise_numeric)\n",
    "\n",
    "permutation_tests = pd.DataFrame([\n",
    "    permutation_mean_diff(dc, 'nearest_neighbor_km'),\n",
    "    permutation_mean_diff(dc, 'area_sqft'),\n",
    "    permutation_mean_diff(dc, 'power_mw'),\n",
    "])\n",
    "display(permutation_tests)\n",
    "\n",
    "clustered_vs_noise_numeric.to_csv(stats_dir / 'master_data_center_clustered_vs_noise_numeric_summary.csv', index=False)\n",
    "permutation_tests.to_csv(stats_dir / 'master_data_center_clustered_vs_noise_permutation_tests.csv', index=False)\n",
    "print('Wrote:', stats_dir / 'master_data_center_clustered_vs_noise_numeric_summary.csv')\n",
    "print('Wrote:', stats_dir / 'master_data_center_clustered_vs_noise_permutation_tests.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Source enrichment: are curated/merged/OSM rows clustered at different rates?\n",
    "source_table = pd.crosstab(dc['source'], dc['is_clustered'])\n",
    "source_table.columns = ['noise_or_isolated', 'clustered'] if source_table.shape[1] == 2 else source_table.columns\n",
    "source_rates = source_table.copy()\n",
    "source_rates['total'] = source_rates.sum(axis=1)\n",
    "source_rates['clustered_share'] = source_rates.get('clustered', 0) / source_rates['total']\n",
    "source_rates = source_rates.sort_values('clustered_share', ascending=False)\n",
    "\n",
    "test = chi_square_independence(source_table)\n",
    "perm_test = categorical_permutation_chi_square(dc, 'source', 'is_clustered')\n",
    "source_test = pd.DataFrame([{'test': 'source_by_clustered', **test, **perm_test}])\n",
    "\n",
    "display(source_rates)\n",
    "display(source_test)\n",
    "\n",
    "source_rates.reset_index().to_csv(stats_dir / 'master_data_center_source_cluster_enrichment.csv', index=False)\n",
    "source_test.to_csv(stats_dir / 'master_data_center_source_cluster_chi_square.csv', index=False)\n",
    "print('Wrote:', stats_dir / 'master_data_center_source_cluster_enrichment.csv')\n",
    "print('Wrote:', stats_dir / 'master_data_center_source_cluster_chi_square.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# State-level clustering profile.\n",
    "state_cluster_stats = (\n",
    "    dc.groupby('state')\n",
    "      .agg(\n",
    "          points=('master_id', 'count'),\n",
    "          clustered_points=('is_clustered', 'sum'),\n",
    "          clusters=('cluster_id', lambda s: s[s.ne(-1)].nunique()),\n",
    "          median_nearest_neighbor_km=('nearest_neighbor_km', 'median'),\n",
    "          mean_nearest_neighbor_km=('nearest_neighbor_km', 'mean'),\n",
    "          known_power_rows=('power_mw', lambda s: s.notna().sum()),\n",
    "          total_power_mw_known=('power_mw', 'sum'),\n",
    "          known_area_rows=('area_sqft', lambda s: s.notna().sum()),\n",
    "          total_area_sqft_known=('area_sqft', 'sum'),\n",
    "      )\n",
    "      .reset_index()\n",
    ")\n",
    "state_cluster_stats['clustered_share'] = state_cluster_stats['clustered_points'] / state_cluster_stats['points']\n",
    "state_cluster_stats = state_cluster_stats.sort_values(['points', 'clustered_share'], ascending=[False, False])\n",
    "display(state_cluster_stats.head(30))\n",
    "\n",
    "state_cluster_stats.to_csv(stats_dir / 'master_data_center_state_cluster_statistics.csv', index=False)\n",
    "print('Wrote:', stats_dir / 'master_data_center_state_cluster_statistics.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional plots for the statistical summaries.\n",
    "if HAS_MATPLOTLIB:\n",
    "    fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
    "    cluster_sizes.head(25).sort_values().plot(kind='barh', ax=axes[0])\n",
    "    axes[0].set_title('Largest DBSCAN Clusters')\n",
    "    axes[0].set_xlabel('data centers')\n",
    "    axes[0].set_ylabel('cluster_id')\n",
    "\n",
    "    dc.boxplot(column='nearest_neighbor_km', by='is_clustered', ax=axes[1], showfliers=False)\n",
    "    axes[1].set_title('Nearest-Neighbor Distance by Cluster Status')\n",
    "    axes[1].set_xlabel('is_clustered')\n",
    "    axes[1].set_ylabel('km')\n",
    "    fig.suptitle('')\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    top_states = state_cluster_stats.head(20).sort_values('points')\n",
    "    fig, ax = plt.subplots(figsize=(9, 7))\n",
    "    ax.barh(top_states['state'], top_states['points'], label='all points', color='#9ca3af')\n",
    "    ax.barh(top_states['state'], top_states['clustered_points'], label='clustered points', color='#2563eb')\n",
    "    ax.set_title('Clustered Data Centers by State')\n",
    "    ax.set_xlabel('data centers')\n",
    "    ax.legend()\n",
    "    plt.show()\n",
    "else:\n",
    "    print('matplotlib is not installed; skipping statistical plots.')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27",
   "metadata": {},
   "source": [
    "## Optional Interactive Map\n",
    "\n",
    "If `folium` is installed, this cell renders an interactive map. Large datasets can be slow, so the popup text is intentionally compact."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28",
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    import folium\n",
    "    from html import escape\n",
    "\n",
    "    if 'dc' not in globals():\n",
    "        dc = pd.read_csv('output/master_data_center_spatial_cluster_points.csv')\n",
    "    if 'clusters' not in globals():\n",
    "        clusters = pd.read_csv('output/master_data_center_spatial_cluster_summary.csv')\n",
    "    if 'n_clusters' not in globals():\n",
    "        n_clusters = dc.loc[dc['cluster_id'].ne(-1), 'cluster_id'].nunique()\n",
    "    if 'is_noise' not in dc.columns:\n",
    "        dc['is_noise'] = dc['cluster_id'].eq(-1)\n",
    "\n",
    "    m = folium.Map(location=[39, -98], zoom_start=4, tiles='CartoDB positron')\n",
    "\n",
    "    cluster_info = {}\n",
    "    if not clusters.empty:\n",
    "        ranked_clusters = clusters.reset_index(drop=True).copy()\n",
    "        ranked_clusters['cluster_rank'] = ranked_clusters.index + 1\n",
    "        cluster_info = ranked_clusters.set_index('cluster_id').to_dict('index')\n",
    "\n",
    "    cluster_colors = [\n",
    "        '#2563eb', '#dc2626', '#16a34a', '#9333ea', '#ea580c', '#0891b2',\n",
    "        '#be123c', '#4f46e5', '#65a30d', '#c026d3', '#0f766e', '#b45309',\n",
    "    ]\n",
    "\n",
    "    def clean_popup_value(value):\n",
    "        if pd.isna(value):\n",
    "            return ''\n",
    "        return escape(str(value))\n",
    "\n",
    "    for row in dc.itertuples(index=False):\n",
    "        info = cluster_info.get(row.cluster_id, {})\n",
    "        if row.cluster_id == -1:\n",
    "            color = '#9ca3af'\n",
    "            radius = 3\n",
    "            cluster_label = 'Noise / isolated'\n",
    "            cluster_size = '1'\n",
    "            cluster_rank = ''\n",
    "        else:\n",
    "            rank = int(info.get('cluster_rank', row.cluster_id + 1))\n",
    "            color = cluster_colors[(rank - 1) % len(cluster_colors)]\n",
    "            radius = 5\n",
    "            cluster_label = f'Cluster ID {row.cluster_id}'\n",
    "            cluster_size = f\"{int(info.get('point_count', 0)):,}\"\n",
    "            cluster_rank = f\"Rank {rank} of {n_clusters} by size\"\n",
    "\n",
    "        popup_html = f'''\n",
    "            <div style=\"font-family: system-ui, sans-serif; min-width: 260px;\">\n",
    "              <strong>{clean_popup_value(row.name) or clean_popup_value(row.master_id)}</strong><br>\n",
    "              {clean_popup_value(row.city)}, {clean_popup_value(row.state)}<br>\n",
    "              <hr style=\"margin: 6px 0;\">\n",
    "              <strong>{cluster_label}</strong><br>\n",
    "              {cluster_rank}<br>\n",
    "              Cluster size: {cluster_size} data center(s)<br>\n",
    "              Source: {clean_popup_value(row.source)}<br>\n",
    "              Operator: {clean_popup_value(row.operator)}<br>\n",
    "              Nearest neighbor: {row.nearest_neighbor_km:.2f} km\n",
    "            </div>\n",
    "        '''\n",
    "        popup = folium.Popup(popup_html, max_width=360)\n",
    "\n",
    "        folium.CircleMarker(\n",
    "            location=[row.latitude, row.longitude],\n",
    "            radius=radius,\n",
    "            color=color,\n",
    "            fill=True,\n",
    "            fill_opacity=0.75,\n",
    "            weight=1,\n",
    "            popup=popup,\n",
    "            tooltip=f'{cluster_label}; size={cluster_size}',\n",
    "        ).add_to(m)\n",
    "    display(m)\n",
    "except ImportError:\n",
    "    print('folium is not installed; skipping interactive map.')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "29",
   "metadata": {},
   "source": [
    "## Export Results\n",
    "\n",
    "The first cell writes local CSVs. The second cell can write a compact cluster-label table back to PostGIS. Keep `WRITE_BACK_TO_DB = False` until you are happy with the selected parameters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30",
   "metadata": {},
   "outputs": [],
   "source": [
    "points_out = Path('output/master_data_center_spatial_cluster_points.csv')\n",
    "clusters_out = Path('output/master_data_center_spatial_cluster_summary.csv')\n",
    "\n",
    "dc.to_csv(points_out, index=False)\n",
    "clusters.to_csv(clusters_out, index=False)\n",
    "\n",
    "print('Wrote:', points_out)\n",
    "print('Wrote:', clusters_out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31",
   "metadata": {},
   "outputs": [],
   "source": [
    "WRITE_BACK_TO_DB = False\n",
    "\n",
    "if WRITE_BACK_TO_DB:\n",
    "    rows = [\n",
    "        (\n",
    "            r.master_id,\n",
    "            int(r.cluster_id),\n",
    "            bool(r.is_noise),\n",
    "            float(EPS_KM),\n",
    "            int(MIN_SAMPLES),\n",
    "            float(r.nearest_neighbor_km) if pd.notna(r.nearest_neighbor_km) else None,\n",
    "        )\n",
    "        for r in dc.itertuples(index=False)\n",
    "    ]\n",
    "\n",
    "    with get_conn() as conn:\n",
    "        with conn.cursor() as cur:\n",
    "            cur.execute(sql.SQL('drop table if exists {}').format(sql.SQL(CLUSTER_OUTPUT_TABLE)))\n",
    "            cur.execute(sql.SQL('''\n",
    "                create table {} (\n",
    "                    master_id text primary key references public.master_data_centers(master_id),\n",
    "                    cluster_id integer not null,\n",
    "                    is_noise boolean not null,\n",
    "                    eps_km double precision not null,\n",
    "                    min_samples integer not null,\n",
    "                    nearest_neighbor_km double precision,\n",
    "                    created_at timestamptz not null default now()\n",
    "                )\n",
    "            ''').format(sql.SQL(CLUSTER_OUTPUT_TABLE)))\n",
    "            execute_values(\n",
    "                cur,\n",
    "                sql.SQL('''\n",
    "                    insert into {} (\n",
    "                        master_id, cluster_id, is_noise, eps_km, min_samples, nearest_neighbor_km\n",
    "                    ) values %s\n",
    "                ''').format(sql.SQL(CLUSTER_OUTPUT_TABLE)).as_string(conn),\n",
    "                rows,\n",
    "                page_size=5000,\n",
    "            )\n",
    "            cur.execute(sql.SQL('create index on {} (cluster_id)').format(sql.SQL(CLUSTER_OUTPUT_TABLE)))\n",
    "            cur.execute(sql.SQL('analyze {}').format(sql.SQL(CLUSTER_OUTPUT_TABLE)))\n",
    "    print(f'Wrote {len(rows):,} rows to {CLUSTER_OUTPUT_TABLE}')\n",
    "else:\n",
    "    print('WRITE_BACK_TO_DB is False; no database table was modified.')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.14.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}