Files
data-centers/spatial_clustering_master_data_centers.ipynb

1139 lines
45 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "0",
"metadata": {},
"source": [
"# Spatial Clustering of Master Data Centers\n",
"\n",
"This notebook analyzes spatial clustering in `public.master_data_centers`, the merged curated + OSM table built by `build_master_data_centers.py`.\n",
"\n",
"Main workflow:\n",
"- Load data-center points from PostGIS.\n",
"- Diagnose nearest-neighbor distances and spatial concentration.\n",
"- Run DBSCAN with haversine distances so distances are in kilometers rather than degrees.\n",
"- Sweep clustering parameters to check sensitivity.\n",
"- Summarize clusters by geography, source, operator, and scale.\n",
"- Optionally write cluster labels back to PostGIS for downstream mapping or joins.\n",
"\n",
"Expected environment variables: `PGWEB_HOST`, `PGWEB_PORT`, `PGWEB_USER`, `PGWEB_PASSWORD`. Optional: `PGDATABASE`, which defaults to `data_centers`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pathlib import Path\n",
"os.environ.setdefault('MPLCONFIGDIR', '/tmp/matplotlib')\n",
"Path(os.environ['MPLCONFIGDIR']).mkdir(parents=True, exist_ok=True)\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import psycopg2\n",
"from psycopg2 import sql\n",
"from psycopg2.extras import execute_values\n",
"\n",
"try:\n",
" import matplotlib.pyplot as plt\n",
" HAS_MATPLOTLIB = True\n",
"except ImportError:\n",
" plt = None\n",
" HAS_MATPLOTLIB = False\n",
"\n",
"pd.set_option('display.max_columns', 80)\n",
"pd.set_option('display.max_rows', 120)\n",
"if HAS_MATPLOTLIB:\n",
" plt.rcParams['figure.figsize'] = (10, 6)\n",
"\n",
"EARTH_RADIUS_KM = 6371.0088\n",
"\n",
"print('pandas:', pd.__version__)\n",
"print('numpy:', np.__version__)\n",
"print('matplotlib:', 'ok' if HAS_MATPLOTLIB else 'not installed; plot cells will be skipped')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2",
"metadata": {},
"outputs": [],
"source": [
"def load_env_file(env_path: str = '.env') -> None:\n",
" p = Path(env_path)\n",
" if not p.exists():\n",
" print(f'No {env_path} file found in {Path.cwd()}')\n",
" return\n",
"\n",
" loaded = 0\n",
" for raw_line in p.read_text(encoding='utf-8').splitlines():\n",
" line = raw_line.strip()\n",
" if not line or line.startswith('#') or '=' not in line:\n",
" continue\n",
" key, value = line.split('=', 1)\n",
" key = key.strip()\n",
" value = value.strip().strip('\"').strip(\"'\")\n",
" if key and key not in os.environ:\n",
" os.environ[key] = value\n",
" loaded += 1\n",
" print(f'Loaded {loaded} env var(s) from {env_path}')\n",
"\n",
"\n",
"def require_env(keys):\n",
" missing = [k for k in keys if not os.getenv(k)]\n",
" if missing:\n",
" raise EnvironmentError(\n",
" 'Missing required env vars in notebook kernel: ' + ', '.join(missing) +\n",
" '.\\nSet them in this notebook, or add them to a .env file in this folder.'\n",
" )\n",
"\n",
"\n",
"load_env_file('.env')\n",
"print('PGWEB_HOST:', os.getenv('PGWEB_HOST', '<not set>'))\n",
"print('PGWEB_PORT:', os.getenv('PGWEB_PORT', '<not set>'))\n",
"print('PGWEB_USER:', os.getenv('PGWEB_USER', '<not set>'))\n",
"\n",
"# Connection setup: mirrors postgis_table_loader.ipynb and the existing scripts in this repository.\n",
"required_keys = ['PGWEB_HOST', 'PGWEB_PORT', 'PGWEB_USER', 'PGWEB_PASSWORD']\n",
"require_env(required_keys)\n",
"\n",
"DB_NAME = os.getenv('PGDATABASE', 'data_centers')\n",
"MASTER_TABLE = 'public.master_data_centers'\n",
"CLUSTER_OUTPUT_TABLE = 'public.master_data_center_spatial_clusters'\n",
"\n",
"\n",
"def get_conn():\n",
" return psycopg2.connect(\n",
" host=os.environ['PGWEB_HOST'],\n",
" port=os.environ['PGWEB_PORT'],\n",
" user=os.environ['PGWEB_USER'],\n",
" password=os.environ['PGWEB_PASSWORD'],\n",
" dbname=\"data_centers\",\n",
" )\n",
"\n",
"\n",
"with get_conn() as conn:\n",
" with conn.cursor() as cur:\n",
" cur.execute('select current_database(), current_user, version()')\n",
" db, usr, ver = cur.fetchone()\n",
" print('Connected to DB:', db)\n",
" print('As user:', usr)\n",
" print('Postgres:', ver.split(',')[0])\n",
" cur.execute('create extension if not exists postgis')\n",
" print('PostGIS extension is available')\n",
" cur.execute('select to_regclass(%s)', (MASTER_TABLE,))\n",
" if cur.fetchone()[0] is None:\n",
" raise RuntimeError(f'{MASTER_TABLE} does not exist. Run build_master_data_centers.py first.')\n",
" cur.execute('select count(*) from public.master_data_centers')\n",
" print(f'{MASTER_TABLE} rows:', f'{cur.fetchone()[0]:,}')"
]
},
{
"cell_type": "markdown",
"id": "3",
"metadata": {},
"source": [
"## Analysis Parameters\n",
"\n",
"`EPS_KM` is the neighborhood radius for DBSCAN. `MIN_SAMPLES` is the minimum number of data centers needed in that radius to form a dense cluster. Treat these as research choices: the parameter sweep below helps show how stable the cluster structure is."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4",
"metadata": {},
"outputs": [],
"source": [
"EPS_KM = 25\n",
"MIN_SAMPLES = 5\n",
"PROJECTION_SRID = 5070 # NAD83 / Conus Albers; meters, good for contiguous US clustering.\n",
"\n",
"# Lower 48-ish bounding box. Set to False if you want Alaska, Hawaii, PR, etc. included.\n",
"FILTER_CONTIGUOUS_US = True\n",
"\n",
"SWEEP_EPS_KM = [2, 5, 10, 15, 25, 50, 100]\n",
"SWEEP_MIN_SAMPLES = [3, 5, 10]\n",
"\n",
"print(f'DBSCAN eps={EPS_KM} km, min_samples={MIN_SAMPLES}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5",
"metadata": {},
"outputs": [],
"source": [
"def master_points_where(alias: str = '') -> str:\n",
" prefix = f'{alias}.' if alias else ''\n",
" clauses = [\n",
" f'{prefix}longitude is not null',\n",
" f'{prefix}latitude is not null',\n",
" f'{prefix}geom is not null',\n",
" ]\n",
" if FILTER_CONTIGUOUS_US:\n",
" clauses.extend([\n",
" f'{prefix}longitude between -125 and -66',\n",
" f'{prefix}latitude between 24 and 50',\n",
" ])\n",
" return ' and '.join(clauses)\n",
"\n",
"\n",
"sql_query = f\"\"\"\n",
"select\n",
" master_id,\n",
" source,\n",
" curated_id,\n",
" osm_id,\n",
" name,\n",
" operator,\n",
" city,\n",
" state,\n",
" country,\n",
" power_mw,\n",
" area_sqft,\n",
" has_bare_metal,\n",
" has_iaas,\n",
" has_internet_exchange,\n",
" has_colocation,\n",
" longitude,\n",
" latitude,\n",
" ST_AsText(geom) as geom_wkt\n",
"from public.master_data_centers\n",
"where {master_points_where()}\n",
"\"\"\"\n",
"\n",
"with get_conn() as conn:\n",
" dc = pd.read_sql_query(sql_query, conn)\n",
"\n",
"dc['longitude'] = pd.to_numeric(dc['longitude'], errors='coerce')\n",
"dc['latitude'] = pd.to_numeric(dc['latitude'], errors='coerce')\n",
"dc = dc.dropna(subset=['longitude', 'latitude']).copy()\n",
"\n",
"dc['state'] = dc['state'].fillna('UNK').replace('', 'UNK')\n",
"dc['operator'] = dc['operator'].fillna('Unknown').replace('', 'Unknown')\n",
"dc['city'] = dc['city'].fillna('Unknown').replace('', 'Unknown')\n",
"\n",
"print(f'Loaded {len(dc):,} points for clustering')\n",
"display(dc.head())\n",
"display(dc['source'].value_counts(dropna=False).rename_axis('source').reset_index(name='rows'))"
]
},
{
"cell_type": "markdown",
"id": "6",
"metadata": {},
"source": [
"## Baseline Geography\n",
"\n",
"Before clustering, inspect where the observations are and how concentrated they already are by state/city/source."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7",
"metadata": {},
"outputs": [],
"source": [
"state_counts = (\n",
" dc.groupby('state', dropna=False)\n",
" .size()\n",
" .sort_values(ascending=False)\n",
" .rename('data_centers')\n",
" .reset_index()\n",
")\n",
"display(state_counts.head(25))\n",
"\n",
"city_counts = (\n",
" dc.groupby(['state', 'city'], dropna=False)\n",
" .size()\n",
" .sort_values(ascending=False)\n",
" .rename('data_centers')\n",
" .reset_index()\n",
")\n",
"display(city_counts.head(25))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8",
"metadata": {},
"outputs": [],
"source": [
"if HAS_MATPLOTLIB:\n",
" fig, ax = plt.subplots(figsize=(11, 7))\n",
" for source, group in dc.groupby('source'):\n",
" ax.scatter(group['longitude'], group['latitude'], s=12, alpha=0.65, label=source)\n",
" ax.set_title('Master Data Centers by Source')\n",
" ax.set_xlabel('Longitude')\n",
" ax.set_ylabel('Latitude')\n",
" ax.legend(title='source')\n",
" ax.grid(True, linewidth=0.3, alpha=0.4)\n",
" plt.show()\n",
"else:\n",
" print('matplotlib is not installed; skipping static source scatter plot.')"
]
},
{
"cell_type": "markdown",
"id": "9",
"metadata": {},
"source": [
"## Nearest-Neighbor Distance Diagnostics\n",
"\n",
"Nearest-neighbor distances are useful for choosing DBSCAN parameters. If many points have a nearest neighbor inside 5-25 km, those radii will detect metro-scale clusters; larger radii begin to merge nearby metros and corridors. Distances are computed in PostGIS using geography meters."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10",
"metadata": {},
"outputs": [],
"source": [
"nearest_neighbor_sql = f\"\"\"\n",
"with points as (\n",
" select\n",
" master_id,\n",
" geom,\n",
" ST_Transform(geom, %s) as geom_m\n",
" from public.master_data_centers\n",
" where {master_points_where()}\n",
")\n",
"select\n",
" p.master_id,\n",
" q.master_id as nearest_neighbor_id,\n",
" ST_Distance(p.geom::geography, q.geom::geography) / 1000.0 as nearest_neighbor_km\n",
"from points p\n",
"left join lateral (\n",
" select q.master_id, q.geom\n",
" from points q\n",
" where q.master_id <> p.master_id\n",
" order by p.geom_m <-> q.geom_m\n",
" limit 1\n",
") q on true\n",
"\"\"\"\n",
"\n",
"with get_conn() as conn:\n",
" nn = pd.read_sql_query(nearest_neighbor_sql, conn, params=(PROJECTION_SRID,))\n",
"\n",
"dc = dc.drop(columns=[c for c in ['nearest_neighbor_id', 'nearest_neighbor_km'] if c in dc.columns])\n",
"dc = dc.merge(nn, on='master_id', how='left')\n",
"\n",
"display(dc['nearest_neighbor_km'].describe(percentiles=[.05, .10, .25, .50, .75, .90, .95, .99]).to_frame())\n",
"\n",
"thresholds = [1, 2, 5, 10, 25, 50, 100]\n",
"nn_thresholds = pd.DataFrame({\n",
" 'within_km': thresholds,\n",
" 'share_of_points': [(dc['nearest_neighbor_km'] <= km).mean() for km in thresholds],\n",
" 'point_count': [(dc['nearest_neighbor_km'] <= km).sum() for km in thresholds],\n",
"})\n",
"display(nn_thresholds)\n",
"\n",
"if HAS_MATPLOTLIB:\n",
" fig, ax = plt.subplots(figsize=(9, 5))\n",
" dc['nearest_neighbor_km'].clip(upper=200).hist(bins=50, ax=ax)\n",
" ax.axvline(EPS_KM, color='crimson', linewidth=2, label=f'EPS_KM={EPS_KM}')\n",
" ax.set_title('Nearest-Neighbor Distance Distribution (clipped at 200 km)')\n",
" ax.set_xlabel('Nearest neighbor distance, km')\n",
" ax.set_ylabel('Data centers')\n",
" ax.legend()\n",
" plt.show()\n",
"else:\n",
" print('matplotlib is not installed; skipping nearest-neighbor histogram.')"
]
},
{
"cell_type": "markdown",
"id": "11",
"metadata": {},
"source": [
"## DBSCAN Clustering\n",
"\n",
"DBSCAN is a good first-pass method here because it finds irregularly shaped dense groups and labels isolated points as noise. The implementation below uses PostGIS `ST_ClusterDBSCAN` on `ST_Transform(geom, 5070)`, so `eps` is specified in meters after projecting to a continental-US equal-area CRS."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12",
"metadata": {},
"outputs": [],
"source": [
"def load_dbscan_labels(eps_km: float, min_samples: int) -> pd.DataFrame:\n",
" cluster_sql = f\"\"\"\n",
" with points as (\n",
" select\n",
" master_id,\n",
" ST_Transform(geom, %s) as geom_m\n",
" from public.master_data_centers\n",
" where {master_points_where()}\n",
" ), clustered as (\n",
" select\n",
" master_id,\n",
" ST_ClusterDBSCAN(geom_m, %s, %s) over (order by master_id) as raw_cluster_id\n",
" from points\n",
" )\n",
" select\n",
" master_id,\n",
" coalesce(raw_cluster_id, -1) as cluster_id\n",
" from clustered\n",
" \"\"\"\n",
" with get_conn() as conn:\n",
" labels = pd.read_sql_query(\n",
" cluster_sql,\n",
" conn,\n",
" params=(PROJECTION_SRID, eps_km * 1000.0, min_samples),\n",
" )\n",
" labels['cluster_id'] = labels['cluster_id'].astype(int)\n",
" return labels\n",
"\n",
"\n",
"labels = load_dbscan_labels(EPS_KM, MIN_SAMPLES)\n",
"dc = dc.drop(columns=[c for c in ['cluster_id', 'is_noise'] if c in dc.columns])\n",
"dc = dc.merge(labels, on='master_id', how='left')\n",
"dc['cluster_id'] = dc['cluster_id'].fillna(-1).astype(int)\n",
"dc['is_noise'] = dc['cluster_id'].eq(-1)\n",
"\n",
"n_clusters = dc.loc[~dc['is_noise'], 'cluster_id'].nunique()\n",
"n_noise = int(dc['is_noise'].sum())\n",
"print(f'Clusters found: {n_clusters:,}')\n",
"print(f'Noise / isolated points: {n_noise:,} ({n_noise / len(dc):.1%})')\n",
"display(dc['cluster_id'].value_counts().head(20).rename_axis('cluster_id').reset_index(name='points'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "13",
"metadata": {},
"outputs": [],
"source": [
"def haversine_km(lat1, lon1, lat2, lon2):\n",
" lat1 = np.radians(lat1)\n",
" lon1 = np.radians(lon1)\n",
" lat2 = np.radians(lat2)\n",
" lon2 = np.radians(lon2)\n",
" dlat = lat2 - lat1\n",
" dlon = lon2 - lon1\n",
" a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2\n",
" return 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(a))\n",
"\n",
"\n",
"def mode_list(values, n=5):\n",
" s = pd.Series(values).dropna().astype(str)\n",
" s = s[s.ne('')]\n",
" if s.empty:\n",
" return ''\n",
" return '; '.join(f'{k} ({v})' for k, v in s.value_counts().head(n).items())\n",
"\n",
"\n",
"cluster_summary_columns = [\n",
" 'cluster_id', 'point_count', 'centroid_latitude', 'centroid_longitude',\n",
" 'radius_km_p50', 'radius_km_p90', 'radius_km_max', 'states', 'cities',\n",
" 'sources', 'operators', 'total_power_mw_known', 'known_power_rows',\n",
" 'total_area_sqft_known', 'known_area_rows',\n",
"]\n",
"\n",
"cluster_rows = []\n",
"for cid, group in dc.loc[~dc['is_noise']].groupby('cluster_id'):\n",
" centroid_lat = group['latitude'].mean()\n",
" centroid_lon = group['longitude'].mean()\n",
" distances = haversine_km(group['latitude'], group['longitude'], centroid_lat, centroid_lon)\n",
" cluster_rows.append({\n",
" 'cluster_id': int(cid),\n",
" 'point_count': len(group),\n",
" 'centroid_latitude': centroid_lat,\n",
" 'centroid_longitude': centroid_lon,\n",
" 'radius_km_p50': float(np.percentile(distances, 50)),\n",
" 'radius_km_p90': float(np.percentile(distances, 90)),\n",
" 'radius_km_max': float(np.max(distances)),\n",
" 'states': mode_list(group['state'], n=8),\n",
" 'cities': mode_list(group['city'], n=8),\n",
" 'sources': mode_list(group['source'], n=4),\n",
" 'operators': mode_list(group['operator'], n=8),\n",
" 'total_power_mw_known': group['power_mw'].dropna().sum(),\n",
" 'known_power_rows': group['power_mw'].notna().sum(),\n",
" 'total_area_sqft_known': group['area_sqft'].dropna().sum(),\n",
" 'known_area_rows': group['area_sqft'].notna().sum(),\n",
" })\n",
"\n",
"if cluster_rows:\n",
" clusters = pd.DataFrame(cluster_rows).sort_values(['point_count', 'radius_km_p90'], ascending=[False, True])\n",
"else:\n",
" clusters = pd.DataFrame(columns=cluster_summary_columns)\n",
"\n",
"display(clusters.head(30))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "14",
"metadata": {},
"outputs": [],
"source": [
"if HAS_MATPLOTLIB:\n",
" fig, ax = plt.subplots(figsize=(12, 8))\n",
" noise = dc['is_noise']\n",
" ax.scatter(dc.loc[noise, 'longitude'], dc.loc[noise, 'latitude'], s=8, c='lightgray', alpha=0.55, label='noise')\n",
"\n",
" clustered = dc.loc[~noise].copy()\n",
" scatter = ax.scatter(\n",
" clustered['longitude'],\n",
" clustered['latitude'],\n",
" c=clustered['cluster_id'],\n",
" cmap='tab20',\n",
" s=18,\n",
" alpha=0.85,\n",
" label='clustered',\n",
" )\n",
" ax.set_title(f'DBSCAN Data-Center Clusters (eps={EPS_KM} km, min_samples={MIN_SAMPLES})')\n",
" ax.set_xlabel('Longitude')\n",
" ax.set_ylabel('Latitude')\n",
" ax.grid(True, linewidth=0.3, alpha=0.4)\n",
" plt.colorbar(scatter, ax=ax, label='cluster_id')\n",
" plt.show()\n",
"else:\n",
" print('matplotlib is not installed; skipping static cluster scatter plot.')"
]
},
{
"cell_type": "markdown",
"id": "15",
"metadata": {},
"source": [
"## Parameter Sensitivity\n",
"\n",
"The sweep below asks whether the cluster story depends heavily on a single `eps` or `min_samples` choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16",
"metadata": {},
"outputs": [],
"source": [
"def dbscan_summary(eps_km: float, min_samples: int) -> dict:\n",
" summary_sql = f\"\"\"\n",
" with points as (\n",
" select master_id, ST_Transform(geom, %s) as geom_m\n",
" from public.master_data_centers\n",
" where {master_points_where()}\n",
" ), labels as (\n",
" select ST_ClusterDBSCAN(geom_m, %s, %s) over (order by master_id) as cid\n",
" from points\n",
" ), cluster_sizes as (\n",
" select cid, count(*)::integer as cluster_size\n",
" from labels\n",
" where cid is not null\n",
" group by cid\n",
" )\n",
" select\n",
" (select count(*)::integer from labels) as points,\n",
" (select count(distinct cid)::integer from labels where cid is not null) as clusters,\n",
" (select count(*)::integer from labels where cid is not null) as clustered_points,\n",
" (select count(*)::integer from labels where cid is null) as noise_points,\n",
" coalesce((select max(cluster_size)::integer from cluster_sizes), 0) as largest_cluster,\n",
" coalesce((select percentile_cont(0.5) within group (order by cluster_size) from cluster_sizes), 0) as median_cluster_size\n",
" \"\"\"\n",
" with get_conn() as conn:\n",
" row = pd.read_sql_query(\n",
" summary_sql,\n",
" conn,\n",
" params=(PROJECTION_SRID, eps_km * 1000.0, min_samples),\n",
" ).iloc[0].to_dict()\n",
" row['eps_km'] = eps_km\n",
" row['min_samples'] = min_samples\n",
" row['clustered_share'] = row['clustered_points'] / row['points'] if row['points'] else 0\n",
" return row\n",
"\n",
"\n",
"sweep_rows = []\n",
"for eps in SWEEP_EPS_KM:\n",
" for min_samples in SWEEP_MIN_SAMPLES:\n",
" sweep_rows.append(dbscan_summary(eps, min_samples))\n",
"\n",
"sweep = pd.DataFrame(sweep_rows)\n",
"display(sweep)\n",
"\n",
"if HAS_MATPLOTLIB:\n",
" fig, axes = plt.subplots(1, 2, figsize=(13, 5), sharex=True)\n",
" for min_samples, group in sweep.groupby('min_samples'):\n",
" axes[0].plot(group['eps_km'], group['clusters'], marker='o', label=f'min={min_samples}')\n",
" axes[1].plot(group['eps_km'], group['clustered_share'], marker='o', label=f'min={min_samples}')\n",
" axes[0].set_title('Cluster Count by eps')\n",
" axes[0].set_ylabel('clusters')\n",
" axes[1].set_title('Share of Points Clustered by eps')\n",
" axes[1].set_ylabel('clustered share')\n",
" for ax in axes:\n",
" ax.set_xlabel('eps, km')\n",
" ax.grid(True, linewidth=0.3, alpha=0.4)\n",
" ax.legend()\n",
" plt.show()\n",
"else:\n",
" print('matplotlib is not installed; skipping parameter-sweep line charts.')"
]
},
{
"cell_type": "markdown",
"id": "17",
"metadata": {},
"source": [
"## Cluster Profiles\n",
"\n",
"Use these tables to interpret clusters as local agglomerations, metro regions, corridors, or source-specific artifacts."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18",
"metadata": {},
"outputs": [],
"source": [
"if clusters.empty:\n",
" cluster_source = pd.DataFrame(columns=['cluster_id', 'point_count'])\n",
"else:\n",
" cluster_source = (\n",
" dc.loc[~dc['is_noise']]\n",
" .pivot_table(index='cluster_id', columns='source', values='master_id', aggfunc='count', fill_value=0)\n",
" .reset_index()\n",
" )\n",
" cluster_source = clusters[['cluster_id', 'point_count']].merge(cluster_source, on='cluster_id', how='left')\n",
"display(cluster_source.head(30))\n",
"\n",
"cluster_state = (\n",
" dc.loc[~dc['is_noise']]\n",
" .groupby(['cluster_id', 'state'])\n",
" .size()\n",
" .rename('points')\n",
" .reset_index()\n",
" .sort_values(['cluster_id', 'points'], ascending=[True, False])\n",
")\n",
"display(cluster_state.head(80))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19",
"metadata": {},
"outputs": [],
"source": [
"# Inspect the largest cluster in detail. Change CLUSTER_TO_INSPECT as needed.\n",
"CLUSTER_TO_INSPECT = int(clusters.iloc[0]['cluster_id']) if len(clusters) else None\n",
"\n",
"if CLUSTER_TO_INSPECT is not None:\n",
" cols = [\n",
" 'master_id', 'name', 'operator', 'city', 'state', 'source',\n",
" 'longitude', 'latitude', 'power_mw', 'area_sqft', 'nearest_neighbor_km'\n",
" ]\n",
" detail = (\n",
" dc.loc[dc['cluster_id'].eq(CLUSTER_TO_INSPECT), cols]\n",
" .sort_values(['state', 'city', 'operator', 'name'], na_position='last')\n",
" )\n",
" print(f'Cluster {CLUSTER_TO_INSPECT}: {len(detail):,} points')\n",
" display(detail)\n",
"else:\n",
" print('No clusters found with current parameters.')"
]
},
{
"cell_type": "markdown",
"id": "20",
"metadata": {},
"source": [
"## Statistical Analysis\n",
"\n",
"This section treats the DBSCAN result as an analytical object: how concentrated are data centers across clusters, how different are clustered points from isolated points, and are some sources/states disproportionately represented in dense clusters?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "21",
"metadata": {},
"outputs": [],
"source": [
"# Helper functions for concentration metrics and lightweight statistical tests.\n",
"def gini(values):\n",
" x = pd.Series(values).dropna().astype(float)\n",
" x = x[x >= 0].sort_values().to_numpy()\n",
" if len(x) == 0 or x.sum() == 0:\n",
" return np.nan\n",
" n = len(x)\n",
" index = np.arange(1, n + 1)\n",
" return float((2 * np.sum(index * x)) / (n * np.sum(x)) - (n + 1) / n)\n",
"\n",
"\n",
"def hhi_from_counts(values):\n",
" x = pd.Series(values).dropna().astype(float)\n",
" total = x.sum()\n",
" if total == 0:\n",
" return np.nan\n",
" shares = x / total\n",
" return float(np.sum(shares ** 2))\n",
"\n",
"\n",
"def normalized_entropy_from_counts(values):\n",
" x = pd.Series(values).dropna().astype(float)\n",
" x = x[x > 0]\n",
" if len(x) <= 1:\n",
" return 0.0 if len(x) == 1 else np.nan\n",
" shares = x / x.sum()\n",
" return float(-(shares * np.log(shares)).sum() / np.log(len(shares)))\n",
"\n",
"\n",
"def numeric_summary_by_group(frame, group_col, value_cols):\n",
" rows = []\n",
" for col in value_cols:\n",
" for group, vals in frame.groupby(group_col)[col]:\n",
" vals = pd.to_numeric(vals, errors='coerce').dropna()\n",
" rows.append({\n",
" 'variable': col,\n",
" group_col: group,\n",
" 'n': int(vals.size),\n",
" 'mean': float(vals.mean()) if vals.size else np.nan,\n",
" 'median': float(vals.median()) if vals.size else np.nan,\n",
" 'p25': float(vals.quantile(0.25)) if vals.size else np.nan,\n",
" 'p75': float(vals.quantile(0.75)) if vals.size else np.nan,\n",
" 'p90': float(vals.quantile(0.90)) if vals.size else np.nan,\n",
" })\n",
" return pd.DataFrame(rows)\n",
"\n",
"\n",
"def permutation_mean_diff(frame, value_col, group_col='is_clustered', reps=5000, seed=42):\n",
" work = frame[[value_col, group_col]].copy()\n",
" work[value_col] = pd.to_numeric(work[value_col], errors='coerce')\n",
" work = work.dropna()\n",
" if work[group_col].nunique() != 2:\n",
" return {'variable': value_col, 'observed_mean_diff': np.nan, 'p_two_sided': np.nan, 'reps': reps, 'n': len(work)}\n",
"\n",
" y = work[value_col].to_numpy(dtype=float)\n",
" g = work[group_col].to_numpy(dtype=bool)\n",
" observed = y[g].mean() - y[~g].mean()\n",
" rng = np.random.default_rng(seed)\n",
" more_extreme = 0\n",
" for _ in range(reps):\n",
" gp = rng.permutation(g)\n",
" diff = y[gp].mean() - y[~gp].mean()\n",
" if abs(diff) >= abs(observed):\n",
" more_extreme += 1\n",
" p = (more_extreme + 1) / (reps + 1)\n",
" return {'variable': value_col, 'observed_mean_diff': float(observed), 'p_two_sided': float(p), 'reps': reps, 'n': len(work)}\n",
"\n",
"\n",
"def chi_square_independence(table):\n",
" observed = table.to_numpy(dtype=float)\n",
" total = observed.sum()\n",
" if observed.size == 0 or total == 0:\n",
" return {'chi2': np.nan, 'dof': np.nan, 'p_value': np.nan, 'cramers_v': np.nan}\n",
" row_totals = observed.sum(axis=1, keepdims=True)\n",
" col_totals = observed.sum(axis=0, keepdims=True)\n",
" expected = row_totals @ col_totals / total\n",
" mask = expected > 0\n",
" chi2 = float(((observed - expected) ** 2 / np.where(mask, expected, 1)).sum())\n",
" dof = int((observed.shape[0] - 1) * (observed.shape[1] - 1))\n",
" denom = total * max(1, min(observed.shape[0] - 1, observed.shape[1] - 1))\n",
" cramers_v = float(np.sqrt(chi2 / denom)) if denom else np.nan\n",
"\n",
" p_value = np.nan\n",
" try:\n",
" from scipy.stats import chi2 as scipy_chi2\n",
" p_value = float(scipy_chi2.sf(chi2, dof))\n",
" except ImportError:\n",
" pass\n",
" return {'chi2': chi2, 'dof': dof, 'p_value': p_value, 'cramers_v': cramers_v}\n",
"\n",
"\n",
"def categorical_permutation_chi_square(frame, category_col, group_col='is_clustered', reps=5000, seed=42):\n",
" work = frame[[category_col, group_col]].dropna().copy()\n",
" if work.empty or work[category_col].nunique() < 2 or work[group_col].nunique() < 2:\n",
" return {'permutation_p_value': np.nan, 'permutation_reps': reps}\n",
"\n",
" categories = pd.Index(sorted(work[category_col].astype(str).unique()))\n",
" groups = pd.Index(sorted(work[group_col].unique()))\n",
" observed_table = pd.crosstab(work[category_col].astype(str), work[group_col]).reindex(\n",
" index=categories, columns=groups, fill_value=0\n",
" )\n",
" observed_chi2 = chi_square_independence(observed_table)['chi2']\n",
"\n",
" rng = np.random.default_rng(seed)\n",
" labels = work[group_col].to_numpy()\n",
" cats = work[category_col].astype(str).to_numpy()\n",
" more_extreme = 0\n",
" for _ in range(reps):\n",
" permuted = rng.permutation(labels)\n",
" table = pd.crosstab(cats, permuted).reindex(index=categories, columns=groups, fill_value=0)\n",
" stat = chi_square_independence(table)['chi2']\n",
" if stat >= observed_chi2:\n",
" more_extreme += 1\n",
" return {'permutation_p_value': float((more_extreme + 1) / (reps + 1)), 'permutation_reps': reps}\n",
"\n",
"\n",
"stats_dir = Path('output')\n",
"stats_dir.mkdir(exist_ok=True)\n",
"dc['is_clustered'] = ~dc['is_noise']\n",
"print(f'Clustered points: {dc[\"is_clustered\"].sum():,} / {len(dc):,} ({dc[\"is_clustered\"].mean():.1%})')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22",
"metadata": {},
"outputs": [],
"source": [
"# Overall concentration of data centers across DBSCAN clusters.\n",
"cluster_sizes = dc.loc[dc['is_clustered'], 'cluster_id'].value_counts().sort_values(ascending=False)\n",
"total_points = len(dc)\n",
"total_clustered = int(cluster_sizes.sum())\n",
"\n",
"stat_rows = [\n",
" {'metric': 'total_points', 'value': total_points},\n",
" {'metric': 'clustered_points', 'value': total_clustered},\n",
" {'metric': 'noise_points', 'value': int((~dc['is_clustered']).sum())},\n",
" {'metric': 'clustered_share', 'value': dc['is_clustered'].mean()},\n",
" {'metric': 'cluster_count', 'value': int(cluster_sizes.size)},\n",
" {'metric': 'largest_cluster_points', 'value': int(cluster_sizes.iloc[0]) if len(cluster_sizes) else 0},\n",
" {'metric': 'largest_cluster_share_of_all_points', 'value': float(cluster_sizes.iloc[0] / total_points) if len(cluster_sizes) else 0},\n",
" {'metric': 'largest_cluster_share_of_clustered_points', 'value': float(cluster_sizes.iloc[0] / total_clustered) if total_clustered else 0},\n",
" {'metric': 'top_5_clusters_share_of_clustered_points', 'value': float(cluster_sizes.head(5).sum() / total_clustered) if total_clustered else 0},\n",
" {'metric': 'cluster_size_gini', 'value': gini(cluster_sizes)},\n",
" {'metric': 'cluster_size_hhi', 'value': hhi_from_counts(cluster_sizes)},\n",
" {'metric': 'cluster_size_normalized_entropy', 'value': normalized_entropy_from_counts(cluster_sizes)},\n",
"]\n",
"cluster_concentration_stats = pd.DataFrame(stat_rows)\n",
"display(cluster_concentration_stats)\n",
"\n",
"cluster_concentration_stats.to_csv(stats_dir / 'master_data_center_cluster_concentration_stats.csv', index=False)\n",
"print('Wrote:', stats_dir / 'master_data_center_cluster_concentration_stats.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "23",
"metadata": {},
"outputs": [],
"source": [
"# Compare clustered vs. isolated/noise points.\n",
"numeric_cols = ['nearest_neighbor_km', 'area_sqft', 'power_mw']\n",
"clustered_vs_noise_numeric = numeric_summary_by_group(dc, 'is_clustered', numeric_cols)\n",
"display(clustered_vs_noise_numeric)\n",
"\n",
"permutation_tests = pd.DataFrame([\n",
" permutation_mean_diff(dc, 'nearest_neighbor_km'),\n",
" permutation_mean_diff(dc, 'area_sqft'),\n",
" permutation_mean_diff(dc, 'power_mw'),\n",
"])\n",
"display(permutation_tests)\n",
"\n",
"clustered_vs_noise_numeric.to_csv(stats_dir / 'master_data_center_clustered_vs_noise_numeric_summary.csv', index=False)\n",
"permutation_tests.to_csv(stats_dir / 'master_data_center_clustered_vs_noise_permutation_tests.csv', index=False)\n",
"print('Wrote:', stats_dir / 'master_data_center_clustered_vs_noise_numeric_summary.csv')\n",
"print('Wrote:', stats_dir / 'master_data_center_clustered_vs_noise_permutation_tests.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24",
"metadata": {},
"outputs": [],
"source": [
"# Source enrichment: are curated/merged/OSM rows clustered at different rates?\n",
"source_table = pd.crosstab(dc['source'], dc['is_clustered'])\n",
"source_table.columns = ['noise_or_isolated', 'clustered'] if source_table.shape[1] == 2 else source_table.columns\n",
"source_rates = source_table.copy()\n",
"source_rates['total'] = source_rates.sum(axis=1)\n",
"source_rates['clustered_share'] = source_rates.get('clustered', 0) / source_rates['total']\n",
"source_rates = source_rates.sort_values('clustered_share', ascending=False)\n",
"\n",
"test = chi_square_independence(source_table)\n",
"perm_test = categorical_permutation_chi_square(dc, 'source', 'is_clustered')\n",
"source_test = pd.DataFrame([{'test': 'source_by_clustered', **test, **perm_test}])\n",
"\n",
"display(source_rates)\n",
"display(source_test)\n",
"\n",
"source_rates.reset_index().to_csv(stats_dir / 'master_data_center_source_cluster_enrichment.csv', index=False)\n",
"source_test.to_csv(stats_dir / 'master_data_center_source_cluster_chi_square.csv', index=False)\n",
"print('Wrote:', stats_dir / 'master_data_center_source_cluster_enrichment.csv')\n",
"print('Wrote:', stats_dir / 'master_data_center_source_cluster_chi_square.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "25",
"metadata": {},
"outputs": [],
"source": [
"# State-level clustering profile.\n",
"state_cluster_stats = (\n",
" dc.groupby('state')\n",
" .agg(\n",
" points=('master_id', 'count'),\n",
" clustered_points=('is_clustered', 'sum'),\n",
" clusters=('cluster_id', lambda s: s[s.ne(-1)].nunique()),\n",
" median_nearest_neighbor_km=('nearest_neighbor_km', 'median'),\n",
" mean_nearest_neighbor_km=('nearest_neighbor_km', 'mean'),\n",
" known_power_rows=('power_mw', lambda s: s.notna().sum()),\n",
" total_power_mw_known=('power_mw', 'sum'),\n",
" known_area_rows=('area_sqft', lambda s: s.notna().sum()),\n",
" total_area_sqft_known=('area_sqft', 'sum'),\n",
" )\n",
" .reset_index()\n",
")\n",
"state_cluster_stats['clustered_share'] = state_cluster_stats['clustered_points'] / state_cluster_stats['points']\n",
"state_cluster_stats = state_cluster_stats.sort_values(['points', 'clustered_share'], ascending=[False, False])\n",
"display(state_cluster_stats.head(30))\n",
"\n",
"state_cluster_stats.to_csv(stats_dir / 'master_data_center_state_cluster_statistics.csv', index=False)\n",
"print('Wrote:', stats_dir / 'master_data_center_state_cluster_statistics.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26",
"metadata": {},
"outputs": [],
"source": [
"# Optional plots for the statistical summaries.\n",
"if HAS_MATPLOTLIB:\n",
" fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
" cluster_sizes.head(25).sort_values().plot(kind='barh', ax=axes[0])\n",
" axes[0].set_title('Largest DBSCAN Clusters')\n",
" axes[0].set_xlabel('data centers')\n",
" axes[0].set_ylabel('cluster_id')\n",
"\n",
" dc.boxplot(column='nearest_neighbor_km', by='is_clustered', ax=axes[1], showfliers=False)\n",
" axes[1].set_title('Nearest-Neighbor Distance by Cluster Status')\n",
" axes[1].set_xlabel('is_clustered')\n",
" axes[1].set_ylabel('km')\n",
" fig.suptitle('')\n",
" plt.tight_layout()\n",
" plt.show()\n",
"\n",
" top_states = state_cluster_stats.head(20).sort_values('points')\n",
" fig, ax = plt.subplots(figsize=(9, 7))\n",
" ax.barh(top_states['state'], top_states['points'], label='all points', color='#9ca3af')\n",
" ax.barh(top_states['state'], top_states['clustered_points'], label='clustered points', color='#2563eb')\n",
" ax.set_title('Clustered Data Centers by State')\n",
" ax.set_xlabel('data centers')\n",
" ax.legend()\n",
" plt.show()\n",
"else:\n",
" print('matplotlib is not installed; skipping statistical plots.')"
]
},
{
"cell_type": "markdown",
"id": "27",
"metadata": {},
"source": [
"## Optional Interactive Map\n",
"\n",
"If `folium` is installed, this cell renders an interactive map. Large datasets can be slow, so the popup text is intentionally compact."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "28",
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" import folium\n",
" from html import escape\n",
"\n",
" if 'dc' not in globals():\n",
" dc = pd.read_csv('output/master_data_center_spatial_cluster_points.csv')\n",
" if 'clusters' not in globals():\n",
" clusters = pd.read_csv('output/master_data_center_spatial_cluster_summary.csv')\n",
" if 'n_clusters' not in globals():\n",
" n_clusters = dc.loc[dc['cluster_id'].ne(-1), 'cluster_id'].nunique()\n",
" if 'is_noise' not in dc.columns:\n",
" dc['is_noise'] = dc['cluster_id'].eq(-1)\n",
"\n",
" m = folium.Map(location=[39, -98], zoom_start=4, tiles='CartoDB positron')\n",
"\n",
" cluster_info = {}\n",
" if not clusters.empty:\n",
" ranked_clusters = clusters.reset_index(drop=True).copy()\n",
" ranked_clusters['cluster_rank'] = ranked_clusters.index + 1\n",
" cluster_info = ranked_clusters.set_index('cluster_id').to_dict('index')\n",
"\n",
" cluster_colors = [\n",
" '#2563eb', '#dc2626', '#16a34a', '#9333ea', '#ea580c', '#0891b2',\n",
" '#be123c', '#4f46e5', '#65a30d', '#c026d3', '#0f766e', '#b45309',\n",
" ]\n",
"\n",
" def clean_popup_value(value):\n",
" if pd.isna(value):\n",
" return ''\n",
" return escape(str(value))\n",
"\n",
" for row in dc.itertuples(index=False):\n",
" info = cluster_info.get(row.cluster_id, {})\n",
" if row.cluster_id == -1:\n",
" color = '#9ca3af'\n",
" radius = 3\n",
" cluster_label = 'Noise / isolated'\n",
" cluster_size = '1'\n",
" cluster_rank = ''\n",
" else:\n",
" rank = int(info.get('cluster_rank', row.cluster_id + 1))\n",
" color = cluster_colors[(rank - 1) % len(cluster_colors)]\n",
" radius = 5\n",
" cluster_label = f'Cluster ID {row.cluster_id}'\n",
" cluster_size = f\"{int(info.get('point_count', 0)):,}\"\n",
" cluster_rank = f\"Rank {rank} of {n_clusters} by size\"\n",
"\n",
" popup_html = f'''\n",
" <div style=\"font-family: system-ui, sans-serif; min-width: 260px;\">\n",
" <strong>{clean_popup_value(row.name) or clean_popup_value(row.master_id)}</strong><br>\n",
" {clean_popup_value(row.city)}, {clean_popup_value(row.state)}<br>\n",
" <hr style=\"margin: 6px 0;\">\n",
" <strong>{cluster_label}</strong><br>\n",
" {cluster_rank}<br>\n",
" Cluster size: {cluster_size} data center(s)<br>\n",
" Source: {clean_popup_value(row.source)}<br>\n",
" Operator: {clean_popup_value(row.operator)}<br>\n",
" Nearest neighbor: {row.nearest_neighbor_km:.2f} km\n",
" </div>\n",
" '''\n",
" popup = folium.Popup(popup_html, max_width=360)\n",
"\n",
" folium.CircleMarker(\n",
" location=[row.latitude, row.longitude],\n",
" radius=radius,\n",
" color=color,\n",
" fill=True,\n",
" fill_opacity=0.75,\n",
" weight=1,\n",
" popup=popup,\n",
" tooltip=f'{cluster_label}; size={cluster_size}',\n",
" ).add_to(m)\n",
" display(m)\n",
"except ImportError:\n",
" print('folium is not installed; skipping interactive map.')\n"
]
},
{
"cell_type": "markdown",
"id": "29",
"metadata": {},
"source": [
"## Export Results\n",
"\n",
"The first cell writes local CSVs. The second cell can write a compact cluster-label table back to PostGIS. Keep `WRITE_BACK_TO_DB = False` until you are happy with the selected parameters."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30",
"metadata": {},
"outputs": [],
"source": [
"points_out = Path('output/master_data_center_spatial_cluster_points.csv')\n",
"clusters_out = Path('output/master_data_center_spatial_cluster_summary.csv')\n",
"\n",
"dc.to_csv(points_out, index=False)\n",
"clusters.to_csv(clusters_out, index=False)\n",
"\n",
"print('Wrote:', points_out)\n",
"print('Wrote:', clusters_out)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "31",
"metadata": {},
"outputs": [],
"source": [
"WRITE_BACK_TO_DB = False\n",
"\n",
"if WRITE_BACK_TO_DB:\n",
" rows = [\n",
" (\n",
" r.master_id,\n",
" int(r.cluster_id),\n",
" bool(r.is_noise),\n",
" float(EPS_KM),\n",
" int(MIN_SAMPLES),\n",
" float(r.nearest_neighbor_km) if pd.notna(r.nearest_neighbor_km) else None,\n",
" )\n",
" for r in dc.itertuples(index=False)\n",
" ]\n",
"\n",
" with get_conn() as conn:\n",
" with conn.cursor() as cur:\n",
" cur.execute(sql.SQL('drop table if exists {}').format(sql.SQL(CLUSTER_OUTPUT_TABLE)))\n",
" cur.execute(sql.SQL('''\n",
" create table {} (\n",
" master_id text primary key references public.master_data_centers(master_id),\n",
" cluster_id integer not null,\n",
" is_noise boolean not null,\n",
" eps_km double precision not null,\n",
" min_samples integer not null,\n",
" nearest_neighbor_km double precision,\n",
" created_at timestamptz not null default now()\n",
" )\n",
" ''').format(sql.SQL(CLUSTER_OUTPUT_TABLE)))\n",
" execute_values(\n",
" cur,\n",
" sql.SQL('''\n",
" insert into {} (\n",
" master_id, cluster_id, is_noise, eps_km, min_samples, nearest_neighbor_km\n",
" ) values %s\n",
" ''').format(sql.SQL(CLUSTER_OUTPUT_TABLE)).as_string(conn),\n",
" rows,\n",
" page_size=5000,\n",
" )\n",
" cur.execute(sql.SQL('create index on {} (cluster_id)').format(sql.SQL(CLUSTER_OUTPUT_TABLE)))\n",
" cur.execute(sql.SQL('analyze {}').format(sql.SQL(CLUSTER_OUTPUT_TABLE)))\n",
" print(f'Wrote {len(rows):,} rows to {CLUSTER_OUTPUT_TABLE}')\n",
"else:\n",
" print('WRITE_BACK_TO_DB is False; no database table was modified.')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}