{ "cells": [ { "cell_type": "markdown", "id": "0", "metadata": {}, "source": [ "# Spatial Clustering of Master Data Centers\n", "\n", "This notebook analyzes spatial clustering in `public.master_data_centers`, the merged curated + OSM table built by `build_master_data_centers.py`.\n", "\n", "Main workflow:\n", "- Load data-center points from PostGIS.\n", "- Diagnose nearest-neighbor distances and spatial concentration.\n", "- Run DBSCAN with haversine distances so distances are in kilometers rather than degrees.\n", "- Sweep clustering parameters to check sensitivity.\n", "- Summarize clusters by geography, source, operator, and scale.\n", "- Optionally write cluster labels back to PostGIS for downstream mapping or joins.\n", "\n", "Expected environment variables: `PGWEB_HOST`, `PGWEB_PORT`, `PGWEB_USER`, `PGWEB_PASSWORD`. Optional: `PGDATABASE`, which defaults to `data_centers`." ] }, { "cell_type": "code", "execution_count": null, "id": "1", "metadata": {}, "outputs": [], "source": [ "import os\n", "from pathlib import Path\n", "os.environ.setdefault('MPLCONFIGDIR', '/tmp/matplotlib')\n", "Path(os.environ['MPLCONFIGDIR']).mkdir(parents=True, exist_ok=True)\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import psycopg2\n", "from psycopg2 import sql\n", "from psycopg2.extras import execute_values\n", "\n", "try:\n", " import matplotlib.pyplot as plt\n", " HAS_MATPLOTLIB = True\n", "except ImportError:\n", " plt = None\n", " HAS_MATPLOTLIB = False\n", "\n", "pd.set_option('display.max_columns', 80)\n", "pd.set_option('display.max_rows', 120)\n", "if HAS_MATPLOTLIB:\n", " plt.rcParams['figure.figsize'] = (10, 6)\n", "\n", "EARTH_RADIUS_KM = 6371.0088\n", "\n", "print('pandas:', pd.__version__)\n", "print('numpy:', np.__version__)\n", "print('matplotlib:', 'ok' if HAS_MATPLOTLIB else 'not installed; plot cells will be skipped')" ] }, { "cell_type": "code", "execution_count": null, "id": "2", "metadata": {}, "outputs": [], "source": [ "def load_env_file(env_path: str = '.env') -> None:\n", " p = Path(env_path)\n", " if not p.exists():\n", " print(f'No {env_path} file found in {Path.cwd()}')\n", " return\n", "\n", " loaded = 0\n", " for raw_line in p.read_text(encoding='utf-8').splitlines():\n", " line = raw_line.strip()\n", " if not line or line.startswith('#') or '=' not in line:\n", " continue\n", " key, value = line.split('=', 1)\n", " key = key.strip()\n", " value = value.strip().strip('\"').strip(\"'\")\n", " if key and key not in os.environ:\n", " os.environ[key] = value\n", " loaded += 1\n", " print(f'Loaded {loaded} env var(s) from {env_path}')\n", "\n", "\n", "def require_env(keys):\n", " missing = [k for k in keys if not os.getenv(k)]\n", " if missing:\n", " raise EnvironmentError(\n", " 'Missing required env vars in notebook kernel: ' + ', '.join(missing) +\n", " '.\\nSet them in this notebook, or add them to a .env file in this folder.'\n", " )\n", "\n", "\n", "load_env_file('.env')\n", "print('PGWEB_HOST:', os.getenv('PGWEB_HOST', ''))\n", "print('PGWEB_PORT:', os.getenv('PGWEB_PORT', ''))\n", "print('PGWEB_USER:', os.getenv('PGWEB_USER', ''))\n", "\n", "# Connection setup: mirrors postgis_table_loader.ipynb and the existing scripts in this repository.\n", "required_keys = ['PGWEB_HOST', 'PGWEB_PORT', 'PGWEB_USER', 'PGWEB_PASSWORD']\n", "require_env(required_keys)\n", "\n", "DB_NAME = os.getenv('PGDATABASE', 'data_centers')\n", "MASTER_TABLE = 'public.master_data_centers'\n", "CLUSTER_OUTPUT_TABLE = 'public.master_data_center_spatial_clusters'\n", "\n", "\n", "def get_conn():\n", " return psycopg2.connect(\n", " host=os.environ['PGWEB_HOST'],\n", " port=os.environ['PGWEB_PORT'],\n", " user=os.environ['PGWEB_USER'],\n", " password=os.environ['PGWEB_PASSWORD'],\n", " dbname=\"data_centers\",\n", " )\n", "\n", "\n", "with get_conn() as conn:\n", " with conn.cursor() as cur:\n", " cur.execute('select current_database(), current_user, version()')\n", " db, usr, ver = cur.fetchone()\n", " print('Connected to DB:', db)\n", " print('As user:', usr)\n", " print('Postgres:', ver.split(',')[0])\n", " cur.execute('create extension if not exists postgis')\n", " print('PostGIS extension is available')\n", " cur.execute('select to_regclass(%s)', (MASTER_TABLE,))\n", " if cur.fetchone()[0] is None:\n", " raise RuntimeError(f'{MASTER_TABLE} does not exist. Run build_master_data_centers.py first.')\n", " cur.execute('select count(*) from public.master_data_centers')\n", " print(f'{MASTER_TABLE} rows:', f'{cur.fetchone()[0]:,}')" ] }, { "cell_type": "markdown", "id": "3", "metadata": {}, "source": [ "## Analysis Parameters\n", "\n", "`EPS_KM` is the neighborhood radius for DBSCAN. `MIN_SAMPLES` is the minimum number of data centers needed in that radius to form a dense cluster. Treat these as research choices: the parameter sweep below helps show how stable the cluster structure is." ] }, { "cell_type": "code", "execution_count": null, "id": "4", "metadata": {}, "outputs": [], "source": [ "EPS_KM = 25\n", "MIN_SAMPLES = 5\n", "PROJECTION_SRID = 5070 # NAD83 / Conus Albers; meters, good for contiguous US clustering.\n", "\n", "# Lower 48-ish bounding box. Set to False if you want Alaska, Hawaii, PR, etc. included.\n", "FILTER_CONTIGUOUS_US = True\n", "\n", "SWEEP_EPS_KM = [2, 5, 10, 15, 25, 50, 100]\n", "SWEEP_MIN_SAMPLES = [3, 5, 10]\n", "\n", "print(f'DBSCAN eps={EPS_KM} km, min_samples={MIN_SAMPLES}')" ] }, { "cell_type": "code", "execution_count": null, "id": "5", "metadata": {}, "outputs": [], "source": [ "def master_points_where(alias: str = '') -> str:\n", " prefix = f'{alias}.' if alias else ''\n", " clauses = [\n", " f'{prefix}longitude is not null',\n", " f'{prefix}latitude is not null',\n", " f'{prefix}geom is not null',\n", " ]\n", " if FILTER_CONTIGUOUS_US:\n", " clauses.extend([\n", " f'{prefix}longitude between -125 and -66',\n", " f'{prefix}latitude between 24 and 50',\n", " ])\n", " return ' and '.join(clauses)\n", "\n", "\n", "sql_query = f\"\"\"\n", "select\n", " master_id,\n", " source,\n", " curated_id,\n", " osm_id,\n", " name,\n", " operator,\n", " city,\n", " state,\n", " country,\n", " power_mw,\n", " area_sqft,\n", " has_bare_metal,\n", " has_iaas,\n", " has_internet_exchange,\n", " has_colocation,\n", " longitude,\n", " latitude,\n", " ST_AsText(geom) as geom_wkt\n", "from public.master_data_centers\n", "where {master_points_where()}\n", "\"\"\"\n", "\n", "with get_conn() as conn:\n", " dc = pd.read_sql_query(sql_query, conn)\n", "\n", "dc['longitude'] = pd.to_numeric(dc['longitude'], errors='coerce')\n", "dc['latitude'] = pd.to_numeric(dc['latitude'], errors='coerce')\n", "dc = dc.dropna(subset=['longitude', 'latitude']).copy()\n", "\n", "dc['state'] = dc['state'].fillna('UNK').replace('', 'UNK')\n", "dc['operator'] = dc['operator'].fillna('Unknown').replace('', 'Unknown')\n", "dc['city'] = dc['city'].fillna('Unknown').replace('', 'Unknown')\n", "\n", "print(f'Loaded {len(dc):,} points for clustering')\n", "display(dc.head())\n", "display(dc['source'].value_counts(dropna=False).rename_axis('source').reset_index(name='rows'))" ] }, { "cell_type": "markdown", "id": "6", "metadata": {}, "source": [ "## Baseline Geography\n", "\n", "Before clustering, inspect where the observations are and how concentrated they already are by state/city/source." ] }, { "cell_type": "code", "execution_count": null, "id": "7", "metadata": {}, "outputs": [], "source": [ "state_counts = (\n", " dc.groupby('state', dropna=False)\n", " .size()\n", " .sort_values(ascending=False)\n", " .rename('data_centers')\n", " .reset_index()\n", ")\n", "display(state_counts.head(25))\n", "\n", "city_counts = (\n", " dc.groupby(['state', 'city'], dropna=False)\n", " .size()\n", " .sort_values(ascending=False)\n", " .rename('data_centers')\n", " .reset_index()\n", ")\n", "display(city_counts.head(25))" ] }, { "cell_type": "code", "execution_count": null, "id": "8", "metadata": {}, "outputs": [], "source": [ "if HAS_MATPLOTLIB:\n", " fig, ax = plt.subplots(figsize=(11, 7))\n", " for source, group in dc.groupby('source'):\n", " ax.scatter(group['longitude'], group['latitude'], s=12, alpha=0.65, label=source)\n", " ax.set_title('Master Data Centers by Source')\n", " ax.set_xlabel('Longitude')\n", " ax.set_ylabel('Latitude')\n", " ax.legend(title='source')\n", " ax.grid(True, linewidth=0.3, alpha=0.4)\n", " plt.show()\n", "else:\n", " print('matplotlib is not installed; skipping static source scatter plot.')" ] }, { "cell_type": "markdown", "id": "9", "metadata": {}, "source": [ "## Nearest-Neighbor Distance Diagnostics\n", "\n", "Nearest-neighbor distances are useful for choosing DBSCAN parameters. If many points have a nearest neighbor inside 5-25 km, those radii will detect metro-scale clusters; larger radii begin to merge nearby metros and corridors. Distances are computed in PostGIS using geography meters." ] }, { "cell_type": "code", "execution_count": null, "id": "10", "metadata": {}, "outputs": [], "source": [ "nearest_neighbor_sql = f\"\"\"\n", "with points as (\n", " select\n", " master_id,\n", " geom,\n", " ST_Transform(geom, %s) as geom_m\n", " from public.master_data_centers\n", " where {master_points_where()}\n", ")\n", "select\n", " p.master_id,\n", " q.master_id as nearest_neighbor_id,\n", " ST_Distance(p.geom::geography, q.geom::geography) / 1000.0 as nearest_neighbor_km\n", "from points p\n", "left join lateral (\n", " select q.master_id, q.geom\n", " from points q\n", " where q.master_id <> p.master_id\n", " order by p.geom_m <-> q.geom_m\n", " limit 1\n", ") q on true\n", "\"\"\"\n", "\n", "with get_conn() as conn:\n", " nn = pd.read_sql_query(nearest_neighbor_sql, conn, params=(PROJECTION_SRID,))\n", "\n", "dc = dc.drop(columns=[c for c in ['nearest_neighbor_id', 'nearest_neighbor_km'] if c in dc.columns])\n", "dc = dc.merge(nn, on='master_id', how='left')\n", "\n", "display(dc['nearest_neighbor_km'].describe(percentiles=[.05, .10, .25, .50, .75, .90, .95, .99]).to_frame())\n", "\n", "thresholds = [1, 2, 5, 10, 25, 50, 100]\n", "nn_thresholds = pd.DataFrame({\n", " 'within_km': thresholds,\n", " 'share_of_points': [(dc['nearest_neighbor_km'] <= km).mean() for km in thresholds],\n", " 'point_count': [(dc['nearest_neighbor_km'] <= km).sum() for km in thresholds],\n", "})\n", "display(nn_thresholds)\n", "\n", "if HAS_MATPLOTLIB:\n", " fig, ax = plt.subplots(figsize=(9, 5))\n", " dc['nearest_neighbor_km'].clip(upper=200).hist(bins=50, ax=ax)\n", " ax.axvline(EPS_KM, color='crimson', linewidth=2, label=f'EPS_KM={EPS_KM}')\n", " ax.set_title('Nearest-Neighbor Distance Distribution (clipped at 200 km)')\n", " ax.set_xlabel('Nearest neighbor distance, km')\n", " ax.set_ylabel('Data centers')\n", " ax.legend()\n", " plt.show()\n", "else:\n", " print('matplotlib is not installed; skipping nearest-neighbor histogram.')" ] }, { "cell_type": "markdown", "id": "11", "metadata": {}, "source": [ "## DBSCAN Clustering\n", "\n", "DBSCAN is a good first-pass method here because it finds irregularly shaped dense groups and labels isolated points as noise. The implementation below uses PostGIS `ST_ClusterDBSCAN` on `ST_Transform(geom, 5070)`, so `eps` is specified in meters after projecting to a continental-US equal-area CRS." ] }, { "cell_type": "code", "execution_count": null, "id": "12", "metadata": {}, "outputs": [], "source": [ "def load_dbscan_labels(eps_km: float, min_samples: int) -> pd.DataFrame:\n", " cluster_sql = f\"\"\"\n", " with points as (\n", " select\n", " master_id,\n", " ST_Transform(geom, %s) as geom_m\n", " from public.master_data_centers\n", " where {master_points_where()}\n", " ), clustered as (\n", " select\n", " master_id,\n", " ST_ClusterDBSCAN(geom_m, %s, %s) over (order by master_id) as raw_cluster_id\n", " from points\n", " )\n", " select\n", " master_id,\n", " coalesce(raw_cluster_id, -1) as cluster_id\n", " from clustered\n", " \"\"\"\n", " with get_conn() as conn:\n", " labels = pd.read_sql_query(\n", " cluster_sql,\n", " conn,\n", " params=(PROJECTION_SRID, eps_km * 1000.0, min_samples),\n", " )\n", " labels['cluster_id'] = labels['cluster_id'].astype(int)\n", " return labels\n", "\n", "\n", "labels = load_dbscan_labels(EPS_KM, MIN_SAMPLES)\n", "dc = dc.drop(columns=[c for c in ['cluster_id', 'is_noise'] if c in dc.columns])\n", "dc = dc.merge(labels, on='master_id', how='left')\n", "dc['cluster_id'] = dc['cluster_id'].fillna(-1).astype(int)\n", "dc['is_noise'] = dc['cluster_id'].eq(-1)\n", "\n", "n_clusters = dc.loc[~dc['is_noise'], 'cluster_id'].nunique()\n", "n_noise = int(dc['is_noise'].sum())\n", "print(f'Clusters found: {n_clusters:,}')\n", "print(f'Noise / isolated points: {n_noise:,} ({n_noise / len(dc):.1%})')\n", "display(dc['cluster_id'].value_counts().head(20).rename_axis('cluster_id').reset_index(name='points'))" ] }, { "cell_type": "code", "execution_count": null, "id": "13", "metadata": {}, "outputs": [], "source": [ "def haversine_km(lat1, lon1, lat2, lon2):\n", " lat1 = np.radians(lat1)\n", " lon1 = np.radians(lon1)\n", " lat2 = np.radians(lat2)\n", " lon2 = np.radians(lon2)\n", " dlat = lat2 - lat1\n", " dlon = lon2 - lon1\n", " a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2\n", " return 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(a))\n", "\n", "\n", "def mode_list(values, n=5):\n", " s = pd.Series(values).dropna().astype(str)\n", " s = s[s.ne('')]\n", " if s.empty:\n", " return ''\n", " return '; '.join(f'{k} ({v})' for k, v in s.value_counts().head(n).items())\n", "\n", "\n", "cluster_summary_columns = [\n", " 'cluster_id', 'point_count', 'centroid_latitude', 'centroid_longitude',\n", " 'radius_km_p50', 'radius_km_p90', 'radius_km_max', 'states', 'cities',\n", " 'sources', 'operators', 'total_power_mw_known', 'known_power_rows',\n", " 'total_area_sqft_known', 'known_area_rows',\n", "]\n", "\n", "cluster_rows = []\n", "for cid, group in dc.loc[~dc['is_noise']].groupby('cluster_id'):\n", " centroid_lat = group['latitude'].mean()\n", " centroid_lon = group['longitude'].mean()\n", " distances = haversine_km(group['latitude'], group['longitude'], centroid_lat, centroid_lon)\n", " cluster_rows.append({\n", " 'cluster_id': int(cid),\n", " 'point_count': len(group),\n", " 'centroid_latitude': centroid_lat,\n", " 'centroid_longitude': centroid_lon,\n", " 'radius_km_p50': float(np.percentile(distances, 50)),\n", " 'radius_km_p90': float(np.percentile(distances, 90)),\n", " 'radius_km_max': float(np.max(distances)),\n", " 'states': mode_list(group['state'], n=8),\n", " 'cities': mode_list(group['city'], n=8),\n", " 'sources': mode_list(group['source'], n=4),\n", " 'operators': mode_list(group['operator'], n=8),\n", " 'total_power_mw_known': group['power_mw'].dropna().sum(),\n", " 'known_power_rows': group['power_mw'].notna().sum(),\n", " 'total_area_sqft_known': group['area_sqft'].dropna().sum(),\n", " 'known_area_rows': group['area_sqft'].notna().sum(),\n", " })\n", "\n", "if cluster_rows:\n", " clusters = pd.DataFrame(cluster_rows).sort_values(['point_count', 'radius_km_p90'], ascending=[False, True])\n", "else:\n", " clusters = pd.DataFrame(columns=cluster_summary_columns)\n", "\n", "display(clusters.head(30))" ] }, { "cell_type": "code", "execution_count": null, "id": "14", "metadata": {}, "outputs": [], "source": [ "if HAS_MATPLOTLIB:\n", " fig, ax = plt.subplots(figsize=(12, 8))\n", " noise = dc['is_noise']\n", " ax.scatter(dc.loc[noise, 'longitude'], dc.loc[noise, 'latitude'], s=8, c='lightgray', alpha=0.55, label='noise')\n", "\n", " clustered = dc.loc[~noise].copy()\n", " scatter = ax.scatter(\n", " clustered['longitude'],\n", " clustered['latitude'],\n", " c=clustered['cluster_id'],\n", " cmap='tab20',\n", " s=18,\n", " alpha=0.85,\n", " label='clustered',\n", " )\n", " ax.set_title(f'DBSCAN Data-Center Clusters (eps={EPS_KM} km, min_samples={MIN_SAMPLES})')\n", " ax.set_xlabel('Longitude')\n", " ax.set_ylabel('Latitude')\n", " ax.grid(True, linewidth=0.3, alpha=0.4)\n", " plt.colorbar(scatter, ax=ax, label='cluster_id')\n", " plt.show()\n", "else:\n", " print('matplotlib is not installed; skipping static cluster scatter plot.')" ] }, { "cell_type": "markdown", "id": "15", "metadata": {}, "source": [ "## Parameter Sensitivity\n", "\n", "The sweep below asks whether the cluster story depends heavily on a single `eps` or `min_samples` choice." ] }, { "cell_type": "code", "execution_count": null, "id": "16", "metadata": {}, "outputs": [], "source": [ "def dbscan_summary(eps_km: float, min_samples: int) -> dict:\n", " summary_sql = f\"\"\"\n", " with points as (\n", " select master_id, ST_Transform(geom, %s) as geom_m\n", " from public.master_data_centers\n", " where {master_points_where()}\n", " ), labels as (\n", " select ST_ClusterDBSCAN(geom_m, %s, %s) over (order by master_id) as cid\n", " from points\n", " ), cluster_sizes as (\n", " select cid, count(*)::integer as cluster_size\n", " from labels\n", " where cid is not null\n", " group by cid\n", " )\n", " select\n", " (select count(*)::integer from labels) as points,\n", " (select count(distinct cid)::integer from labels where cid is not null) as clusters,\n", " (select count(*)::integer from labels where cid is not null) as clustered_points,\n", " (select count(*)::integer from labels where cid is null) as noise_points,\n", " coalesce((select max(cluster_size)::integer from cluster_sizes), 0) as largest_cluster,\n", " coalesce((select percentile_cont(0.5) within group (order by cluster_size) from cluster_sizes), 0) as median_cluster_size\n", " \"\"\"\n", " with get_conn() as conn:\n", " row = pd.read_sql_query(\n", " summary_sql,\n", " conn,\n", " params=(PROJECTION_SRID, eps_km * 1000.0, min_samples),\n", " ).iloc[0].to_dict()\n", " row['eps_km'] = eps_km\n", " row['min_samples'] = min_samples\n", " row['clustered_share'] = row['clustered_points'] / row['points'] if row['points'] else 0\n", " return row\n", "\n", "\n", "sweep_rows = []\n", "for eps in SWEEP_EPS_KM:\n", " for min_samples in SWEEP_MIN_SAMPLES:\n", " sweep_rows.append(dbscan_summary(eps, min_samples))\n", "\n", "sweep = pd.DataFrame(sweep_rows)\n", "display(sweep)\n", "\n", "if HAS_MATPLOTLIB:\n", " fig, axes = plt.subplots(1, 2, figsize=(13, 5), sharex=True)\n", " for min_samples, group in sweep.groupby('min_samples'):\n", " axes[0].plot(group['eps_km'], group['clusters'], marker='o', label=f'min={min_samples}')\n", " axes[1].plot(group['eps_km'], group['clustered_share'], marker='o', label=f'min={min_samples}')\n", " axes[0].set_title('Cluster Count by eps')\n", " axes[0].set_ylabel('clusters')\n", " axes[1].set_title('Share of Points Clustered by eps')\n", " axes[1].set_ylabel('clustered share')\n", " for ax in axes:\n", " ax.set_xlabel('eps, km')\n", " ax.grid(True, linewidth=0.3, alpha=0.4)\n", " ax.legend()\n", " plt.show()\n", "else:\n", " print('matplotlib is not installed; skipping parameter-sweep line charts.')" ] }, { "cell_type": "markdown", "id": "17", "metadata": {}, "source": [ "## Cluster Profiles\n", "\n", "Use these tables to interpret clusters as local agglomerations, metro regions, corridors, or source-specific artifacts." ] }, { "cell_type": "code", "execution_count": null, "id": "18", "metadata": {}, "outputs": [], "source": [ "if clusters.empty:\n", " cluster_source = pd.DataFrame(columns=['cluster_id', 'point_count'])\n", "else:\n", " cluster_source = (\n", " dc.loc[~dc['is_noise']]\n", " .pivot_table(index='cluster_id', columns='source', values='master_id', aggfunc='count', fill_value=0)\n", " .reset_index()\n", " )\n", " cluster_source = clusters[['cluster_id', 'point_count']].merge(cluster_source, on='cluster_id', how='left')\n", "display(cluster_source.head(30))\n", "\n", "cluster_state = (\n", " dc.loc[~dc['is_noise']]\n", " .groupby(['cluster_id', 'state'])\n", " .size()\n", " .rename('points')\n", " .reset_index()\n", " .sort_values(['cluster_id', 'points'], ascending=[True, False])\n", ")\n", "display(cluster_state.head(80))" ] }, { "cell_type": "code", "execution_count": null, "id": "19", "metadata": {}, "outputs": [], "source": [ "# Inspect the largest cluster in detail. Change CLUSTER_TO_INSPECT as needed.\n", "CLUSTER_TO_INSPECT = int(clusters.iloc[0]['cluster_id']) if len(clusters) else None\n", "\n", "if CLUSTER_TO_INSPECT is not None:\n", " cols = [\n", " 'master_id', 'name', 'operator', 'city', 'state', 'source',\n", " 'longitude', 'latitude', 'power_mw', 'area_sqft', 'nearest_neighbor_km'\n", " ]\n", " detail = (\n", " dc.loc[dc['cluster_id'].eq(CLUSTER_TO_INSPECT), cols]\n", " .sort_values(['state', 'city', 'operator', 'name'], na_position='last')\n", " )\n", " print(f'Cluster {CLUSTER_TO_INSPECT}: {len(detail):,} points')\n", " display(detail)\n", "else:\n", " print('No clusters found with current parameters.')" ] }, { "cell_type": "markdown", "id": "20", "metadata": {}, "source": [ "## Statistical Analysis\n", "\n", "This section treats the DBSCAN result as an analytical object: how concentrated are data centers across clusters, how different are clustered points from isolated points, and are some sources/states disproportionately represented in dense clusters?" ] }, { "cell_type": "code", "execution_count": null, "id": "21", "metadata": {}, "outputs": [], "source": [ "# Helper functions for concentration metrics and lightweight statistical tests.\n", "def gini(values):\n", " x = pd.Series(values).dropna().astype(float)\n", " x = x[x >= 0].sort_values().to_numpy()\n", " if len(x) == 0 or x.sum() == 0:\n", " return np.nan\n", " n = len(x)\n", " index = np.arange(1, n + 1)\n", " return float((2 * np.sum(index * x)) / (n * np.sum(x)) - (n + 1) / n)\n", "\n", "\n", "def hhi_from_counts(values):\n", " x = pd.Series(values).dropna().astype(float)\n", " total = x.sum()\n", " if total == 0:\n", " return np.nan\n", " shares = x / total\n", " return float(np.sum(shares ** 2))\n", "\n", "\n", "def normalized_entropy_from_counts(values):\n", " x = pd.Series(values).dropna().astype(float)\n", " x = x[x > 0]\n", " if len(x) <= 1:\n", " return 0.0 if len(x) == 1 else np.nan\n", " shares = x / x.sum()\n", " return float(-(shares * np.log(shares)).sum() / np.log(len(shares)))\n", "\n", "\n", "def numeric_summary_by_group(frame, group_col, value_cols):\n", " rows = []\n", " for col in value_cols:\n", " for group, vals in frame.groupby(group_col)[col]:\n", " vals = pd.to_numeric(vals, errors='coerce').dropna()\n", " rows.append({\n", " 'variable': col,\n", " group_col: group,\n", " 'n': int(vals.size),\n", " 'mean': float(vals.mean()) if vals.size else np.nan,\n", " 'median': float(vals.median()) if vals.size else np.nan,\n", " 'p25': float(vals.quantile(0.25)) if vals.size else np.nan,\n", " 'p75': float(vals.quantile(0.75)) if vals.size else np.nan,\n", " 'p90': float(vals.quantile(0.90)) if vals.size else np.nan,\n", " })\n", " return pd.DataFrame(rows)\n", "\n", "\n", "def permutation_mean_diff(frame, value_col, group_col='is_clustered', reps=5000, seed=42):\n", " work = frame[[value_col, group_col]].copy()\n", " work[value_col] = pd.to_numeric(work[value_col], errors='coerce')\n", " work = work.dropna()\n", " if work[group_col].nunique() != 2:\n", " return {'variable': value_col, 'observed_mean_diff': np.nan, 'p_two_sided': np.nan, 'reps': reps, 'n': len(work)}\n", "\n", " y = work[value_col].to_numpy(dtype=float)\n", " g = work[group_col].to_numpy(dtype=bool)\n", " observed = y[g].mean() - y[~g].mean()\n", " rng = np.random.default_rng(seed)\n", " more_extreme = 0\n", " for _ in range(reps):\n", " gp = rng.permutation(g)\n", " diff = y[gp].mean() - y[~gp].mean()\n", " if abs(diff) >= abs(observed):\n", " more_extreme += 1\n", " p = (more_extreme + 1) / (reps + 1)\n", " return {'variable': value_col, 'observed_mean_diff': float(observed), 'p_two_sided': float(p), 'reps': reps, 'n': len(work)}\n", "\n", "\n", "def chi_square_independence(table):\n", " observed = table.to_numpy(dtype=float)\n", " total = observed.sum()\n", " if observed.size == 0 or total == 0:\n", " return {'chi2': np.nan, 'dof': np.nan, 'p_value': np.nan, 'cramers_v': np.nan}\n", " row_totals = observed.sum(axis=1, keepdims=True)\n", " col_totals = observed.sum(axis=0, keepdims=True)\n", " expected = row_totals @ col_totals / total\n", " mask = expected > 0\n", " chi2 = float(((observed - expected) ** 2 / np.where(mask, expected, 1)).sum())\n", " dof = int((observed.shape[0] - 1) * (observed.shape[1] - 1))\n", " denom = total * max(1, min(observed.shape[0] - 1, observed.shape[1] - 1))\n", " cramers_v = float(np.sqrt(chi2 / denom)) if denom else np.nan\n", "\n", " p_value = np.nan\n", " try:\n", " from scipy.stats import chi2 as scipy_chi2\n", " p_value = float(scipy_chi2.sf(chi2, dof))\n", " except ImportError:\n", " pass\n", " return {'chi2': chi2, 'dof': dof, 'p_value': p_value, 'cramers_v': cramers_v}\n", "\n", "\n", "def categorical_permutation_chi_square(frame, category_col, group_col='is_clustered', reps=5000, seed=42):\n", " work = frame[[category_col, group_col]].dropna().copy()\n", " if work.empty or work[category_col].nunique() < 2 or work[group_col].nunique() < 2:\n", " return {'permutation_p_value': np.nan, 'permutation_reps': reps}\n", "\n", " categories = pd.Index(sorted(work[category_col].astype(str).unique()))\n", " groups = pd.Index(sorted(work[group_col].unique()))\n", " observed_table = pd.crosstab(work[category_col].astype(str), work[group_col]).reindex(\n", " index=categories, columns=groups, fill_value=0\n", " )\n", " observed_chi2 = chi_square_independence(observed_table)['chi2']\n", "\n", " rng = np.random.default_rng(seed)\n", " labels = work[group_col].to_numpy()\n", " cats = work[category_col].astype(str).to_numpy()\n", " more_extreme = 0\n", " for _ in range(reps):\n", " permuted = rng.permutation(labels)\n", " table = pd.crosstab(cats, permuted).reindex(index=categories, columns=groups, fill_value=0)\n", " stat = chi_square_independence(table)['chi2']\n", " if stat >= observed_chi2:\n", " more_extreme += 1\n", " return {'permutation_p_value': float((more_extreme + 1) / (reps + 1)), 'permutation_reps': reps}\n", "\n", "\n", "stats_dir = Path('output')\n", "stats_dir.mkdir(exist_ok=True)\n", "dc['is_clustered'] = ~dc['is_noise']\n", "print(f'Clustered points: {dc[\"is_clustered\"].sum():,} / {len(dc):,} ({dc[\"is_clustered\"].mean():.1%})')" ] }, { "cell_type": "code", "execution_count": null, "id": "22", "metadata": {}, "outputs": [], "source": [ "# Overall concentration of data centers across DBSCAN clusters.\n", "cluster_sizes = dc.loc[dc['is_clustered'], 'cluster_id'].value_counts().sort_values(ascending=False)\n", "total_points = len(dc)\n", "total_clustered = int(cluster_sizes.sum())\n", "\n", "stat_rows = [\n", " {'metric': 'total_points', 'value': total_points},\n", " {'metric': 'clustered_points', 'value': total_clustered},\n", " {'metric': 'noise_points', 'value': int((~dc['is_clustered']).sum())},\n", " {'metric': 'clustered_share', 'value': dc['is_clustered'].mean()},\n", " {'metric': 'cluster_count', 'value': int(cluster_sizes.size)},\n", " {'metric': 'largest_cluster_points', 'value': int(cluster_sizes.iloc[0]) if len(cluster_sizes) else 0},\n", " {'metric': 'largest_cluster_share_of_all_points', 'value': float(cluster_sizes.iloc[0] / total_points) if len(cluster_sizes) else 0},\n", " {'metric': 'largest_cluster_share_of_clustered_points', 'value': float(cluster_sizes.iloc[0] / total_clustered) if total_clustered else 0},\n", " {'metric': 'top_5_clusters_share_of_clustered_points', 'value': float(cluster_sizes.head(5).sum() / total_clustered) if total_clustered else 0},\n", " {'metric': 'cluster_size_gini', 'value': gini(cluster_sizes)},\n", " {'metric': 'cluster_size_hhi', 'value': hhi_from_counts(cluster_sizes)},\n", " {'metric': 'cluster_size_normalized_entropy', 'value': normalized_entropy_from_counts(cluster_sizes)},\n", "]\n", "cluster_concentration_stats = pd.DataFrame(stat_rows)\n", "display(cluster_concentration_stats)\n", "\n", "cluster_concentration_stats.to_csv(stats_dir / 'master_data_center_cluster_concentration_stats.csv', index=False)\n", "print('Wrote:', stats_dir / 'master_data_center_cluster_concentration_stats.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "23", "metadata": {}, "outputs": [], "source": [ "# Compare clustered vs. isolated/noise points.\n", "numeric_cols = ['nearest_neighbor_km', 'area_sqft', 'power_mw']\n", "clustered_vs_noise_numeric = numeric_summary_by_group(dc, 'is_clustered', numeric_cols)\n", "display(clustered_vs_noise_numeric)\n", "\n", "permutation_tests = pd.DataFrame([\n", " permutation_mean_diff(dc, 'nearest_neighbor_km'),\n", " permutation_mean_diff(dc, 'area_sqft'),\n", " permutation_mean_diff(dc, 'power_mw'),\n", "])\n", "display(permutation_tests)\n", "\n", "clustered_vs_noise_numeric.to_csv(stats_dir / 'master_data_center_clustered_vs_noise_numeric_summary.csv', index=False)\n", "permutation_tests.to_csv(stats_dir / 'master_data_center_clustered_vs_noise_permutation_tests.csv', index=False)\n", "print('Wrote:', stats_dir / 'master_data_center_clustered_vs_noise_numeric_summary.csv')\n", "print('Wrote:', stats_dir / 'master_data_center_clustered_vs_noise_permutation_tests.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "24", "metadata": {}, "outputs": [], "source": [ "# Source enrichment: are curated/merged/OSM rows clustered at different rates?\n", "source_table = pd.crosstab(dc['source'], dc['is_clustered'])\n", "source_table.columns = ['noise_or_isolated', 'clustered'] if source_table.shape[1] == 2 else source_table.columns\n", "source_rates = source_table.copy()\n", "source_rates['total'] = source_rates.sum(axis=1)\n", "source_rates['clustered_share'] = source_rates.get('clustered', 0) / source_rates['total']\n", "source_rates = source_rates.sort_values('clustered_share', ascending=False)\n", "\n", "test = chi_square_independence(source_table)\n", "perm_test = categorical_permutation_chi_square(dc, 'source', 'is_clustered')\n", "source_test = pd.DataFrame([{'test': 'source_by_clustered', **test, **perm_test}])\n", "\n", "display(source_rates)\n", "display(source_test)\n", "\n", "source_rates.reset_index().to_csv(stats_dir / 'master_data_center_source_cluster_enrichment.csv', index=False)\n", "source_test.to_csv(stats_dir / 'master_data_center_source_cluster_chi_square.csv', index=False)\n", "print('Wrote:', stats_dir / 'master_data_center_source_cluster_enrichment.csv')\n", "print('Wrote:', stats_dir / 'master_data_center_source_cluster_chi_square.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "25", "metadata": {}, "outputs": [], "source": [ "# State-level clustering profile.\n", "state_cluster_stats = (\n", " dc.groupby('state')\n", " .agg(\n", " points=('master_id', 'count'),\n", " clustered_points=('is_clustered', 'sum'),\n", " clusters=('cluster_id', lambda s: s[s.ne(-1)].nunique()),\n", " median_nearest_neighbor_km=('nearest_neighbor_km', 'median'),\n", " mean_nearest_neighbor_km=('nearest_neighbor_km', 'mean'),\n", " known_power_rows=('power_mw', lambda s: s.notna().sum()),\n", " total_power_mw_known=('power_mw', 'sum'),\n", " known_area_rows=('area_sqft', lambda s: s.notna().sum()),\n", " total_area_sqft_known=('area_sqft', 'sum'),\n", " )\n", " .reset_index()\n", ")\n", "state_cluster_stats['clustered_share'] = state_cluster_stats['clustered_points'] / state_cluster_stats['points']\n", "state_cluster_stats = state_cluster_stats.sort_values(['points', 'clustered_share'], ascending=[False, False])\n", "display(state_cluster_stats.head(30))\n", "\n", "state_cluster_stats.to_csv(stats_dir / 'master_data_center_state_cluster_statistics.csv', index=False)\n", "print('Wrote:', stats_dir / 'master_data_center_state_cluster_statistics.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "26", "metadata": {}, "outputs": [], "source": [ "# Optional plots for the statistical summaries.\n", "if HAS_MATPLOTLIB:\n", " fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n", " cluster_sizes.head(25).sort_values().plot(kind='barh', ax=axes[0])\n", " axes[0].set_title('Largest DBSCAN Clusters')\n", " axes[0].set_xlabel('data centers')\n", " axes[0].set_ylabel('cluster_id')\n", "\n", " dc.boxplot(column='nearest_neighbor_km', by='is_clustered', ax=axes[1], showfliers=False)\n", " axes[1].set_title('Nearest-Neighbor Distance by Cluster Status')\n", " axes[1].set_xlabel('is_clustered')\n", " axes[1].set_ylabel('km')\n", " fig.suptitle('')\n", " plt.tight_layout()\n", " plt.show()\n", "\n", " top_states = state_cluster_stats.head(20).sort_values('points')\n", " fig, ax = plt.subplots(figsize=(9, 7))\n", " ax.barh(top_states['state'], top_states['points'], label='all points', color='#9ca3af')\n", " ax.barh(top_states['state'], top_states['clustered_points'], label='clustered points', color='#2563eb')\n", " ax.set_title('Clustered Data Centers by State')\n", " ax.set_xlabel('data centers')\n", " ax.legend()\n", " plt.show()\n", "else:\n", " print('matplotlib is not installed; skipping statistical plots.')" ] }, { "cell_type": "markdown", "id": "27", "metadata": {}, "source": [ "## Optional Interactive Map\n", "\n", "If `folium` is installed, this cell renders an interactive map. Large datasets can be slow, so the popup text is intentionally compact." ] }, { "cell_type": "code", "execution_count": null, "id": "28", "metadata": {}, "outputs": [], "source": [ "try:\n", " import folium\n", " from html import escape\n", "\n", " if 'dc' not in globals():\n", " dc = pd.read_csv('output/master_data_center_spatial_cluster_points.csv')\n", " if 'clusters' not in globals():\n", " clusters = pd.read_csv('output/master_data_center_spatial_cluster_summary.csv')\n", " if 'n_clusters' not in globals():\n", " n_clusters = dc.loc[dc['cluster_id'].ne(-1), 'cluster_id'].nunique()\n", " if 'is_noise' not in dc.columns:\n", " dc['is_noise'] = dc['cluster_id'].eq(-1)\n", "\n", " m = folium.Map(location=[39, -98], zoom_start=4, tiles='CartoDB positron')\n", "\n", " cluster_info = {}\n", " if not clusters.empty:\n", " ranked_clusters = clusters.reset_index(drop=True).copy()\n", " ranked_clusters['cluster_rank'] = ranked_clusters.index + 1\n", " cluster_info = ranked_clusters.set_index('cluster_id').to_dict('index')\n", "\n", " cluster_colors = [\n", " '#2563eb', '#dc2626', '#16a34a', '#9333ea', '#ea580c', '#0891b2',\n", " '#be123c', '#4f46e5', '#65a30d', '#c026d3', '#0f766e', '#b45309',\n", " ]\n", "\n", " def clean_popup_value(value):\n", " if pd.isna(value):\n", " return ''\n", " return escape(str(value))\n", "\n", " for row in dc.itertuples(index=False):\n", " info = cluster_info.get(row.cluster_id, {})\n", " if row.cluster_id == -1:\n", " color = '#9ca3af'\n", " radius = 3\n", " cluster_label = 'Noise / isolated'\n", " cluster_size = '1'\n", " cluster_rank = ''\n", " else:\n", " rank = int(info.get('cluster_rank', row.cluster_id + 1))\n", " color = cluster_colors[(rank - 1) % len(cluster_colors)]\n", " radius = 5\n", " cluster_label = f'Cluster ID {row.cluster_id}'\n", " cluster_size = f\"{int(info.get('point_count', 0)):,}\"\n", " cluster_rank = f\"Rank {rank} of {n_clusters} by size\"\n", "\n", " popup_html = f'''\n", "
\n", " {clean_popup_value(row.name) or clean_popup_value(row.master_id)}
\n", " {clean_popup_value(row.city)}, {clean_popup_value(row.state)}
\n", "
\n", " {cluster_label}
\n", " {cluster_rank}
\n", " Cluster size: {cluster_size} data center(s)
\n", " Source: {clean_popup_value(row.source)}
\n", " Operator: {clean_popup_value(row.operator)}
\n", " Nearest neighbor: {row.nearest_neighbor_km:.2f} km\n", "
\n", " '''\n", " popup = folium.Popup(popup_html, max_width=360)\n", "\n", " folium.CircleMarker(\n", " location=[row.latitude, row.longitude],\n", " radius=radius,\n", " color=color,\n", " fill=True,\n", " fill_opacity=0.75,\n", " weight=1,\n", " popup=popup,\n", " tooltip=f'{cluster_label}; size={cluster_size}',\n", " ).add_to(m)\n", " map_out = Path('output/master_data_center_spatial_clusters_map.html')\n", " map_out.parent.mkdir(exist_ok=True)\n", " m.save(map_out)\n", " print('Wrote:', map_out)\n", " display(m)\n", "except ImportError:\n", " print('folium is not installed; skipping interactive map.')\n" ] }, { "cell_type": "markdown", "id": "29", "metadata": {}, "source": [ "## Export Results\n", "\n", "The first cell writes local CSVs. The second cell can write a compact cluster-label table back to PostGIS. Keep `WRITE_BACK_TO_DB = False` until you are happy with the selected parameters." ] }, { "cell_type": "code", "execution_count": null, "id": "30", "metadata": {}, "outputs": [], "source": [ "points_out = Path('output/master_data_center_spatial_cluster_points.csv')\n", "clusters_out = Path('output/master_data_center_spatial_cluster_summary.csv')\n", "\n", "dc.to_csv(points_out, index=False)\n", "clusters.to_csv(clusters_out, index=False)\n", "\n", "print('Wrote:', points_out)\n", "print('Wrote:', clusters_out)" ] }, { "cell_type": "code", "execution_count": null, "id": "31", "metadata": {}, "outputs": [], "source": [ "WRITE_BACK_TO_DB = False\n", "\n", "if WRITE_BACK_TO_DB:\n", " rows = [\n", " (\n", " r.master_id,\n", " int(r.cluster_id),\n", " bool(r.is_noise),\n", " float(EPS_KM),\n", " int(MIN_SAMPLES),\n", " float(r.nearest_neighbor_km) if pd.notna(r.nearest_neighbor_km) else None,\n", " )\n", " for r in dc.itertuples(index=False)\n", " ]\n", "\n", " with get_conn() as conn:\n", " with conn.cursor() as cur:\n", " cur.execute(sql.SQL('drop table if exists {}').format(sql.SQL(CLUSTER_OUTPUT_TABLE)))\n", " cur.execute(sql.SQL('''\n", " create table {} (\n", " master_id text primary key references public.master_data_centers(master_id),\n", " cluster_id integer not null,\n", " is_noise boolean not null,\n", " eps_km double precision not null,\n", " min_samples integer not null,\n", " nearest_neighbor_km double precision,\n", " created_at timestamptz not null default now()\n", " )\n", " ''').format(sql.SQL(CLUSTER_OUTPUT_TABLE)))\n", " execute_values(\n", " cur,\n", " sql.SQL('''\n", " insert into {} (\n", " master_id, cluster_id, is_noise, eps_km, min_samples, nearest_neighbor_km\n", " ) values %s\n", " ''').format(sql.SQL(CLUSTER_OUTPUT_TABLE)).as_string(conn),\n", " rows,\n", " page_size=5000,\n", " )\n", " cur.execute(sql.SQL('create index on {} (cluster_id)').format(sql.SQL(CLUSTER_OUTPUT_TABLE)))\n", " cur.execute(sql.SQL('analyze {}').format(sql.SQL(CLUSTER_OUTPUT_TABLE)))\n", " print(f'Wrote {len(rows):,} rows to {CLUSTER_OUTPUT_TABLE}')\n", "else:\n", " print('WRITE_BACK_TO_DB is False; no database table was modified.')" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.5" } }, "nbformat": 4, "nbformat_minor": 5 }