{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {},
   "source": [
    "# Enhanced Data Center Cluster Map\n",
    "\n",
    "This notebook starts from the spatial clustering outputs created by `spatial_clustering_master_data_centers.ipynb` and adds contextual layers from the demographic/RUCA/energy analysis.\n",
    "\n",
    "Current features:\n",
    "- Loads point and cluster summary CSVs from `output/`.\n",
    "- Recreates the cluster-colored Folium map.\n",
    "- Enriches point popups with HUC8 watershed, RUCA, tract demographics, and state energy context where available.\n",
    "- Adds separate layers for clustered points, isolated/noise points, cluster centroids, HUC8 watersheds, and state IM3 projected demand.\n",
    "- Saves a standalone HTML map to `output/enhanced_master_data_center_spatial_clusters_map.html`.\n",
    "\n",
    "Notes from `output/data_center_demographic_ruca_energy_summary.md`:\n",
    "- HUC8 watershed join is a recommended next step for water-context analysis.\n",
    "- `im3_state_projected_moderate_50` is populated and used for state projected demand context.\n",
    "- `seds_state_msn_year` is checked through the state context export, but it currently has no rows, so SEDS fields are blank until that table is populated.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "from html import escape\n",
    "from pathlib import Path\n",
    "\n",
    "# Set before the third-party imports below; presumably so a transitive\n",
    "# matplotlib import finds a writable config dir - TODO confirm.\n",
    "os.environ.setdefault('MPLCONFIGDIR', '/tmp/matplotlib')\n",
    "Path(os.environ['MPLCONFIGDIR']).mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "import pandas as pd\n",
    "import folium\n",
    "from folium import plugins\n",
    "\n",
    "print('pandas:', pd.__version__)\n",
    "print('folium:', folium.__version__)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2",
   "metadata": {},
   "source": [
    "## Paths And Display Settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inputs and outputs all live under OUTPUT_DIR.\n",
    "OUTPUT_DIR = Path('output')\n",
    "POINTS_CSV = OUTPUT_DIR / 'master_data_center_spatial_cluster_points.csv'\n",
    "CLUSTERS_CSV = OUTPUT_DIR / 'master_data_center_spatial_cluster_summary.csv'\n",
    "POINT_CONTEXT_CSV = OUTPUT_DIR / 'master_data_center_map_context.csv'\n",
    "HUC8_GEOJSON = OUTPUT_DIR / 'master_data_center_huc8_watersheds.geojson'\n",
    "STATE_ENERGY_CSV = OUTPUT_DIR / 'master_data_center_state_energy_context.csv'\n",
    "MAP_HTML = OUTPUT_DIR / 'enhanced_master_data_center_spatial_clusters_map.html'\n",
    "\n",
    "MAP_CENTER = [39, -98]\n",
    "MAP_ZOOM = 4\n",
    "BASE_TILES = 'CartoDB positron'\n",
    "\n",
    "# Set to an int to keep only the first N point rows; None keeps every row.\n",
    "MAX_POINTS = None\n",
    "\n",
    "CLUSTERED_RADIUS = 5\n",
    "NOISE_RADIUS = 3\n",
    "CENTROID_RADIUS = 7\n",
    "SHOW_CENTROID_P90_CIRCLES = True\n",
    "SHOW_HUC8_LAYER = True\n",
    "SHOW_STATE_ENERGY_LAYER = True\n",
    "\n",
    "OUTPUT_DIR.mkdir(exist_ok=True)\n",
    "print('points:', POINTS_CSV)\n",
    "print('clusters:', CLUSTERS_CSV)\n",
    "print('point context:', POINT_CONTEXT_CSV)\n",
    "print('HUC8 GeoJSON:', HUC8_GEOJSON)\n",
    "print('state energy context:', STATE_ENERGY_CSV)\n",
    "print('html output:', MAP_HTML)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4",
   "metadata": {},
   "source": [
    "## Load Cluster Outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The point and cluster CSVs are mandatory; fail early with a clear message.\n",
    "required_files = [POINTS_CSV, CLUSTERS_CSV]\n",
    "missing = [str(p) for p in required_files if not p.exists()]\n",
    "if missing:\n",
    "    raise FileNotFoundError('Missing required cluster output CSV(s): ' + ', '.join(missing))\n",
    "\n",
    "points = pd.read_csv(POINTS_CSV)\n",
    "clusters = pd.read_csv(CLUSTERS_CSV)\n",
    "# Context tables are optional: a missing file just disables that enrichment.\n",
    "point_context = pd.read_csv(POINT_CONTEXT_CSV) if POINT_CONTEXT_CSV.exists() else pd.DataFrame()\n",
    "state_energy = pd.read_csv(STATE_ENERGY_CSV) if STATE_ENERGY_CSV.exists() else pd.DataFrame()\n",
    "\n",
    "if MAX_POINTS is not None:\n",
    "    points = points.head(MAX_POINTS).copy()\n",
    "\n",
    "# Missing or non-numeric cluster ids are mapped to -1, the noise label.\n",
    "points['cluster_id'] = pd.to_numeric(points['cluster_id'], errors='coerce').fillna(-1).astype(int)\n",
    "points['is_noise'] = points['cluster_id'].eq(-1)\n",
    "points['is_clustered'] = ~points['is_noise']\n",
    "points['name'] = points['name'].fillna('')\n",
    "points['operator'] = points['operator'].fillna('Unknown').replace('', 'Unknown')\n",
    "points['city'] = points['city'].fillna('Unknown').replace('', 'Unknown')\n",
    "points['state'] = points['state'].fillna('UNK').replace('', 'UNK')\n",
    "points['source'] = points['source'].fillna('unknown').replace('', 'unknown')\n",
    "\n",
    "if not point_context.empty:\n",
    "    if 'master_id' not in point_context.columns:\n",
    "        print('point context skipped: no master_id column')\n",
    "    else:\n",
    "        # Keep one context row per point and only columns `points` lacks, so the\n",
    "        # join can neither duplicate markers nor add _x/_y suffixed columns.\n",
    "        context_lookup = point_context.drop_duplicates(subset='master_id')\n",
    "        if len(context_lookup) < len(point_context):\n",
    "            print(f'point context: ignored {len(point_context) - len(context_lookup):,} duplicate master_id row(s)')\n",
    "        context_cols = [c for c in context_lookup.columns if c != 'master_id' and c not in points.columns]\n",
    "        points = points.merge(\n",
    "            context_lookup[['master_id'] + context_cols],\n",
    "            on='master_id',\n",
    "            how='left',\n",
    "            validate='many_to_one',\n",
    "        )\n",
    "\n",
    "if not state_energy.empty:\n",
    "    if 'state_code' not in state_energy.columns:\n",
    "        print('state energy context skipped: no state_code column')\n",
    "    else:\n",
    "        # Same guarantees as above: one row per state, no colliding columns.\n",
    "        state_lookup = state_energy.drop_duplicates(subset='state_code')\n",
    "        if len(state_lookup) < len(state_energy):\n",
    "            print(f'state energy context: ignored {len(state_energy) - len(state_lookup):,} duplicate state_code row(s)')\n",
    "        state_cols = [c for c in state_lookup.columns if c != 'state_code' and c not in points.columns]\n",
    "        state_lookup = state_lookup[['state_code'] + state_cols]\n",
    "        if 'state_code' in points.columns:\n",
    "            # The key name is already taken on `points`; join on `state`\n",
    "            # directly instead of adding a second, suffixed copy.\n",
    "            points = points.merge(\n",
    "                state_lookup.rename(columns={'state_code': 'state'}),\n",
    "                on='state',\n",
    "                how='left',\n",
    "                validate='many_to_one',\n",
    "            )\n",
    "        else:\n",
    "            points = points.merge(\n",
    "                state_lookup,\n",
    "                left_on='state',\n",
    "                right_on='state_code',\n",
    "                how='left',\n",
    "                validate='many_to_one',\n",
    "            )\n",
    "\n",
    "# A summary row without a numeric id cannot be matched to any point; drop it\n",
    "# with a notice rather than failing on the integer cast.\n",
    "clusters['cluster_id'] = pd.to_numeric(clusters['cluster_id'], errors='coerce')\n",
    "invalid_cluster_rows = clusters['cluster_id'].isna()\n",
    "if invalid_cluster_rows.any():\n",
    "    print(f'cluster summary: dropped {int(invalid_cluster_rows.sum()):,} row(s) with a non-numeric cluster_id')\n",
    "    clusters = clusters.loc[~invalid_cluster_rows].copy()\n",
    "clusters['cluster_id'] = clusters['cluster_id'].astype(int)\n",
    "# Rank 1 is the largest cluster; ties are broken by the smaller p90 radius.\n",
    "clusters = clusters.sort_values(['point_count', 'radius_km_p90'], ascending=[False, True]).reset_index(drop=True)\n",
    "clusters['cluster_rank'] = clusters.index + 1\n",
    "\n",
    "huc8_geojson = None\n",
    "if HUC8_GEOJSON.exists():\n",
    "    # GeoJSON is UTF-8 by specification; do not rely on the platform default.\n",
    "    huc8_geojson = json.loads(HUC8_GEOJSON.read_text(encoding='utf-8'))\n",
    "\n",
    "n_clusters = points.loc[points['cluster_id'].ne(-1), 'cluster_id'].nunique()\n",
    "print(f'Loaded {len(points):,} points and {n_clusters:,} clusters')\n",
    "print('point context columns:', 0 if point_context.empty else len(point_context.columns))\n",
    "print('HUC8 features:', 0 if huc8_geojson is None else len(huc8_geojson.get('features', [])))\n",
    "if not state_energy.empty:\n",
    "    seds_available = state_energy['seds_series_count'].notna().sum() if 'seds_series_count' in state_energy.columns else 0\n",
    "    print(f'state energy rows: {len(state_energy):,}; SEDS rows represented: {seds_available:,}')\n",
    "elif STATE_ENERGY_CSV.exists():\n",
    "    print('state energy context file has no rows')\n",
    "else:\n",
    "    print('state energy context file not found')\n",
    "display(points.head())\n",
"display(clusters.head(10))\n",
    "if not state_energy.empty:\n",
    "    display(state_energy.head(10))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6",
   "metadata": {},
   "source": [
    "## Map Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Palette cycled by cluster rank; clusters beyond its length reuse colors.\n",
    "CLUSTER_COLORS = [\n",
    "    '#2563eb', '#dc2626', '#16a34a', '#9333ea', '#ea580c', '#0891b2',\n",
    "    '#be123c', '#4f46e5', '#65a30d', '#c026d3', '#0f766e', '#b45309',\n",
    "]\n",
    "NOISE_COLOR = '#9ca3af'\n",
    "CENTROID_COLOR = '#111827'\n",
    "STATE_ENERGY_COLOR = '#f59e0b'\n",
    "\n",
    "# Cluster summary rows keyed by cluster_id, for lookups in the helpers below.\n",
    "cluster_info = clusters.set_index('cluster_id').to_dict('index')\n",
    "\n",
    "\n",
    "def clean_value(value):\n",
    "    \"\"\"Return ``value`` as HTML-escaped text, or '' when it is missing.\"\"\"\n",
    "    if pd.isna(value):\n",
    "        return ''\n",
    "    return escape(str(value))\n",
    "\n",
    "\n",
    "def fmt_number(value, decimals=0, prefix='', suffix=''):\n",
    "    \"\"\"Format ``value`` with thousands separators for display.\n",
    "\n",
    "    Args:\n",
    "        value: Number, or anything ``float()`` accepts.\n",
    "        decimals: Digits kept after the decimal point.\n",
    "        prefix: Text placed before the number (inserted as given, not escaped).\n",
    "        suffix: Text placed after the number (inserted as given, not escaped).\n",
    "\n",
    "    Returns:\n",
    "        The formatted number; '' for missing values; the HTML-escaped text of\n",
    "        ``value`` when it cannot be converted to a float.\n",
    "    \"\"\"\n",
    "    if pd.isna(value):\n",
    "        return ''\n",
    "    try:\n",
    "        number = float(value)\n",
    "    except (TypeError, ValueError):\n",
    "        return clean_value(value)\n",
    "    # Text such as 'nan' passes the first check but still converts to NaN;\n",
    "    # treat it as missing instead of rendering the word 'nan'.\n",
    "    if pd.isna(number):\n",
    "        return ''\n",
    "    return f'{prefix}{number:,.{decimals}f}{suffix}'\n",
    "\n",
    "\n",
    "def cluster_color(cluster_id):\n",
    "    \"\"\"Return the marker color for a cluster id.\n",
    "\n",
    "    Noise (-1) gets ``NOISE_COLOR``. Other ids are colored by their\n",
    "    ``cluster_rank`` in ``cluster_info``; ids missing from the summary fall\n",
    "    back to ``cluster_id + 1`` as the rank.\n",
    "    \"\"\"\n",
    "    if cluster_id == -1:\n",
    "        return NOISE_COLOR\n",
    "    info = cluster_info.get(cluster_id, {})\n",
    "    rank = int(info.get('cluster_rank', cluster_id + 1))\n",
    "    return CLUSTER_COLORS[(rank - 1) % len(CLUSTER_COLORS)]\n",
    "\n",
    "\n",
    "def cluster_label_and_size(cluster_id):\n",
    "    \"\"\"Return ``(label, size_text, rank_text)`` describing a cluster.\n",
    "\n",
    "    Noise (-1) is reported as a single isolated point with an empty rank.\n",
    "    Ids missing from the summary report a size of 0.\n",
    "    \"\"\"\n",
    "    if cluster_id == -1:\n",
    "        return 'Noise / isolated', '1', ''\n",
    "    info = cluster_info.get(cluster_id, {})\n",
    "    rank = int(info.get('cluster_rank', cluster_id + 1))\n",
    "    point_count = int(info.get('point_count', 0))\n",
    "    # Ranks are positions in the cluster summary, so the total must be the\n",
    "    # number of summary rows; a count derived from the loaded points shrinks\n",
    "    # when MAX_POINTS truncates them and could be smaller than the rank.\n",
    "    ranked_total = len(cluster_info)\n",
    "    return f'Cluster ID {cluster_id}', f'{point_count:,}', f'Rank {rank} of {ranked_total} by size'\n",
    "\n",
    "\n",
    "def point_popup(row):\n",
    " cluster_label, cluster_size, cluster_rank = cluster_label_and_size(row.cluster_id)\n",
    " nearest = row.nearest_neighbor_km\n",
    " nearest_text = f'{nearest:.2f} km' if pd.notna(nearest) else ''\n",
    " title = clean_value(row.name) or clean_value(row.master_id)\n",
    "\n",
    " 
huc8_lines = ''\n", " if hasattr(row, 'huc8') and pd.notna(row.huc8):\n", " huc8_lines = f'''\n", "