Files
data-centers/enhanced_data_center_cluster_map.ipynb

528 lines
22 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "0",
"metadata": {},
"source": [
"# Enhanced Data Center Cluster Map\n",
"\n",
"This notebook starts from the spatial clustering outputs created by `spatial_clustering_master_data_centers.ipynb` and adds contextual layers from the demographic/RUCA/energy analysis.\n",
"\n",
"Current features:\n",
"- Loads point and cluster summary CSVs from `output/`.\n",
"- Recreates the cluster-colored Folium map.\n",
"- Enriches point popups with HUC8 watershed, RUCA, tract demographics, and state energy context where available.\n",
"- Adds separate layers for clustered points, isolated/noise points, cluster centroids, HUC8 watersheds, and state IM3 projected demand.\n",
"- Saves a standalone HTML map to `output/enhanced_master_data_center_spatial_clusters_map.html`.\n",
"\n",
"Notes from `output/data_center_demographic_ruca_energy_summary.md`:\n",
"- HUC8 watershed join is a recommended next step for water-context analysis.\n",
"- `im3_state_projected_moderate_50` is populated and used for state projected demand context.\n",
"- `seds_state_msn_year` is checked through the state context export, but it currently has no rows, so SEDS fields are blank until that table is populated.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"from html import escape\n",
"from pathlib import Path\n",
"\n",
"os.environ.setdefault('MPLCONFIGDIR', '/tmp/matplotlib')\n",
"Path(os.environ['MPLCONFIGDIR']).mkdir(parents=True, exist_ok=True)\n",
"\n",
"import pandas as pd\n",
"import folium\n",
"from folium import plugins\n",
"\n",
"print('pandas:', pd.__version__)\n",
"print('folium:', folium.__version__)\n"
]
},
{
"cell_type": "markdown",
"id": "2",
"metadata": {},
"source": [
"## Paths And Display Settings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3",
"metadata": {},
"outputs": [],
"source": [
"# Input files and the HTML output, all resolved under output/.\n",
"OUTPUT_DIR = Path('output')\n",
"POINTS_CSV = OUTPUT_DIR.joinpath('master_data_center_spatial_cluster_points.csv')\n",
"CLUSTERS_CSV = OUTPUT_DIR.joinpath('master_data_center_spatial_cluster_summary.csv')\n",
"POINT_CONTEXT_CSV = OUTPUT_DIR.joinpath('master_data_center_map_context.csv')\n",
"HUC8_GEOJSON = OUTPUT_DIR.joinpath('master_data_center_huc8_watersheds.geojson')\n",
"STATE_ENERGY_CSV = OUTPUT_DIR.joinpath('master_data_center_state_energy_context.csv')\n",
"MAP_HTML = OUTPUT_DIR.joinpath('enhanced_master_data_center_spatial_clusters_map.html')\n",
"\n",
"# Initial view and basemap handed to folium.Map.\n",
"MAP_CENTER = [39, -98]\n",
"MAP_ZOOM = 4\n",
"BASE_TILES = 'CartoDB positron'\n",
"\n",
"# Set to an integer to keep only the first N points; None keeps every point.\n",
"MAX_POINTS = None\n",
"\n",
"# Marker radii passed to folium.CircleMarker for each marker type.\n",
"CLUSTERED_RADIUS = 5\n",
"NOISE_RADIUS = 3\n",
"CENTROID_RADIUS = 7\n",
"\n",
"# Switches for the optional overlays.\n",
"SHOW_CENTROID_P90_CIRCLES = True\n",
"SHOW_HUC8_LAYER = True\n",
"SHOW_STATE_ENERGY_LAYER = True\n",
"\n",
"OUTPUT_DIR.mkdir(exist_ok=True)\n",
"\n",
"# Echo the configured locations so a reader can confirm what this run touches.\n",
"for label, location in (\n",
"    ('points', POINTS_CSV),\n",
"    ('clusters', CLUSTERS_CSV),\n",
"    ('point context', POINT_CONTEXT_CSV),\n",
"    ('HUC8 GeoJSON', HUC8_GEOJSON),\n",
"    ('state energy context', STATE_ENERGY_CSV),\n",
"    ('html output', MAP_HTML),\n",
"):\n",
"    print(f'{label}:', location)\n"
]
},
{
"cell_type": "markdown",
"id": "4",
"metadata": {},
"source": [
"## Load Cluster Outputs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5",
"metadata": {},
"outputs": [],
"source": [
"required_files = [POINTS_CSV, CLUSTERS_CSV]\n",
"missing = [str(p) for p in required_files if not p.exists()]\n",
"if missing:\n",
"    raise FileNotFoundError('Missing required cluster output CSV(s): ' + ', '.join(missing))\n",
"\n",
"# HUC8 codes are identifiers, not quantities: read them as text so a code that\n",
"# starts with 0 is not turned into a number. dtype entries for columns a file\n",
"# does not contain are ignored by read_csv.\n",
"ID_DTYPES = {'huc8': str}\n",
"\n",
"points = pd.read_csv(POINTS_CSV, dtype=ID_DTYPES)\n",
"clusters = pd.read_csv(CLUSTERS_CSV)\n",
"point_context = pd.read_csv(POINT_CONTEXT_CSV, dtype=ID_DTYPES) if POINT_CONTEXT_CSV.exists() else pd.DataFrame()\n",
"state_energy = pd.read_csv(STATE_ENERGY_CSV) if STATE_ENERGY_CSV.exists() else pd.DataFrame()\n",
"\n",
"# Fail here, with the file name, rather than deep inside the map-building loop.\n",
"required_columns = {\n",
"    POINTS_CSV: (points, [\n",
"        'master_id', 'cluster_id', 'name', 'operator', 'city', 'state', 'source',\n",
"        'latitude', 'longitude', 'nearest_neighbor_km',\n",
"    ]),\n",
"    CLUSTERS_CSV: (clusters, [\n",
"        'cluster_id', 'point_count', 'radius_km_p50', 'radius_km_p90', 'radius_km_max',\n",
"        'centroid_latitude', 'centroid_longitude', 'states', 'cities', 'operators',\n",
"    ]),\n",
"}\n",
"for csv_path, (frame, needed) in required_columns.items():\n",
"    absent = [c for c in needed if c not in frame.columns]\n",
"    if absent:\n",
"        raise KeyError(f'{csv_path} is missing required column(s): ' + ', '.join(absent))\n",
"\n",
"if MAX_POINTS is not None:\n",
"    points = points.head(MAX_POINTS).copy()\n",
"\n",
"# Unparseable or missing cluster ids are treated as noise (-1).\n",
"points['cluster_id'] = pd.to_numeric(points['cluster_id'], errors='coerce').fillna(-1).astype(int)\n",
"points['is_noise'] = points['cluster_id'].eq(-1)\n",
"points['is_clustered'] = ~points['is_noise']\n",
"points['name'] = points['name'].fillna('')\n",
"points['operator'] = points['operator'].fillna('Unknown').replace('', 'Unknown')\n",
"points['city'] = points['city'].fillna('Unknown').replace('', 'Unknown')\n",
"points['state'] = points['state'].fillna('UNK').replace('', 'UNK')\n",
"points['source'] = points['source'].fillna('unknown').replace('', 'unknown')\n",
"\n",
"if not point_context.empty:\n",
"    # Bring in only columns the points frame lacks: overlapping names would be\n",
"    # suffixed _x/_y by merge and break attribute access such as row.name.\n",
"    context_cols = [c for c in point_context.columns if c != 'master_id' and c not in points.columns]\n",
"    # One context row per point; duplicate keys would duplicate map markers.\n",
"    context_lookup = point_context[['master_id'] + context_cols].drop_duplicates(subset='master_id')\n",
"    points = points.merge(context_lookup, on='master_id', how='left')\n",
"\n",
"if not state_energy.empty:\n",
"    state_cols = [c for c in state_energy.columns if c != 'state_code' and c not in points.columns]\n",
"    # Join through a temporary key so a pre-existing state_code column cannot\n",
"    # collide; the key only repeats points['state'], so it is dropped afterwards.\n",
"    state_lookup = (\n",
"        state_energy[['state_code'] + state_cols]\n",
"        .drop_duplicates(subset='state_code')\n",
"        .rename(columns={'state_code': '_state_key'})\n",
"    )\n",
"    points = (\n",
"        points\n",
"        .merge(state_lookup, left_on='state', right_on='_state_key', how='left')\n",
"        .drop(columns='_state_key')\n",
"    )\n",
"\n",
"# Rows whose cluster_id cannot be parsed are dropped before the integer cast,\n",
"# which would otherwise raise on NaN. Rank 1 is the largest cluster; ties go\n",
"# to the smaller p90 radius.\n",
"clusters = (\n",
"    clusters\n",
"    .assign(cluster_id=pd.to_numeric(clusters['cluster_id'], errors='coerce'))\n",
"    .dropna(subset=['cluster_id'])\n",
"    .astype({'cluster_id': int})\n",
"    .sort_values(['point_count', 'radius_km_p90'], ascending=[False, True])\n",
"    .reset_index(drop=True)\n",
"    .assign(cluster_rank=lambda frame: frame.index + 1)\n",
")\n",
"\n",
"huc8_geojson = None\n",
"if HUC8_GEOJSON.exists():\n",
"    # GeoJSON is UTF-8 by specification; do not depend on the platform default.\n",
"    huc8_geojson = json.loads(HUC8_GEOJSON.read_text(encoding='utf-8'))\n",
"\n",
"n_clusters = points.loc[points['cluster_id'].ne(-1), 'cluster_id'].nunique()\n",
"print(f'Loaded {len(points):,} points and {n_clusters:,} clusters')\n",
"print('point context columns:', 0 if point_context.empty else len(point_context.columns))\n",
"print('HUC8 features:', 0 if huc8_geojson is None else len(huc8_geojson.get('features', [])))\n",
"if not state_energy.empty:\n",
"    seds_available = state_energy['seds_series_count'].notna().sum() if 'seds_series_count' in state_energy.columns else 0\n",
"    print(f'state energy rows: {len(state_energy):,}; SEDS rows represented: {seds_available:,}')\n",
"else:\n",
"    print('state energy context file not found or empty')\n",
"display(points.head())\n",
"display(clusters.head(10))\n",
"if not state_energy.empty:\n",
"    display(state_energy.head(10))\n"
]
},
{
"cell_type": "markdown",
"id": "6",
"metadata": {},
"source": [
"## Map Helpers"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7",
"metadata": {},
"outputs": [],
"source": [
"# Repeating palette for cluster markers; cluster_color() indexes it by rank.\n",
"CLUSTER_COLORS = [\n",
"    '#2563eb', '#dc2626', '#16a34a', '#9333ea',\n",
"    '#ea580c', '#0891b2', '#be123c', '#4f46e5',\n",
"    '#65a30d', '#c026d3', '#0f766e', '#b45309',\n",
"]\n",
"NOISE_COLOR = '#9ca3af'\n",
"CENTROID_COLOR = '#111827'\n",
"STATE_ENERGY_COLOR = '#f59e0b'\n",
"\n",
"# cluster_id -> {column: value} for every row of the cluster summary.\n",
"cluster_info = clusters.set_index('cluster_id').to_dict(orient='index')\n",
"\n",
"\n",
"def clean_value(value):\n",
"    \"\"\"Return ``value`` as HTML-escaped text, or '' when pandas treats it as missing.\"\"\"\n",
"    return '' if pd.isna(value) else escape(str(value))\n",
"\n",
"\n",
"def fmt_number(value, decimals=0, prefix='', suffix=''):\n",
"    \"\"\"Format ``value`` as a fixed-decimal number with thousands separators.\n",
"\n",
"    Missing values give ''. Values that cannot be converted to float are\n",
"    returned as escaped text via ``clean_value`` instead of raising.\n",
"    ``prefix`` and ``suffix`` are placed around the formatted number.\n",
"    \"\"\"\n",
"    if pd.isna(value):\n",
"        return ''\n",
"    try:\n",
"        number = float(value)\n",
"    except (TypeError, ValueError):\n",
"        return clean_value(value)\n",
"    formatted = format(number, f',.{decimals}f')\n",
"    return f'{prefix}{formatted}{suffix}'\n",
"\n",
"\n",
"def cluster_color(cluster_id):\n",
"    \"\"\"Return the marker colour for a cluster id.\n",
"\n",
"    Noise (-1) gets ``NOISE_COLOR``. Other ids use their ``cluster_rank`` from\n",
"    ``cluster_info`` (falling back to ``cluster_id + 1`` when the id is not in\n",
"    the summary) to pick a slot in ``CLUSTER_COLORS``, wrapping around.\n",
"    \"\"\"\n",
"    if cluster_id == -1:\n",
"        return NOISE_COLOR\n",
"    fallback_rank = cluster_id + 1\n",
"    rank = int(cluster_info.get(cluster_id, {}).get('cluster_rank', fallback_rank))\n",
"    palette_slot = (rank - 1) % len(CLUSTER_COLORS)\n",
"    return CLUSTER_COLORS[palette_slot]\n",
"\n",
"\n",
"def cluster_label_and_size(cluster_id):\n",
"    \"\"\"Return ``(label, size_text, rank_text)`` for a cluster id.\n",
"\n",
"    Noise (-1) is labelled 'Noise / isolated' with size '1' and an empty rank.\n",
"    Ids missing from ``cluster_info`` fall back to rank ``cluster_id + 1`` and\n",
"    a point count of 0.\n",
"    \"\"\"\n",
"    if cluster_id == -1:\n",
"        return ('Noise / isolated', '1', '')\n",
"    summary = cluster_info.get(cluster_id, {})\n",
"    rank = int(summary.get('cluster_rank', cluster_id + 1))\n",
"    size = int(summary.get('point_count', 0))\n",
"    label = f'Cluster ID {cluster_id}'\n",
"    size_text = f'{size:,}'\n",
"    rank_text = f'Rank {rank} of {n_clusters} by size'\n",
"    return label, size_text, rank_text\n",
"\n",
"\n",
"def point_popup(row):\n",
"    \"\"\"Build the popup for one data-center marker.\n",
"\n",
"    ``row`` is a namedtuple from ``DataFrame.itertuples``. Identity and cluster\n",
"    fields are read unconditionally. The watershed, RUCA/tract, and state energy\n",
"    sections are added only when their lead column (``huc8``, ``primary_ruca``,\n",
"    ``im3_projected_it_power_mw``) is present and non-null; other fields in a\n",
"    section are optional and render blank when the column is absent.\n",
"\n",
"    Returns:\n",
"        A ``folium.Popup`` containing the assembled HTML.\n",
"    \"\"\"\n",
"    def opt(field):\n",
"        # An absent optional column behaves like a missing value instead of raising.\n",
"        return getattr(row, field, None)\n",
"\n",
"    def year_text(value):\n",
"        # A year is a label, not a quantity: no thousands separator ('2022', not '2,022').\n",
"        if pd.isna(value):\n",
"            return ''\n",
"        try:\n",
"            return str(int(float(value)))\n",
"        except (TypeError, ValueError, OverflowError):\n",
"            return clean_value(value)\n",
"\n",
"    cluster_label, cluster_size, cluster_rank = cluster_label_and_size(row.cluster_id)\n",
"    nearest = row.nearest_neighbor_km\n",
"    nearest_text = f'{nearest:.2f} km' if pd.notna(nearest) else ''\n",
"    title = clean_value(row.name) or clean_value(row.master_id)\n",
"    # Noise points have no rank; omit the line rather than emit an empty <br>.\n",
"    rank_line = f'{cluster_rank}<br>' if cluster_rank else ''\n",
"\n",
"    huc8_lines = ''\n",
"    if pd.notna(opt('huc8')):\n",
"        huc8_lines = f'''\n",
"        <hr style=\"margin: 6px 0;\">\n",
"        <strong>Watershed</strong><br>\n",
"        HUC8: {clean_value(opt('huc8'))}<br>\n",
"        Name: {clean_value(opt('huc8_name'))}<br>\n",
"        States: {clean_value(opt('huc8_states'))}<br>\n",
"        '''\n",
"\n",
"    ruca_lines = ''\n",
"    if pd.notna(opt('primary_ruca')):\n",
"        ruca_lines = f'''\n",
"        <hr style=\"margin: 6px 0;\">\n",
"        <strong>RUCA / tract context</strong><br>\n",
"        RUCA band: {clean_value(opt('ruca_band'))}<br>\n",
"        RUCA code: {fmt_number(opt('primary_ruca'))}<br>\n",
"        {clean_value(opt('primary_ruca_description'))}<br>\n",
"        Median HH income: {fmt_number(opt('median_household_income'), prefix='$')}<br>\n",
"        Bachelor's+: {fmt_number(opt('bachelor_or_higher_pct'), 1, suffix='%')}<br>\n",
"        Poverty: {fmt_number(opt('poverty_rate'), 1, suffix='%')}<br>\n",
"        Non-Hispanic white: {fmt_number(opt('non_hispanic_white_pct'), 1, suffix='%')}<br>\n",
"        '''\n",
"\n",
"    energy_lines = ''\n",
"    if pd.notna(opt('im3_projected_it_power_mw')):\n",
"        if pd.notna(opt('seds_series_count')):\n",
"            seds_note = f\"SEDS year: {year_text(opt('seds_latest_year'))}; series: {fmt_number(opt('seds_series_count'))}<br>\"\n",
"        else:\n",
"            seds_note = 'SEDS context: unavailable in seds_state_msn_year<br>'\n",
"        energy_lines = f'''\n",
"        <hr style=\"margin: 6px 0;\">\n",
"        <strong>State energy demand context</strong><br>\n",
"        IM3 projected IT power: {fmt_number(opt('im3_projected_it_power_mw'), suffix=' MW')}<br>\n",
"        IM3 cooling water demand: {fmt_number(opt('im3_cooling_water_demand_mgy'), 1, suffix=' MGY')}<br>\n",
"        IM3 water consumption: {fmt_number(opt('im3_cooling_water_consumption_mgy'), 1, suffix=' MGY')}<br>\n",
"        IM3 avg siting score: {fmt_number(opt('im3_avg_weighted_siting_score'), 3)}<br>\n",
"        {seds_note}\n",
"        '''\n",
"\n",
"    return folium.Popup(f'''\n",
"    <div style=\"font-family: system-ui, sans-serif; min-width: 310px; max-width: 420px;\">\n",
"      <strong>{title}</strong><br>\n",
"      {clean_value(row.city)}, {clean_value(row.state)}<br>\n",
"      <hr style=\"margin: 6px 0;\">\n",
"      <strong>{cluster_label}</strong><br>\n",
"      {rank_line}\n",
"      Cluster size: {cluster_size} data center(s)<br>\n",
"      Source: {clean_value(row.source)}<br>\n",
"      Operator: {clean_value(row.operator)}<br>\n",
"      Nearest neighbor: {nearest_text}<br>\n",
"      Master ID: {clean_value(row.master_id)}\n",
"      {huc8_lines}\n",
"      {ruca_lines}\n",
"      {energy_lines}\n",
"    </div>\n",
"    ''', max_width=460)\n",
"\n",
"\n",
"def centroid_popup(row):\n",
"    \"\"\"Build the popup for one cluster centroid marker.\n",
"\n",
"    ``row`` is a namedtuple from iterating the cluster summary frame. A radius\n",
"    that is missing renders blank instead of 'nan km'.\n",
"\n",
"    Returns:\n",
"        A ``folium.Popup`` with the cluster's rank, size, radii, and the\n",
"        states / cities / operators text from the summary.\n",
"    \"\"\"\n",
"    def km(value):\n",
"        # Blank when missing, matching the nearest-neighbor text in point popups.\n",
"        return f'{value:.1f} km' if pd.notna(value) else ''\n",
"\n",
"    return folium.Popup(f'''\n",
"    <div style=\"font-family: system-ui, sans-serif; min-width: 280px;\">\n",
"      <strong>Cluster ID {int(row.cluster_id)}</strong><br>\n",
"      Rank {int(row.cluster_rank)} of {n_clusters} by size<br>\n",
"      <hr style=\"margin: 6px 0;\">\n",
"      Points: {int(row.point_count):,}<br>\n",
"      p50 radius: {km(row.radius_km_p50)}<br>\n",
"      p90 radius: {km(row.radius_km_p90)}<br>\n",
"      Max radius: {km(row.radius_km_max)}<br>\n",
"      States: {clean_value(row.states)}<br>\n",
"      Cities: {clean_value(row.cities)}<br>\n",
"      Operators: {clean_value(row.operators)}\n",
"    </div>\n",
"    ''', max_width=420)\n",
"\n",
"\n",
"def huc8_style(feature):\n",
"    \"\"\"Return the Leaflet style dict for one watershed feature.\n",
"\n",
"    The fill shade is chosen from the feature's ``data_center_count`` property\n",
"    (a missing or falsy count is treated as 0); outline and opacity are fixed.\n",
"    \"\"\"\n",
"    count = feature['properties'].get('data_center_count') or 0\n",
"    # (minimum count, fill) pairs, highest threshold first; the first match wins.\n",
"    shade_steps = (\n",
"        (100, '#075985'),\n",
"        (50, '#0284c7'),\n",
"        (20, '#38bdf8'),\n",
"        (10, '#7dd3fc'),\n",
"    )\n",
"    fill = '#bae6fd'\n",
"    for minimum, shade in shade_steps:\n",
"        if count >= minimum:\n",
"            fill = shade\n",
"            break\n",
"    return {'fillColor': fill, 'color': '#0369a1', 'weight': 1, 'fillOpacity': 0.22}\n",
"\n",
"\n",
"def huc8_popup(feature):\n",
"    \"\"\"Build a ``folium.Popup`` summarising one watershed feature's properties.\n",
"\n",
"    Every property is read with ``.get``, so an absent key renders blank.\n",
"    \"\"\"\n",
"    props = feature['properties']\n",
"    html = f'''\n",
"    <div style=\"font-family: system-ui, sans-serif; min-width: 280px;\">\n",
"      <strong>{clean_value(props.get('name'))}</strong><br>\n",
"      HUC8: {clean_value(props.get('huc8'))}<br>\n",
"      States: {clean_value(props.get('states'))}<br>\n",
"      <hr style=\"margin: 6px 0;\">\n",
"      Data centers: {fmt_number(props.get('data_center_count'))}<br>\n",
"      Clustered DCs: {fmt_number(props.get('clustered_data_center_count'))}<br>\n",
"      Distinct clusters: {fmt_number(props.get('cluster_count'))}<br>\n",
"      Area: {fmt_number(props.get('areasqkm'), 0, suffix=' sq km')}\n",
"    </div>\n",
"    '''\n",
"    return folium.Popup(html, max_width=360)\n",
"\n",
"\n",
"def state_energy_popup(row):\n",
"    \"\"\"Build the popup for one state energy-context marker.\n",
"\n",
"    ``row`` is a namedtuple from iterating the state energy frame. The SEDS\n",
"    line is shown only when ``seds_series_count`` is present and non-null;\n",
"    otherwise the popup says SEDS context is unavailable.\n",
"\n",
"    Returns:\n",
"        A ``folium.Popup`` with current data-center count and IM3 projections.\n",
"    \"\"\"\n",
"    def year_text(value):\n",
"        # A year is a label, not a quantity: no thousands separator ('2022', not '2,022').\n",
"        if pd.isna(value):\n",
"            return ''\n",
"        try:\n",
"            return str(int(float(value)))\n",
"        except (TypeError, ValueError, OverflowError):\n",
"            return clean_value(value)\n",
"\n",
"    if pd.notna(getattr(row, 'seds_series_count', None)):\n",
"        latest_year = year_text(getattr(row, 'seds_latest_year', None))\n",
"        seds_note = f\"SEDS latest year: {latest_year}; series: {fmt_number(row.seds_series_count)}\"\n",
"    else:\n",
"        seds_note = 'SEDS context: unavailable in seds_state_msn_year'\n",
"    return folium.Popup(f'''\n",
"    <div style=\"font-family: system-ui, sans-serif; min-width: 280px;\">\n",
"      <strong>{clean_value(row.state_code)} state energy context</strong><br>\n",
"      Current data centers: {fmt_number(row.current_data_center_count)}<br>\n",
"      <hr style=\"margin: 6px 0;\">\n",
"      IM3 projected sites: {fmt_number(row.im3_project_count)}<br>\n",
"      IM3 projected IT power: {fmt_number(row.im3_projected_it_power_mw, suffix=' MW')}<br>\n",
"      IM3 cooling water demand: {fmt_number(row.im3_cooling_water_demand_mgy, 1, suffix=' MGY')}<br>\n",
"      IM3 water consumption: {fmt_number(row.im3_cooling_water_consumption_mgy, 1, suffix=' MGY')}<br>\n",
"      IM3 avg siting score: {fmt_number(row.im3_avg_weighted_siting_score, 3)}<br>\n",
"      {seds_note}\n",
"    </div>\n",
"    ''', max_width=380)\n"
]
},
{
"cell_type": "markdown",
"id": "8",
"metadata": {},
"source": [
"## Build The Map"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9",
"metadata": {},
"outputs": [],
"source": [
"def build_cluster_map(points_df: pd.DataFrame, clusters_df: pd.DataFrame) -> folium.Map:\n",
"    \"\"\"Assemble the layered Folium map.\n",
"\n",
"    Layers: HUC8 watershed polygons and state energy markers (both hidden by\n",
"    default), clustered points, noise points, and cluster centroids with an\n",
"    optional p90-radius circle. The watershed and state layers read the\n",
"    module-level ``huc8_geojson`` and ``state_energy`` objects and honour the\n",
"    ``SHOW_*`` flags from the settings cell.\n",
"\n",
"    Args:\n",
"        points_df: One row per data center; rows without latitude/longitude\n",
"            are skipped.\n",
"        clusters_df: One row per cluster; rows without centroid coordinates\n",
"            are skipped.\n",
"\n",
"    Returns:\n",
"        The populated ``folium.Map`` with a layer control, fitted to the extent\n",
"        of the plotted points when there are any.\n",
"    \"\"\"\n",
"    m = folium.Map(location=MAP_CENTER, zoom_start=MAP_ZOOM, tiles=BASE_TILES, control_scale=True)\n",
"    plugins.Fullscreen(position='topleft').add_to(m)\n",
"    plugins.MeasureControl(position='topleft', primary_length_unit='kilometers').add_to(m)\n",
"    plugins.MiniMap(toggle_display=True, minimized=True).add_to(m)\n",
"\n",
"    huc8_layer = folium.FeatureGroup(name='HUC8 watersheds with data centers', show=False)\n",
"    state_energy_layer = folium.FeatureGroup(name='State energy demand context (IM3 / SEDS)', show=False)\n",
"    clustered_layer = folium.FeatureGroup(name='Data centers: clustered', show=True)\n",
"    noise_layer = folium.FeatureGroup(name='Data centers: noise / isolated', show=True)\n",
"    centroid_layer = folium.FeatureGroup(name='Cluster centroids and p90 radius', show=True)\n",
"\n",
"    # An empty feature list is treated like a missing file: the tooltip and\n",
"    # popup are validated against the first feature, so there must be one.\n",
"    huc8_features = [] if huc8_geojson is None else huc8_geojson.get('features', [])\n",
"    if SHOW_HUC8_LAYER and huc8_features:\n",
"        # folium.GeoJson attaches only GeoJsonPopup/Popup instances; a plain\n",
"        # callable is dropped without error. Declare the popup by field name,\n",
"        # offering only fields the first feature carries (folium checks them).\n",
"        first_props = huc8_features[0].get('properties') or {}\n",
"        popup_spec = [\n",
"            ('name', 'Watershed'),\n",
"            ('huc8', 'HUC8'),\n",
"            ('states', 'States'),\n",
"            ('data_center_count', 'Data centers'),\n",
"            ('clustered_data_center_count', 'Clustered DCs'),\n",
"            ('cluster_count', 'Distinct clusters'),\n",
"            ('areasqkm', 'Area (sq km)'),\n",
"        ]\n",
"        popup_fields = [field for field, _ in popup_spec if field in first_props]\n",
"        popup_aliases = [alias for field, alias in popup_spec if field in first_props]\n",
"        watershed_popup = None\n",
"        if popup_fields:\n",
"            watershed_popup = folium.GeoJsonPopup(\n",
"                fields=popup_fields,\n",
"                aliases=popup_aliases,\n",
"                localize=True,\n",
"                max_width=360,\n",
"            )\n",
"        folium.GeoJson(\n",
"            huc8_geojson,\n",
"            name='HUC8 watersheds with data centers',\n",
"            style_function=huc8_style,\n",
"            highlight_function=lambda feature: {'weight': 3, 'fillOpacity': 0.35},\n",
"            tooltip=folium.GeoJsonTooltip(\n",
"                fields=['name', 'huc8', 'data_center_count', 'cluster_count'],\n",
"                aliases=['HUC8', 'Code', 'Data centers', 'Clusters'],\n",
"                localize=True,\n",
"                sticky=False,\n",
"            ),\n",
"            popup=watershed_popup,\n",
"        ).add_to(huc8_layer)\n",
"\n",
"    if SHOW_STATE_ENERGY_LAYER and not state_energy.empty:\n",
"        for row in state_energy.dropna(subset=['map_latitude', 'map_longitude']).itertuples(index=False):\n",
"            power = row.im3_projected_it_power_mw\n",
"            if pd.isna(power):\n",
"                radius = 6\n",
"            else:\n",
"                # Square-root scaling clamped to 6..28; negative values are\n",
"                # floored at 0 so the root stays real.\n",
"                radius = max(6, min(28, 4 + max(float(power), 0.0) ** 0.5 / 2.4))\n",
"            folium.CircleMarker(\n",
"                location=[row.map_latitude, row.map_longitude],\n",
"                radius=radius,\n",
"                color='#92400e',\n",
"                fill=True,\n",
"                fill_color=STATE_ENERGY_COLOR,\n",
"                fill_opacity=0.55,\n",
"                weight=1.5,\n",
"                popup=state_energy_popup(row),\n",
"                tooltip=f'{row.state_code}: IM3 {fmt_number(power, suffix=\" MW\")}',\n",
"            ).add_to(state_energy_layer)\n",
"\n",
"    # Markers cannot be placed without coordinates; skip such rows, as the\n",
"    # state layer above already does.\n",
"    located_points = points_df.dropna(subset=['latitude', 'longitude'])\n",
"    for row in located_points.itertuples(index=False):\n",
"        is_noise = row.cluster_id == -1\n",
"        cluster_label, cluster_size, _ = cluster_label_and_size(row.cluster_id)\n",
"        marker = folium.CircleMarker(\n",
"            location=[row.latitude, row.longitude],\n",
"            radius=NOISE_RADIUS if is_noise else CLUSTERED_RADIUS,\n",
"            color=cluster_color(row.cluster_id),\n",
"            fill=True,\n",
"            fill_opacity=0.75,\n",
"            weight=1,\n",
"            popup=point_popup(row),\n",
"            tooltip=f'{cluster_label}; size={cluster_size}',\n",
"        )\n",
"        marker.add_to(noise_layer if is_noise else clustered_layer)\n",
"\n",
"    located_clusters = clusters_df.dropna(subset=['centroid_latitude', 'centroid_longitude'])\n",
"    for row in located_clusters.itertuples(index=False):\n",
"        color = cluster_color(int(row.cluster_id))\n",
"        location = [row.centroid_latitude, row.centroid_longitude]\n",
"        if SHOW_CENTROID_P90_CIRCLES and pd.notna(row.radius_km_p90):\n",
"            folium.Circle(\n",
"                location=location,\n",
"                radius=float(row.radius_km_p90) * 1000,  # km -> metres\n",
"                color=color,\n",
"                fill=False,\n",
"                weight=1,\n",
"                opacity=0.45,\n",
"            ).add_to(centroid_layer)\n",
"        folium.CircleMarker(\n",
"            location=location,\n",
"            radius=CENTROID_RADIUS,\n",
"            color=CENTROID_COLOR,\n",
"            fill=True,\n",
"            fill_color=color,\n",
"            fill_opacity=0.95,\n",
"            weight=2,\n",
"            popup=centroid_popup(row),\n",
"            tooltip=f'Cluster {int(row.cluster_id)} centroid; {int(row.point_count):,} points',\n",
"        ).add_to(centroid_layer)\n",
"\n",
"    huc8_layer.add_to(m)\n",
"    state_energy_layer.add_to(m)\n",
"    clustered_layer.add_to(m)\n",
"    noise_layer.add_to(m)\n",
"    centroid_layer.add_to(m)\n",
"    folium.LayerControl(collapsed=False).add_to(m)\n",
"\n",
"    if not located_points.empty:\n",
"        # Two corners describe the same extent as listing every point, without\n",
"        # embedding the full coordinate list in the exported HTML.\n",
"        south_west = [float(located_points['latitude'].min()), float(located_points['longitude'].min())]\n",
"        north_east = [float(located_points['latitude'].max()), float(located_points['longitude'].max())]\n",
"        m.fit_bounds([south_west, north_east], padding=(20, 20))\n",
"    return m\n",
"\n",
"\n",
"cluster_map = build_cluster_map(points, clusters)\n",
"cluster_map\n"
]
},
{
"cell_type": "markdown",
"id": "10",
"metadata": {},
"source": [
"## Export HTML"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11",
"metadata": {},
"outputs": [],
"source": [
"# Pass a plain string path: some older branca releases open only str/bytes\n",
"# paths and treat any other object as an already-open file.\n",
"cluster_map.save(str(MAP_HTML))\n",
"print('Wrote:', MAP_HTML.resolve())"
]
},
{
"cell_type": "markdown",
"id": "12",
"metadata": {},
"source": [
"## Feature Staging Area\n",
"\n",
"Staging area for the next features to add to this map. Good candidates:\n",
"- filters by source/operator/state/cluster size\n",
"- toggle layers for top-N clusters\n",
"- water-stress overlays on top of the HUC8 layer\n",
"- generator capacity / fuel mix overlays around each DC\n",
"- opposition cases overlay\n",
"- cluster labels or summary panels\n",
"- downloadable GeoJSON exports\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}