updated map and cluster analysis

This commit is contained in:
2026-05-18 08:50:22 -07:00
parent aef9d99e18
commit dda6490023
9 changed files with 56139 additions and 55950 deletions

View File

@@ -302,6 +302,7 @@
"# LEFT JOIN master_data_center_spatial_clusters (c) ON master_id\n",
"# LEFT JOIN _dc_census_tract_acs_2024 (acs) ON m.geoid = acs.geoid\n",
"# LEFT JOIN ruca_codes_2020_tract (ruca) ON m.geoid = ruca.tract_fips_20\n",
"# LEFT JOIN watershed_huc8 (w) ON ST_Contains(w.geom, m.geom)\n",
"# LEFT JOIN (EIA operating generators within RADIUS_KM, latest period) aggregated per DC\n",
"#\n",
"# Energy aggregation: latest period, status='OP', sum of nameplate_capacity_mw\n",
@@ -357,6 +358,8 @@
" ruca.urban_core, ruca.urban_core_type,\n",
" ruca.pop_density as tract_pop_density,\n",
" ruca.land_area as tract_land_area_sqmi,\n",
" w.huc8, w.name as huc8_name, w.states as huc8_states,\n",
" w.areasqkm as huc8_area_sqkm,\n",
" coalesce(en.eia_gen_count, 0) as eia_gen_count,\n",
" coalesce(en.eia_plant_count, 0) as eia_plant_count,\n",
" coalesce(en.eia_capacity_mw, 0) as eia_capacity_mw,\n",
@@ -371,6 +374,7 @@
"left join public.master_data_center_spatial_clusters c on c.master_id = m.master_id\n",
"left join public._dc_census_tract_acs_2024 acs on acs.geoid = m.geoid\n",
"left join public.ruca_codes_2020_tract ruca on ruca.tract_fips_20 = m.geoid\n",
"left join public.watershed_huc8 w on m.geom is not null and st_contains(w.geom, m.geom)\n",
"left join energy_nearby en on en.master_id = m.master_id\n",
"\"\"\"\n",
"\n",
@@ -381,6 +385,7 @@
"print('non-null geoid: ', joined_df['geoid'].notna().sum())\n",
"print('non-null cluster_id: ', joined_df['cluster_id'].notna().sum())\n",
"print('non-null primary_ruca: ', joined_df['primary_ruca'].notna().sum())\n",
"print('non-null huc8: ', joined_df['huc8'].notna().sum())\n",
"print('DCs with >=1 nearby gen: ', (joined_df['eia_gen_count'] > 0).sum())\n",
"print(f\"median nearby capacity: {joined_df['eia_capacity_mw'].median():,.0f} MW\")\n",
"print(f\" 90th percentile: {joined_df['eia_capacity_mw'].quantile(0.9):,.0f} MW\")\n",
@@ -815,6 +820,102 @@
"print(\"Non-metro DCs by operator group, sized by aggregate nearby grid capacity:\")\n",
"hyperscaler_view\n"
]
},
{
"cell_type": "markdown",
"id": "21",
"metadata": {},
"source": [
"## HUC8 watershed concentration\n",
"\n",
"Each DC with a geocoded location is matched to the HUC8 watershed whose polygon contains it (USGS 8-digit hydrologic unit, roughly subbasin scale, median area ~3,000 sq km); DCs with no geometry or no containing polygon have a null `huc8` and are excluded from the tables below. Watershed concentration matters because cooling water draw and wastewater discharge happen at the watershed scale, not the state scale — and a single watershed feeds finite reservoirs and aquifers.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22",
"metadata": {},
"outputs": [],
"source": [
"# Top watersheds by DC count, with demographic + energy context.\n",
"# Only DCs with a matched HUC8 are summarised; rows with a null huc8 are left out.\n",
"huc8_matched = joined_df[joined_df['huc8'].notna()]\n",
"watershed_summary = (\n",
"    huc8_matched\n",
"    .groupby(['huc8', 'huc8_name', 'huc8_states'], dropna=False)\n",
"    .agg(\n",
"        dcs=('master_id', 'count'),\n",
"        states_in_dc=('state', lambda s: ','.join(sorted(s.dropna().unique()))),\n",
"        operators=('operator', lambda s: s.dropna().nunique()),\n",
"        med_pop_density=('tract_pop_density', 'median'),\n",
"        sum_eia_capacity_mw=('eia_capacity_mw', 'sum'),\n",
"        med_eia_capacity_mw=('eia_capacity_mw', 'median'),\n",
"        huc8_area_sqkm=('huc8_area_sqkm', 'first'),\n",
"    )\n",
"    .reset_index()\n",
"    .sort_values('dcs', ascending=False)\n",
")\n",
"# Derive density from the unrounded area first, then round the display columns,\n",
"# so the whole-number rounding below cannot distort the DCs-per-area figure.\n",
"dcs_per_1000_sqkm = (1000 * watershed_summary['dcs'] /\n",
"                     watershed_summary['huc8_area_sqkm']).round(2)\n",
"watershed_summary = watershed_summary.round(0)\n",
"watershed_summary['dcs_per_1000_sqkm'] = dcs_per_1000_sqkm\n",
"\n",
"# The first figure is a count of DCs (rows with a HUC8), not a count of watersheds.\n",
"print(f\"DCs matched to a HUC8 watershed: {len(huc8_matched):,}\")\n",
"print(f\"Distinct HUC8s holding DCs: {watershed_summary['huc8'].nunique():,}\\n\")\n",
"print(\"Top 15 watersheds by DC count:\")\n",
"watershed_summary.head(15).reset_index(drop=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "23",
"metadata": {},
"outputs": [],
"source": [
"# Concentration: what share of HUC8-matched DCs sit in the top N watersheds?\n",
"# (A water-scarcity concentration story — if a single basin is stressed, how\n",
"# much of the national DC footprint feels it?)\n",
"# NOTE: the denominator is the sum of watershed_summary['dcs'], i.e. only DCs\n",
"# that were matched to a HUC8, so the percentages are labelled accordingly.\n",
"TOTAL_US_HUC8 = 2139  # total US HUC8 watersheds, used for the coverage fraction below\n",
"\n",
"ws_sorted = (watershed_summary.sort_values('dcs', ascending=False)\n",
"             .reset_index(drop=True))\n",
"ws_sorted['cumulative_dcs'] = ws_sorted['dcs'].cumsum()\n",
"total_dcs = int(ws_sorted['dcs'].sum())\n",
"ws_sorted['cumulative_pct'] = (100 * ws_sorted['cumulative_dcs'] / total_dcs).round(1)\n",
"\n",
"print('Watershed concentration of US data centers:')\n",
"for n in [1, 2, 3, 5, 10, 15, 20, 30, 50, 100]:\n",
"    # head(n) simply returns every row when fewer than n watersheds exist.\n",
"    cum = int(ws_sorted.head(n)['dcs'].sum())\n",
"    print(f' Top {n:3d} watersheds: {cum:>5,} DCs ({100*cum/total_dcs:5.1f}% of HUC8-matched DCs)')\n",
"print(f'\\nTotal HUC8s with at least one DC: {len(ws_sorted)}')\n",
"print(f'Total US HUC8 watersheds: {TOTAL_US_HUC8:,}')\n",
"print(f'Fraction touched by any DC: {100*len(ws_sorted)/TOTAL_US_HUC8:.1f}%')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24",
"metadata": {},
"outputs": [],
"source": [
"# Non-metro watershed concentration: where are the hyperscale rural builds clustered\n",
"# at the watershed scale?\n",
"def _top_operators(ops, k=4):\n",
"    \"\"\"Return the k most frequent operators in `ops` as a comma-joined string.\n",
"\n",
"    Nulls are ignored; ties in frequency are broken alphabetically so the\n",
"    result is deterministic across runs.\n",
"    \"\"\"\n",
"    counts = ops.dropna().value_counts()\n",
"    ranked = sorted(counts.items(), key=lambda kv: (-kv[1], str(kv[0])))\n",
"    return ','.join(str(name) for name, _ in ranked[:k])\n",
"\n",
"\n",
"# RUCA primary codes 4-10 are treated as non-metro here.\n",
"is_non_metro = joined_df['primary_ruca'].isin([4, 5, 6, 7, 8, 9, 10])\n",
"nm_ws = (\n",
"    joined_df[is_non_metro & joined_df['huc8'].notna()]\n",
"    # dropna=False keeps watersheds whose name/states attribute is null, matching\n",
"    # the grouping used for the all-DC watershed summary.\n",
"    .groupby(['huc8', 'huc8_name', 'huc8_states'], dropna=False)\n",
"    .agg(\n",
"        dcs=('master_id', 'count'),\n",
"        states_in_dc=('state', lambda s: ','.join(sorted(s.dropna().unique()))),\n",
"        top_operators=('operator', _top_operators),\n",
"        med_eia_capacity_mw=('eia_capacity_mw', 'median'),\n",
"        sum_eia_hydro_mw=('eia_capacity_hydro', 'sum'),\n",
"        sum_eia_wind_mw=('eia_capacity_wind', 'sum'),\n",
"    )\n",
"    .reset_index()\n",
"    .sort_values('dcs', ascending=False)\n",
"    .round(0)\n",
")\n",
"print('Top non-metro watersheds (RUCA 4-10):')\n",
"nm_ws.head(15).reset_index(drop=True)\n"
]
}
],
"metadata": {