Files
data-centers/cluster_analysis.ipynb
dadams eccfbdbad9 Add data center demographic, RUCA, and energy capacity analysis
Adds three coordinated changes:

- Request nameplate, summer, and winter capacity from the EIA
  operating-generator-capacity endpoint and project them as typed columns
  on energy_eia_operating_generator_capacity_flat. The original ingest
  only pulled latitude and longitude, leaving the flat table with no MW
  values despite its name.
- New cluster_analysis.ipynb joins master_data_centers to ACS-2024
  demographics, USDA RUCA-2020 codes (loaded from new/), and EIA
  generation capacity within 50 km of each site.
- Summary doc consolidates the headline findings: DC tracts skew higher
  income / more educated / more racially diverse than US average, the
  metro over-index is only 1.11x, the non-metro tail is dominated by
  hyperscalers in the Columbia River corridor (OR+WA = 66% of non-metro
  DCs), and Microsoft co-locates with Palo Verde Nuclear in Goodyear AZ.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-18 08:14:57 -07:00

842 lines
34 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"id": "0",
"metadata": {},
"source": [
"# Clustering Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"import psycopg2\n",
"\n",
"\n",
"def load_env_file(env_path: str = '.env') -> None:\n",
" p = Path(env_path)\n",
" if not p.exists():\n",
" print(f'No {env_path} file found in {Path.cwd()}')\n",
" return\n",
" loaded = 0\n",
" for raw_line in p.read_text(encoding='utf-8').splitlines():\n",
" line = raw_line.strip()\n",
" if not line or line.startswith('#') or '=' not in line:\n",
" continue\n",
" key, value = line.split('=', 1)\n",
" key = key.strip()\n",
" value = value.strip().strip('\"').strip(\"'\")\n",
" if key and key not in os.environ:\n",
" os.environ[key] = value\n",
" loaded += 1\n",
" print(f'Loaded {loaded} env var(s) from {env_path}')\n",
"\n",
"\n",
"def require_env(keys):\n",
" missing = [k for k in keys if not os.getenv(k)]\n",
" if missing:\n",
" raise EnvironmentError(\n",
" 'Missing required env vars: ' + ', '.join(missing) +\n",
" '.\\nSet them in this notebook, or add them to a .env file.'\n",
" )\n",
"\n",
"\n",
"load_env_file('.env')\n",
"\n",
"required_keys = ['PGWEB_HOST', 'PGWEB_PORT', 'PGWEB_USER', 'PGWEB_PASSWORD']\n",
"require_env(required_keys)\n",
"\n",
"DB_NAME = os.getenv('PGDATABASE', 'data_centers')\n",
"\n",
"\n",
"def get_conn():\n",
" return psycopg2.connect(\n",
" host=os.environ['PGWEB_HOST'],\n",
" port=os.environ['PGWEB_PORT'],\n",
" user=os.environ['PGWEB_USER'],\n",
" password=os.environ['PGWEB_PASSWORD'],\n",
" dbname='data_centers',\n",
" )\n",
"\n",
"\n",
"with get_conn() as conn:\n",
" with conn.cursor() as cur:\n",
" cur.execute('select current_database(), current_user, version()')\n",
" db, usr, ver = cur.fetchone()\n",
" print('Connected to DB:', db)\n",
" print('As user:', usr)\n",
" print('Postgres:', ver.split(',')[0])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2",
"metadata": {},
"outputs": [],
"source": [
"# List tables in the database (user schemas only, excluding system + PostGIS internals).\n",
"TABLES_SQL = \"\"\"\n",
"select\n",
" table_schema,\n",
" table_name,\n",
" table_type\n",
"from information_schema.tables\n",
"where table_schema not in ('pg_catalog', 'information_schema', 'tiger', 'tiger_data', 'topology')\n",
"order by table_schema, table_name\n",
"\"\"\"\n",
"\n",
"with get_conn() as conn:\n",
" tables_df = pd.read_sql(TABLES_SQL, conn)\n",
"\n",
"print(f'{len(tables_df):,} tables/views found')\n",
"tables_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3",
"metadata": {},
"outputs": [],
"source": [
"# Inspect columns for the tables we want to join.\n",
"INSPECT_TABLES = [\n",
" 'master_data_centers',\n",
" 'master_data_center_spatial_clusters',\n",
" 'data_center_census_tracts_2024',\n",
" '_dc_census_tract_acs_2024',\n",
" 'energy_eia_operating_generator_capacity_flat',\n",
"]\n",
"\n",
"COLS_SQL = \"\"\"\n",
"select table_name, column_name, data_type\n",
"from information_schema.columns\n",
"where table_schema = 'public' and table_name = any(%s)\n",
"order by table_name, ordinal_position\n",
"\"\"\"\n",
"\n",
"with get_conn() as conn:\n",
" cols_df = pd.read_sql(COLS_SQL, conn, params=(INSPECT_TABLES,))\n",
"\n",
"for t in INSPECT_TABLES:\n",
" sub = cols_df[cols_df['table_name'] == t]\n",
" print(f'\\n=== {t} ({len(sub)} cols) ===')\n",
" print(sub[['column_name', 'data_type']].to_string(index=False))\n"
]
},
{
"cell_type": "markdown",
"id": "4",
"metadata": {},
"source": [
"## Ingest RUCA codes\n",
"\n",
"USDA Rural-Urban Commuting Area (RUCA) codes classify each census tract on a 110 scale from \"Metropolitan area core\" (1) to \"Rural area\" (10), based on population density and commuting flows. Source file: `new/RUCA-codes-2020-tract.csv` (~85K tracts).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5",
"metadata": {},
"outputs": [],
"source": [
"# Push RUCA codes CSV -> public.ruca_codes_2020_tract (idempotent: drops + recreates).\n",
"from psycopg2.extras import execute_values\n",
"\n",
"RUCA_CSV = Path('new/RUCA-codes-2020-tract.csv')\n",
"RUCA_TABLE = 'public.ruca_codes_2020_tract'\n",
"\n",
"# Map source CSV columns -> snake_case DB columns.\n",
"COL_MAP = {\n",
" 'TractFIPS23': 'tract_fips_23',\n",
" 'CountyFIPS23': 'county_fips_23',\n",
" 'CountyCode23': 'county_code_23',\n",
" 'CountyName23': 'county_name_23',\n",
" 'TractFIPS20': 'tract_fips_20',\n",
" 'TractCode20': 'tract_code_20',\n",
" 'TractName20': 'tract_name_20',\n",
" 'CountyFIPS20': 'county_fips_20',\n",
" 'CountyCode20': 'county_code_20',\n",
" 'CountyName20': 'county_name_20',\n",
" 'StateFIPS20': 'state_fips_20',\n",
" 'StateName20': 'state_name_20',\n",
" 'UrbanAreaCode20': 'urban_area_code_20',\n",
" 'UrbanAreaName20': 'urban_area_name_20',\n",
" 'UrbanCore': 'urban_core',\n",
" 'UrbanCoreType': 'urban_core_type',\n",
" 'PrimaryRUCA': 'primary_ruca',\n",
" 'PrimaryRUCADescription': 'primary_ruca_description',\n",
" 'PrimaryDestinationCode': 'primary_destination_code',\n",
" 'PrimaryDestinationName': 'primary_destination_name',\n",
" 'SecondaryRUCA': 'secondary_ruca',\n",
" 'SecondaryRUCADescription': 'secondary_ruca_description',\n",
" 'SecondaryDestinationCode': 'secondary_destination_code',\n",
" 'SecondaryDestinationName': 'secondary_destination_name',\n",
" 'Population': 'population',\n",
" 'LandArea': 'land_area',\n",
" 'PopDensity': 'pop_density',\n",
"}\n",
"\n",
"# File is Latin-1 (has bytes like 0xf1 = ñ from Spanish place names).\n",
"fips_str_cols = [c for c in COL_MAP if 'FIPS' in c or 'Code' in c]\n",
"ruca_df = pd.read_csv(\n",
" RUCA_CSV,\n",
" dtype={c: str for c in fips_str_cols},\n",
" encoding='latin-1',\n",
")\n",
"ruca_df = ruca_df.rename(columns=COL_MAP)\n",
"print(f'CSV rows: {len(ruca_df):,} cols: {ruca_df.shape[1]}')\n",
"\n",
"# PK is tract_fips_20 (always populated). Some tracts that existed in 2020 are gone\n",
"# in 2023 (water-only tracts, dissolves), so tract_fips_23 can be null.\n",
"DDL = f\"\"\"\n",
"drop table if exists {RUCA_TABLE};\n",
"create table {RUCA_TABLE} (\n",
" tract_fips_23 text,\n",
" county_fips_23 text,\n",
" county_code_23 text,\n",
" county_name_23 text,\n",
" tract_fips_20 text primary key,\n",
" tract_code_20 text,\n",
" tract_name_20 text,\n",
" county_fips_20 text,\n",
" county_code_20 text,\n",
" county_name_20 text,\n",
" state_fips_20 text,\n",
" state_name_20 text,\n",
" urban_area_code_20 text,\n",
" urban_area_name_20 text,\n",
" urban_core smallint,\n",
" urban_core_type text,\n",
" primary_ruca smallint,\n",
" primary_ruca_description text,\n",
" primary_destination_code text,\n",
" primary_destination_name text,\n",
" secondary_ruca text,\n",
" secondary_ruca_description text,\n",
" secondary_destination_code text,\n",
" secondary_destination_name text,\n",
" population integer,\n",
" land_area double precision,\n",
" pop_density double precision\n",
");\n",
"create index ruca_codes_2020_tract_state_idx on {RUCA_TABLE} (state_fips_20);\n",
"create index ruca_codes_2020_tract_primary_ruca_idx on {RUCA_TABLE} (primary_ruca);\n",
"create index ruca_codes_2020_tract_fips_23_idx on {RUCA_TABLE} (tract_fips_23);\n",
"\"\"\"\n",
"\n",
"cols = list(COL_MAP.values())\n",
"records = [tuple(None if pd.isna(v) else v for v in row) for row in ruca_df[cols].itertuples(index=False, name=None)]\n",
"\n",
"with get_conn() as conn:\n",
" with conn.cursor() as cur:\n",
" cur.execute(DDL)\n",
" execute_values(\n",
" cur,\n",
" f\"insert into {RUCA_TABLE} ({', '.join(cols)}) values %s\",\n",
" records,\n",
" page_size=2000,\n",
" )\n",
" cur.execute(f'select count(*) from {RUCA_TABLE}')\n",
" print(f'{RUCA_TABLE}: {cur.fetchone()[0]:,} rows')\n",
" cur.execute(f\"\"\"\n",
" select primary_ruca, count(*) as n\n",
" from {RUCA_TABLE}\n",
" group by 1 order by 1\n",
" \"\"\")\n",
" print('\\nRUCA distribution (all US tracts):')\n",
" for ruca, n in cur.fetchall():\n",
" print(f' {ruca}: {n:>6,}')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6",
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check EIA generator coordinate ranges.\n",
"# Prior note: EIA source data had a longitude sign error. Verify before spatial joining.\n",
"EIA_COORD_SQL = \"\"\"\n",
"select\n",
" min(longitude) as lon_min, max(longitude) as lon_max,\n",
" min(latitude) as lat_min, max(latitude) as lat_max,\n",
" count(*) filter (where longitude > 0) as pos_lon_rows,\n",
" count(*) filter (where longitude < 0) as neg_lon_rows,\n",
" count(*) as total_rows\n",
"from public.energy_eia_operating_generator_capacity_flat\n",
"where longitude is not null and latitude is not null\n",
"\"\"\"\n",
"\n",
"with get_conn() as conn:\n",
" eia_coord_df = pd.read_sql(EIA_COORD_SQL, conn)\n",
"\n",
"print(eia_coord_df.T)\n",
"# For US plants we expect longitude in roughly [-180, -65]. If pos_lon_rows is large,\n",
"# the sign-flip correction is still needed when spatial-joining.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7",
"metadata": {},
"outputs": [],
"source": [
"# Build the joined analysis dataset.\n",
"#\n",
"# Joins:\n",
"# master_data_centers (m)\n",
"# LEFT JOIN master_data_center_spatial_clusters (c) ON master_id\n",
"# LEFT JOIN _dc_census_tract_acs_2024 (acs) ON m.geoid = acs.geoid\n",
"# LEFT JOIN ruca_codes_2020_tract (ruca) ON m.geoid = ruca.tract_fips_20\n",
"# LEFT JOIN (EIA operating generators within RADIUS_KM, latest period) aggregated per DC\n",
"#\n",
"# Energy aggregation: latest period, status='OP', sum of nameplate_capacity_mw\n",
"# (and counts) within RADIUS_KM, broken out by fuel.\n",
"\n",
"RADIUS_KM = 50\n",
"\n",
"JOIN_SQL = f\"\"\"\n",
"with latest_period as (\n",
" select max(period) as period\n",
" from public.energy_eia_operating_generator_capacity_flat\n",
"),\n",
"eia_latest as (\n",
" select e.plant_id, e.generator_id, e.energy_source_code,\n",
" e.nameplate_capacity_mw, e.geom\n",
" from public.energy_eia_operating_generator_capacity_flat e\n",
" join latest_period lp on e.period = lp.period\n",
" where e.status = 'OP' and e.geom is not null\n",
"),\n",
"energy_nearby as (\n",
" select\n",
" m.master_id,\n",
" count(*) as eia_gen_count,\n",
" count(distinct plant_id) as eia_plant_count,\n",
" sum(nameplate_capacity_mw) as eia_capacity_mw,\n",
" sum(nameplate_capacity_mw) filter (where energy_source_code = 'NG') as eia_capacity_ng,\n",
" sum(nameplate_capacity_mw) filter (where energy_source_code in ('BIT','SUB','LIG','RC','ANT')) as eia_capacity_coal,\n",
" sum(nameplate_capacity_mw) filter (where energy_source_code = 'NUC') as eia_capacity_nuclear,\n",
" sum(nameplate_capacity_mw) filter (where energy_source_code = 'SUN') as eia_capacity_solar,\n",
" sum(nameplate_capacity_mw) filter (where energy_source_code = 'WND') as eia_capacity_wind,\n",
" sum(nameplate_capacity_mw) filter (where energy_source_code = 'WAT') as eia_capacity_hydro,\n",
" sum(nameplate_capacity_mw) filter (where energy_source_code = 'GEO') as eia_capacity_geothermal\n",
" from public.master_data_centers m\n",
" join eia_latest e\n",
" on st_dwithin(m.geom::geography, e.geom::geography, {RADIUS_KM} * 1000)\n",
" where m.geom is not null\n",
" group by m.master_id\n",
")\n",
"select\n",
" m.master_id, m.name, m.operator, m.city, m.state, m.country,\n",
" m.power_mw, m.area_sqft, m.longitude, m.latitude, m.geoid,\n",
" c.cluster_id, c.is_noise, c.nearest_neighbor_km,\n",
" acs.population, acs.median_age, acs.households, acs.avg_household_size,\n",
" acs.median_household_income, acs.per_capita_income,\n",
" acs.poverty_rate, acs.unemployment_rate,\n",
" acs.bachelor_or_higher_pct, acs.broadband_subscription_pct,\n",
" acs.hispanic_latino_pct, acs.hispanic_latino_population,\n",
" acs.non_hispanic_white_pct, acs.non_hispanic_white_population,\n",
" acs.non_hispanic_black_pct, acs.non_hispanic_black_population,\n",
" acs.non_hispanic_asian_pct, acs.non_hispanic_asian_population,\n",
" acs.primary_industry, acs.primary_industry_pct,\n",
" ruca.primary_ruca, ruca.primary_ruca_description,\n",
" ruca.urban_core, ruca.urban_core_type,\n",
" ruca.pop_density as tract_pop_density,\n",
" ruca.land_area as tract_land_area_sqmi,\n",
" coalesce(en.eia_gen_count, 0) as eia_gen_count,\n",
" coalesce(en.eia_plant_count, 0) as eia_plant_count,\n",
" coalesce(en.eia_capacity_mw, 0) as eia_capacity_mw,\n",
" coalesce(en.eia_capacity_ng, 0) as eia_capacity_ng,\n",
" coalesce(en.eia_capacity_coal, 0) as eia_capacity_coal,\n",
" coalesce(en.eia_capacity_nuclear, 0) as eia_capacity_nuclear,\n",
" coalesce(en.eia_capacity_solar, 0) as eia_capacity_solar,\n",
" coalesce(en.eia_capacity_wind, 0) as eia_capacity_wind,\n",
" coalesce(en.eia_capacity_hydro, 0) as eia_capacity_hydro,\n",
" coalesce(en.eia_capacity_geothermal, 0) as eia_capacity_geothermal\n",
"from public.master_data_centers m\n",
"left join public.master_data_center_spatial_clusters c on c.master_id = m.master_id\n",
"left join public._dc_census_tract_acs_2024 acs on acs.geoid = m.geoid\n",
"left join public.ruca_codes_2020_tract ruca on ruca.tract_fips_20 = m.geoid\n",
"left join energy_nearby en on en.master_id = m.master_id\n",
"\"\"\"\n",
"\n",
"with get_conn() as conn:\n",
" joined_df = pd.read_sql(JOIN_SQL, conn)\n",
"\n",
"print(f'rows: {len(joined_df):,} cols: {joined_df.shape[1]}')\n",
"print('non-null geoid: ', joined_df['geoid'].notna().sum())\n",
"print('non-null cluster_id: ', joined_df['cluster_id'].notna().sum())\n",
"print('non-null primary_ruca: ', joined_df['primary_ruca'].notna().sum())\n",
"print('DCs with >=1 nearby gen: ', (joined_df['eia_gen_count'] > 0).sum())\n",
"print(f\"median nearby capacity: {joined_df['eia_capacity_mw'].median():,.0f} MW\")\n",
"print(f\" 90th percentile: {joined_df['eia_capacity_mw'].quantile(0.9):,.0f} MW\")\n",
"print(f\" max: {joined_df['eia_capacity_mw'].max():,.0f} MW\")\n",
"joined_df.head()\n"
]
},
{
"cell_type": "markdown",
"id": "8",
"metadata": {},
"source": [
"## Quick demographic analysis\n",
"\n",
"The joined dataset has one row per data center, enriched with the demographics of its containing census tract. Note that multiple DCs can share a tract, so tract-level stats are weighted by DC count in these summaries (i.e. \"the average DC sits in a tract with...\" rather than \"the average DC-hosting tract has...\").\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9",
"metadata": {},
"outputs": [],
"source": [
"# Top-line demographic profile of the average DC's containing tract.\n",
"demo_cols = [\n",
" 'population', 'median_age', 'avg_household_size',\n",
" 'median_household_income', 'per_capita_income',\n",
" 'poverty_rate', 'unemployment_rate',\n",
" 'bachelor_or_higher_pct', 'broadband_subscription_pct',\n",
" 'hispanic_latino_pct', 'non_hispanic_white_pct',\n",
" 'non_hispanic_black_pct', 'non_hispanic_asian_pct',\n",
"]\n",
"demo_cols = [c for c in demo_cols if c in joined_df.columns]\n",
"\n",
"summary = joined_df[demo_cols].agg(['count', 'mean', 'median', 'std', 'min', 'max']).round(2).T\n",
"summary.columns = ['n', 'mean', 'median', 'std', 'min', 'max']\n",
"\n",
"# US national benchmarks (ACS 5-yr ~2024) for context\n",
"benchmarks = {\n",
" 'median_household_income': 78_538,\n",
" 'per_capita_income': 43_313,\n",
" 'poverty_rate': 12.4,\n",
" 'unemployment_rate': 5.4,\n",
" 'bachelor_or_higher_pct': 35.0,\n",
" 'broadband_subscription_pct': 89.0,\n",
" 'hispanic_latino_pct': 19.5,\n",
" 'non_hispanic_white_pct': 58.4,\n",
" 'non_hispanic_black_pct': 12.1,\n",
" 'non_hispanic_asian_pct': 6.4,\n",
"}\n",
"summary['us_avg'] = pd.Series(benchmarks).reindex(summary.index)\n",
"summary['vs_us'] = (summary['mean'] - summary['us_avg']).round(2)\n",
"summary\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10",
"metadata": {},
"outputs": [],
"source": [
"# Geographic concentration: where are the data centers, and what do those places look like?\n",
"state_summary = (\n",
" joined_df.groupby('state', dropna=False)\n",
" .agg(\n",
" dc_count=('master_id', 'count'),\n",
" avg_power_mw=('power_mw', 'mean'),\n",
" total_power_mw=('power_mw', 'sum'),\n",
" median_hh_income=('median_household_income', 'median'),\n",
" median_poverty=('poverty_rate', 'median'),\n",
" median_bachelor_pct=('bachelor_or_higher_pct', 'median'),\n",
" median_broadband_pct=('broadband_subscription_pct', 'median'),\n",
" median_pct_white=('non_hispanic_white_pct', 'median'),\n",
" median_pct_hispanic=('hispanic_latino_pct', 'median'),\n",
" median_pct_black=('non_hispanic_black_pct', 'median'),\n",
" )\n",
" .sort_values('dc_count', ascending=False)\n",
" .round(1)\n",
")\n",
"print(f'{joined_df[\"state\"].nunique()} states/territories represented')\n",
"state_summary.head(15)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11",
"metadata": {},
"outputs": [],
"source": [
"# Cluster vs. non-cluster: do DCs in spatial clusters sit in different demographic settings\n",
"# than isolated ones? (cluster_id is null/is_noise=True for unclustered DCs.)\n",
"joined_df['in_cluster'] = joined_df['cluster_id'].notna() & (joined_df['is_noise'] != True)\n",
"\n",
"compare_cols = [\n",
" 'median_household_income', 'per_capita_income',\n",
" 'poverty_rate', 'bachelor_or_higher_pct', 'broadband_subscription_pct',\n",
" 'non_hispanic_white_pct', 'hispanic_latino_pct', 'non_hispanic_black_pct',\n",
" 'population', 'eia_gen_count',\n",
"]\n",
"compare_cols = [c for c in compare_cols if c in joined_df.columns]\n",
"\n",
"cluster_compare = (\n",
" joined_df.groupby('in_cluster')[compare_cols]\n",
" .median()\n",
" .round(1)\n",
" .T\n",
" .rename(columns={False: 'isolated', True: 'in_cluster'})\n",
")\n",
"cluster_compare['delta'] = (cluster_compare['in_cluster'] - cluster_compare['isolated']).round(1)\n",
"print(f\"DCs in a cluster: {joined_df['in_cluster'].sum():,} isolated: {(~joined_df['in_cluster']).sum():,}\")\n",
"cluster_compare\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12",
"metadata": {},
"outputs": [],
"source": [
"# Quick visual sweep: distribution of key demographic features for DC tracts,\n",
"# with US-average reference lines for context.\n",
"import matplotlib.pyplot as plt\n",
"\n",
"panels = [\n",
" ('median_household_income', 78_538, 'Median household income (USD)'),\n",
" ('poverty_rate', 12.4, 'Poverty rate (%)'),\n",
" ('bachelor_or_higher_pct', 35.0, \"Bachelor's degree or higher (%)\"),\n",
" ('broadband_subscription_pct', 89.0, 'Broadband subscription (%)'),\n",
" ('non_hispanic_white_pct', 58.4, 'Non-Hispanic white (%)'),\n",
" ('hispanic_latino_pct', 19.5, 'Hispanic/Latino (%)'),\n",
"]\n",
"panels = [(c, b, lab) for c, b, lab in panels if c in joined_df.columns]\n",
"\n",
"fig, axes = plt.subplots(2, 3, figsize=(15, 8))\n",
"for ax, (col, bench, label) in zip(axes.ravel(), panels):\n",
" s = joined_df[col].dropna()\n",
" ax.hist(s, bins=40, color='steelblue', edgecolor='white', alpha=0.85)\n",
" ax.axvline(s.median(), color='darkorange', linestyle='-', lw=2, label=f'DC median = {s.median():.1f}')\n",
" ax.axvline(bench, color='firebrick', linestyle='--', lw=2, label=f'US avg = {bench}')\n",
" ax.set_title(label)\n",
" ax.legend(fontsize=8)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"id": "13",
"metadata": {},
"source": [
"## RUCA (urban / rural) analysis\n",
"\n",
"RUCA primary code key:\n",
"- **1**: Metropolitan area core\n",
"- **2**: Metropolitan area high commuting\n",
"- **3**: Metropolitan area low commuting\n",
"- **46**: Micropolitan area (small city + commuting tracts)\n",
"- **79**: Small town (core + commuting tracts)\n",
"- **10**: Rural area\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "14",
"metadata": {},
"outputs": [],
"source": [
"# DC distribution across RUCA codes vs. the national baseline of all US tracts.\n",
"ruca_buckets = {\n",
" 1: 'Metro core', 2: 'Metro high-commute', 3: 'Metro low-commute',\n",
" 4: 'Micro core', 5: 'Micro high-commute', 6: 'Micro low-commute',\n",
" 7: 'Small town core', 8: 'Small town high-commute', 9: 'Small town low-commute',\n",
" 10: 'Rural',\n",
"}\n",
"\n",
"def ruca_band(r):\n",
" if pd.isna(r): return 'Unknown'\n",
" r = int(r)\n",
" if r <= 3: return 'Metropolitan'\n",
" if r <= 6: return 'Micropolitan'\n",
" if r <= 9: return 'Small town'\n",
" return 'Rural'\n",
"\n",
"dc_ruca = joined_df.copy()\n",
"dc_ruca['ruca_label'] = dc_ruca['primary_ruca'].map(ruca_buckets)\n",
"dc_ruca['ruca_band'] = dc_ruca['primary_ruca'].apply(ruca_band)\n",
"\n",
"# National baseline (share of US tracts in each band).\n",
"NATIONAL_SQL = \"\"\"\n",
"select\n",
" case\n",
" when primary_ruca between 1 and 3 then 'Metropolitan'\n",
" when primary_ruca between 4 and 6 then 'Micropolitan'\n",
" when primary_ruca between 7 and 9 then 'Small town'\n",
" when primary_ruca = 10 then 'Rural'\n",
" else 'Unknown'\n",
" end as ruca_band,\n",
" count(*) as tracts\n",
"from public.ruca_codes_2020_tract\n",
"group by 1\n",
"\"\"\"\n",
"with get_conn() as conn:\n",
" national_df = pd.read_sql(NATIONAL_SQL, conn)\n",
"national_df['tracts_pct'] = (100 * national_df['tracts'] / national_df['tracts'].sum()).round(1)\n",
"\n",
"dc_by_band = (\n",
" dc_ruca.groupby('ruca_band').size().rename('dcs').to_frame()\n",
" .assign(dcs_pct=lambda d: (100 * d['dcs'] / d['dcs'].sum()).round(1))\n",
")\n",
"band_compare = dc_by_band.join(national_df.set_index('ruca_band')[['tracts', 'tracts_pct']])\n",
"band_compare['over_index'] = (band_compare['dcs_pct'] / band_compare['tracts_pct']).round(2)\n",
"print('Data centers vs. all US tracts, by RUCA band:')\n",
"print(band_compare.reindex(['Metropolitan', 'Micropolitan', 'Small town', 'Rural', 'Unknown']).fillna(0))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15",
"metadata": {},
"outputs": [],
"source": [
"# Fine-grained RUCA breakdown: DC count, median power, demographics, energy infra\n",
"# at each of the 10 RUCA codes.\n",
"ruca_profile = (\n",
" dc_ruca.groupby('primary_ruca', dropna=False)\n",
" .agg(\n",
" dcs=('master_id', 'count'),\n",
" median_power_mw=('power_mw', 'median'),\n",
" total_power_mw=('power_mw', 'sum'),\n",
" med_hh_income=('median_household_income', 'median'),\n",
" med_poverty=('poverty_rate', 'median'),\n",
" med_bachelor_pct=('bachelor_or_higher_pct', 'median'),\n",
" med_pct_white=('non_hispanic_white_pct', 'median'),\n",
" med_pct_black=('non_hispanic_black_pct', 'median'),\n",
" med_pct_hispanic=('hispanic_latino_pct', 'median'),\n",
" med_pop_density=('tract_pop_density', 'median'),\n",
" med_eia_gens_50km=('eia_gen_count', 'median'),\n",
" )\n",
" .round(1)\n",
")\n",
"ruca_profile.insert(0, 'description', ruca_profile.index.map(ruca_buckets))\n",
"print('Per-RUCA-code profile of data centers:')\n",
"ruca_profile\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16",
"metadata": {},
"outputs": [],
"source": [
"# Plot: DC count by RUCA band vs. national tract share.\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
"\n",
"order = ['Metropolitan', 'Micropolitan', 'Small town', 'Rural']\n",
"plot_df = band_compare.reindex(order).fillna(0)\n",
"\n",
"ax = axes[0]\n",
"x = range(len(plot_df))\n",
"width = 0.38\n",
"ax.bar([i - width/2 for i in x], plot_df['dcs_pct'], width, label='Data centers', color='steelblue')\n",
"ax.bar([i + width/2 for i in x], plot_df['tracts_pct'], width, label='All US tracts', color='lightgray', edgecolor='gray')\n",
"ax.set_xticks(list(x))\n",
"ax.set_xticklabels(plot_df.index, rotation=15)\n",
"ax.set_ylabel('% of total')\n",
"ax.set_title('DC share vs. national tract share, by RUCA band')\n",
"ax.legend()\n",
"\n",
"ax = axes[1]\n",
"colors = ['firebrick' if v > 1 else 'steelblue' for v in plot_df['over_index']]\n",
"ax.barh(plot_df.index, plot_df['over_index'], color=colors)\n",
"ax.axvline(1.0, color='black', linestyle='--', lw=1)\n",
"ax.set_xlabel('Over-index (1.0 = at parity with national)')\n",
"ax.set_title('How much DCs over- or under-represent each RUCA band')\n",
"for i, v in enumerate(plot_df['over_index']):\n",
" ax.text(v, i, f' {v:.2f}x', va='center')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "17",
"metadata": {},
"outputs": [],
"source": [
"# The non-metro tail: who's building in rural / small-town / micropolitan tracts?\n",
"# These are often the most interesting builds (hyperscale greenfield, low-cost power).\n",
"nonmetro = dc_ruca[dc_ruca['ruca_band'].isin(['Rural', 'Small town', 'Micropolitan'])].copy()\n",
"\n",
"print(f'Non-metro DCs: {len(nonmetro):,}\\n')\n",
"\n",
"# Top operators in non-metro tracts.\n",
"print('Top operators in non-metro tracts:')\n",
"top_ops = (\n",
" nonmetro.groupby('operator', dropna=False)\n",
" .agg(dcs=('master_id', 'count'),\n",
" total_power_mw=('power_mw', 'sum'),\n",
" median_power_mw=('power_mw', 'median'))\n",
" .sort_values('dcs', ascending=False)\n",
" .head(15)\n",
" .round(1)\n",
")\n",
"print(top_ops, '\\n')\n",
"\n",
"# Top states in non-metro tracts.\n",
"print('Top states for non-metro DCs:')\n",
"top_states = (\n",
" nonmetro.groupby('state', dropna=False)\n",
" .agg(dcs=('master_id', 'count'),\n",
" total_power_mw=('power_mw', 'sum'),\n",
" med_pop_density=('tract_pop_density', 'median'))\n",
" .sort_values('dcs', ascending=False)\n",
" .head(10)\n",
" .round(1)\n",
")\n",
"print(top_states, '\\n')\n",
"\n",
"# The biggest non-metro builds by power.\n",
"print('Largest non-metro DCs by stated power_mw:')\n",
"big_nonmetro = (\n",
" nonmetro.dropna(subset=['power_mw'])\n",
" .nlargest(15, 'power_mw')\n",
" [['name', 'operator', 'city', 'state', 'power_mw',\n",
" 'primary_ruca', 'primary_ruca_description', 'tract_pop_density']]\n",
" .reset_index(drop=True)\n",
")\n",
"big_nonmetro\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18",
"metadata": {},
"outputs": [],
"source": [
"# power_mw coverage across the DC universe — most rows are null, which is why\n",
"# \"biggest non-metro by power\" surfaced only a handful.\n",
"coverage = (\n",
" dc_ruca.assign(has_power=dc_ruca['power_mw'].notna())\n",
" .groupby('ruca_band', dropna=False)\n",
" .agg(dcs=('master_id', 'count'),\n",
" with_power_mw=('has_power', 'sum'))\n",
")\n",
"coverage['pct_with_power'] = (100 * coverage['with_power_mw'] / coverage['dcs']).round(1)\n",
"print('power_mw coverage by RUCA band:')\n",
"print(coverage)\n",
"print(f\"\\nOverall: {dc_ruca['power_mw'].notna().sum():,} / {len(dc_ruca):,} DCs have power_mw \"\n",
" f\"({100*dc_ruca['power_mw'].notna().mean():.1f}%)\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19",
"metadata": {},
"outputs": [],
"source": [
"# Now that EIA nameplate_capacity_mw is loaded, size non-metro DCs by the\n",
"# generation capacity within 50 km of each site (instead of the sparse power_mw).\n",
"# Re-derive non-metro slice from the updated joined_df.\n",
"nonmetro = joined_df[joined_df['primary_ruca'].isin([2,3,4,5,6,7,8,9,10])].copy()\n",
"nonmetro['ruca_band'] = nonmetro['primary_ruca'].apply(ruca_band)\n",
"\n",
"print(f'Non-metro DCs: {len(nonmetro):,}\\n')\n",
"\n",
"# Largest non-metro DCs ranked by nearby grid capacity.\n",
"big_by_grid = (\n",
" nonmetro.sort_values('eia_capacity_mw', ascending=False)\n",
" .head(20)\n",
" [['name', 'operator', 'city', 'state',\n",
" 'primary_ruca', 'primary_ruca_description',\n",
" 'eia_capacity_mw', 'eia_capacity_nuclear', 'eia_capacity_hydro',\n",
" 'eia_capacity_ng', 'eia_capacity_coal',\n",
" 'eia_capacity_solar', 'eia_capacity_wind']]\n",
" .round(0)\n",
" .reset_index(drop=True)\n",
")\n",
"print('Largest non-metro DCs by nearby grid capacity (50 km):')\n",
"big_by_grid\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20",
"metadata": {},
"outputs": [],
"source": [
"# Hyperscalers' non-metro footprint, sized by surrounding grid capacity + fuel mix.\n",
"hyperscaler_map = {\n",
" 'Amazon Web Services': 'AWS', 'Amazon AWS': 'AWS', 'Amazon': 'AWS',\n",
" 'Microsoft': 'Microsoft',\n",
" 'Meta': 'Meta', 'Meta, Inc.': 'Meta', 'Facebook': 'Meta',\n",
" 'Google': 'Google', 'Alphabet': 'Google',\n",
" 'Apple': 'Apple',\n",
" 'Oracle': 'Oracle',\n",
" 'Yahoo': 'Yahoo',\n",
"}\n",
"nonmetro['op_group'] = nonmetro['operator'].map(hyperscaler_map).fillna(\n",
" nonmetro['operator'].where(nonmetro['operator'].notna(), 'Unknown')\n",
")\n",
"\n",
"hyperscaler_view = (\n",
" nonmetro[nonmetro['op_group'].isin(['AWS','Microsoft','Meta','Google','Apple','Oracle','Yahoo','Unknown'])]\n",
" .groupby('op_group')\n",
" .agg(\n",
" dcs=('master_id', 'count'),\n",
" states=('state', 'nunique'),\n",
" sum_nearby_capacity_mw=('eia_capacity_mw', 'sum'),\n",
" median_nearby_capacity_mw=('eia_capacity_mw', 'median'),\n",
" sum_nearby_hydro_mw=('eia_capacity_hydro', 'sum'),\n",
" sum_nearby_nuclear_mw=('eia_capacity_nuclear', 'sum'),\n",
" sum_nearby_ng_mw=('eia_capacity_ng', 'sum'),\n",
" sum_nearby_solar_mw=('eia_capacity_solar', 'sum'),\n",
" sum_nearby_wind_mw=('eia_capacity_wind', 'sum'),\n",
" )\n",
" .sort_values('dcs', ascending=False)\n",
" .round(0)\n",
")\n",
"print(\"Non-metro DCs by operator group, sized by aggregate nearby grid capacity:\")\n",
"hyperscaler_view\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}