Adds three coordinated changes: - Request nameplate, summer, and winter capacity from the EIA operating-generator-capacity endpoint and project them as typed columns on energy_eia_operating_generator_capacity_flat. The original ingest only pulled latitude and longitude, leaving the flat table with no MW values despite its name. - New cluster_analysis.ipynb joins master_data_centers to ACS-2024 demographics, USDA RUCA-2020 codes (loaded from new/), and EIA generation capacity within 50 km of each site. - Summary doc consolidates the headline findings: DC tracts skew higher income / more educated / more racially diverse than US average, the metro over-index is only 1.11x, the non-metro tail is dominated by hyperscalers in the Columbia River corridor (OR+WA = 66% of non-metro DCs), and Microsoft co-locates with Palo Verde Nuclear in Goodyear AZ. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
842 lines
34 KiB
Plaintext
842 lines
34 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "0",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Clustering Analysis"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "1",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
import os
from contextlib import closing
from pathlib import Path

import pandas as pd
import psycopg2
|
||
"\n",
|
||
"\n",
|
def load_env_file(env_path: str = '.env') -> None:
    """Copy KEY=VALUE pairs from a dotenv-style file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Surrounding whitespace and quote characters are stripped from
    the value. Variables that already exist in the environment are never
    overwritten. A one-line status message is printed either way.

    Args:
        env_path: Path of the file to read, resolved against the current
            working directory when relative.
    """
    env_file = Path(env_path)
    if not env_file.exists():
        print(f'No {env_path} file found in {Path.cwd()}')
        return

    applied = 0
    for entry in map(str.strip, env_file.read_text(encoding='utf-8').splitlines()):
        if entry == '' or entry[0] == '#' or '=' not in entry:
            continue
        # Only the first '=' separates name from value, so values may contain '='.
        name, _, raw_value = entry.partition('=')
        name = name.strip()
        if not name or name in os.environ:
            continue
        os.environ[name] = raw_value.strip().strip('"').strip("'")
        applied += 1
    print(f'Loaded {applied} env var(s) from {env_path}')
|
||
"\n",
|
||
"\n",
|
def require_env(keys):
    """Raise if any of the named environment variables is unset or empty.

    Args:
        keys: Iterable of environment variable names that must be present.

    Raises:
        EnvironmentError: listing every missing (or empty) variable.
    """
    absent = list(filter(lambda name: not os.getenv(name), keys))
    if not absent:
        return
    message = (
        'Missing required env vars: ' + ', '.join(absent) +
        '.\nSet them in this notebook, or add them to a .env file.'
    )
    raise EnvironmentError(message)
|
||
"\n",
|
||
"\n",
|
# Pull connection settings from a local .env file. Variables already present
# in the process environment win over the file (load_env_file never overwrites).
load_env_file('.env')

# Fail fast, before any connection attempt, if a credential is missing or empty.
required_keys = ['PGWEB_HOST', 'PGWEB_PORT', 'PGWEB_USER', 'PGWEB_PASSWORD']
require_env(required_keys)

# Target database name: PGDATABASE if set, otherwise 'data_centers'.
DB_NAME = os.getenv('PGDATABASE', 'data_centers')
|
||
"\n",
|
||
"\n",
|
def get_conn():
    """Open a new psycopg2 connection to the analysis database.

    Host, port, user and password come from the PGWEB_* environment
    variables checked by require_env. The database name comes from DB_NAME,
    so setting PGDATABASE actually redirects the notebook; it used to be
    ignored because the name was hard-coded here.

    Returns:
        A new psycopg2 connection. Note that ``with conn:`` only scopes a
        transaction (commit on success, rollback on error) and does not
        close the connection, so callers should close it themselves, e.g.
        with ``contextlib.closing``.

    Raises:
        KeyError: if one of the PGWEB_* variables is not set.
        psycopg2.OperationalError: if the server cannot be reached.
    """
    return psycopg2.connect(
        host=os.environ['PGWEB_HOST'],
        port=os.environ['PGWEB_PORT'],
        user=os.environ['PGWEB_USER'],
        password=os.environ['PGWEB_PASSWORD'],
        dbname=DB_NAME,
    )
|
||
"\n",
|
||
"\n",
|
# Smoke test: confirm the credentials work and show which database/server we hit.
# closing() releases the connection when the block ends; psycopg2's own
# `with conn:` only ends the transaction and would leave the session open.
with closing(get_conn()) as conn:
    with conn.cursor() as cur:
        cur.execute('select current_database(), current_user, version()')
        db, usr, ver = cur.fetchone()
        print('Connected to DB:', db)
        print('As user:', usr)
        # version() returns a long build string; keep only the leading part.
        print('Postgres:', ver.split(',')[0])
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "2",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# List tables in the database (user schemas only, excluding system + PostGIS internals).
TABLES_SQL = """
select
    table_schema,
    table_name,
    table_type
from information_schema.tables
where table_schema not in ('pg_catalog', 'information_schema', 'tiger', 'tiger_data', 'topology')
order by table_schema, table_name
"""

# closing() makes sure the session is released after the read; a bare
# `with get_conn() as conn:` would commit but keep the connection open.
with closing(get_conn()) as conn:
    tables_df = pd.read_sql(TABLES_SQL, conn)

print(f'{len(tables_df):,} tables/views found')
tables_df
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Inspect columns for the tables we want to join.
INSPECT_TABLES = [
    'master_data_centers',
    'master_data_center_spatial_clusters',
    'data_center_census_tracts_2024',
    '_dc_census_tract_acs_2024',
    'energy_eia_operating_generator_capacity_flat',
]

# The table list is bound as a single array parameter (`= any(%s)`),
# so no table name is ever spliced into the SQL text.
COLS_SQL = """
select table_name, column_name, data_type
from information_schema.columns
where table_schema = 'public' and table_name = any(%s)
order by table_name, ordinal_position
"""

# closing() releases the connection once the metadata has been read.
with closing(get_conn()) as conn:
    cols_df = pd.read_sql(COLS_SQL, conn, params=(INSPECT_TABLES,))

for t in INSPECT_TABLES:
    sub = cols_df[cols_df['table_name'] == t]
    print(f'\n=== {t} ({len(sub)} cols) ===')
    if sub.empty:
        # Say so explicitly instead of printing an "Empty DataFrame" repr:
        # a missing table here means a later join will fail.
        print('(no columns found - table missing from schema public?)')
        continue
    print(sub[['column_name', 'data_type']].to_string(index=False))
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "4",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Ingest RUCA codes\n",
|
||
"\n",
|
||
"USDA Rural-Urban Commuting Area (RUCA) codes classify each census tract on a 1–10 scale from \"Metropolitan area core\" (1) to \"Rural area\" (10), based on population density and commuting flows. Source file: `new/RUCA-codes-2020-tract.csv` (~85K tracts).\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Push RUCA codes CSV -> public.ruca_codes_2020_tract (idempotent: drops + recreates).
from psycopg2.extras import execute_values

RUCA_CSV = Path('new/RUCA-codes-2020-tract.csv')
RUCA_TABLE = 'public.ruca_codes_2020_tract'

# Map source CSV columns -> snake_case DB columns.
# Insertion order here is also the column order used for the INSERT below.
COL_MAP = {
    'TractFIPS23': 'tract_fips_23',
    'CountyFIPS23': 'county_fips_23',
    'CountyCode23': 'county_code_23',
    'CountyName23': 'county_name_23',
    'TractFIPS20': 'tract_fips_20',
    'TractCode20': 'tract_code_20',
    'TractName20': 'tract_name_20',
    'CountyFIPS20': 'county_fips_20',
    'CountyCode20': 'county_code_20',
    'CountyName20': 'county_name_20',
    'StateFIPS20': 'state_fips_20',
    'StateName20': 'state_name_20',
    'UrbanAreaCode20': 'urban_area_code_20',
    'UrbanAreaName20': 'urban_area_name_20',
    'UrbanCore': 'urban_core',
    'UrbanCoreType': 'urban_core_type',
    'PrimaryRUCA': 'primary_ruca',
    'PrimaryRUCADescription': 'primary_ruca_description',
    'PrimaryDestinationCode': 'primary_destination_code',
    'PrimaryDestinationName': 'primary_destination_name',
    'SecondaryRUCA': 'secondary_ruca',
    'SecondaryRUCADescription': 'secondary_ruca_description',
    'SecondaryDestinationCode': 'secondary_destination_code',
    'SecondaryDestinationName': 'secondary_destination_name',
    'Population': 'population',
    'LandArea': 'land_area',
    'PopDensity': 'pop_density',
}

# File is Latin-1 (has bytes like 0xf1 = ñ from Spanish place names).
# FIPS/code columns are forced to str so leading zeros survive the parse.
fips_str_cols = [c for c in COL_MAP if 'FIPS' in c or 'Code' in c]
ruca_df = pd.read_csv(
    RUCA_CSV,
    dtype={c: str for c in fips_str_cols},
    encoding='latin-1',
)

# Validate the file before touching the database, so a wrong or truncated CSV
# fails with a readable message rather than a KeyError or a mid-insert PK error.
missing_cols = [c for c in COL_MAP if c not in ruca_df.columns]
if missing_cols:
    raise ValueError(f'{RUCA_CSV} is missing expected columns: {missing_cols}')

ruca_df = ruca_df.rename(columns=COL_MAP)
print(f'CSV rows: {len(ruca_df):,} cols: {ruca_df.shape[1]}')

# tract_fips_20 becomes the primary key below; it must be populated and unique.
assert ruca_df['tract_fips_20'].notna().all(), 'tract_fips_20 has null values'
assert ruca_df['tract_fips_20'].is_unique, 'tract_fips_20 has duplicate values'

# PK is tract_fips_20 (always populated). Some tracts that existed in 2020 are gone
# in 2023 (water-only tracts, dissolves), so tract_fips_23 can be null.
# NOTE(review): secondary_ruca is stored as text but is not in fips_str_cols,
# so pandas may parse it as a number first - confirm the stored formatting is
# what downstream consumers expect.
DDL = f"""
drop table if exists {RUCA_TABLE};
create table {RUCA_TABLE} (
    tract_fips_23 text,
    county_fips_23 text,
    county_code_23 text,
    county_name_23 text,
    tract_fips_20 text primary key,
    tract_code_20 text,
    tract_name_20 text,
    county_fips_20 text,
    county_code_20 text,
    county_name_20 text,
    state_fips_20 text,
    state_name_20 text,
    urban_area_code_20 text,
    urban_area_name_20 text,
    urban_core smallint,
    urban_core_type text,
    primary_ruca smallint,
    primary_ruca_description text,
    primary_destination_code text,
    primary_destination_name text,
    secondary_ruca text,
    secondary_ruca_description text,
    secondary_destination_code text,
    secondary_destination_name text,
    population integer,
    land_area double precision,
    pop_density double precision
);
create index ruca_codes_2020_tract_state_idx on {RUCA_TABLE} (state_fips_20);
create index ruca_codes_2020_tract_primary_ruca_idx on {RUCA_TABLE} (primary_ruca);
create index ruca_codes_2020_tract_fips_23_idx on {RUCA_TABLE} (tract_fips_23);
"""

cols = list(COL_MAP.values())
# NaN/NaT -> None so psycopg2 sends SQL NULL instead of a float NaN.
records = [tuple(None if pd.isna(v) else v for v in row) for row in ruca_df[cols].itertuples(index=False, name=None)]

# closing() releases the connection; the inner `with conn` is the transaction,
# so drop + create + insert commit together or roll back together.
with closing(get_conn()) as conn:
    with conn, conn.cursor() as cur:
        cur.execute(DDL)
        execute_values(
            cur,
            f"insert into {RUCA_TABLE} ({', '.join(cols)}) values %s",
            records,
            page_size=2000,
        )
        cur.execute(f'select count(*) from {RUCA_TABLE}')
        print(f'{RUCA_TABLE}: {cur.fetchone()[0]:,} rows')
        cur.execute(f"""
            select primary_ruca, count(*) as n
            from {RUCA_TABLE}
            group by 1 order by 1
        """)
        print('\nRUCA distribution (all US tracts):')
        for ruca, n in cur.fetchall():
            print(f' {ruca}: {n:>6,}')
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "6",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Sanity-check EIA generator coordinate ranges.
# Prior note: EIA source data had a longitude sign error. Verify before spatial joining.
EIA_COORD_SQL = """
select
    min(longitude) as lon_min, max(longitude) as lon_max,
    min(latitude) as lat_min, max(latitude) as lat_max,
    count(*) filter (where longitude > 0) as pos_lon_rows,
    count(*) filter (where longitude < 0) as neg_lon_rows,
    count(*) as total_rows
from public.energy_eia_operating_generator_capacity_flat
where longitude is not null and latitude is not null
"""

# closing() releases the connection after the read; `with get_conn() as conn:`
# alone would leave the session open.
with closing(get_conn()) as conn:
    eia_coord_df = pd.read_sql(EIA_COORD_SQL, conn)

# Single-row result, transposed so each metric prints on its own line.
print(eia_coord_df.T)
# For US plants we expect longitude in roughly [-180, -65]. If pos_lon_rows is large,
# the sign-flip correction is still needed when spatial-joining.
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "7",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Build the joined analysis dataset.
#
# Joins:
#   master_data_centers (m)
#   LEFT JOIN master_data_center_spatial_clusters (c) ON master_id
#   LEFT JOIN _dc_census_tract_acs_2024 (acs) ON m.geoid = acs.geoid
#   LEFT JOIN ruca_codes_2020_tract (ruca) ON m.geoid = ruca.tract_fips_20
#   LEFT JOIN (EIA operating generators within RADIUS_KM, latest period) aggregated per DC
#
# Energy aggregation: latest period, status='OP', sum of nameplate_capacity_mw
# (and counts) within RADIUS_KM, broken out by fuel.
#
# NOTE(review): the RUCA join keys on tract_fips_20. If m.geoid uses a later
# tract vintage (the ACS table name says 2024), tracts renumbered since 2020
# would silently miss; tract_fips_23 is loaded and indexed but unused here.
# NOTE(review): e.geom is used as stored. The coordinate sanity check mentions
# a longitude sign error in the EIA source - confirm geom was built from
# corrected coordinates before trusting the distance join.

RADIUS_KM = 50

# RADIUS_KM is a notebook constant (not user input), so interpolating it into
# the SQL text is safe; st_dwithin on geography takes metres, hence * 1000.
JOIN_SQL = f"""
with latest_period as (
    select max(period) as period
    from public.energy_eia_operating_generator_capacity_flat
),
eia_latest as (
    select e.plant_id, e.generator_id, e.energy_source_code,
           e.nameplate_capacity_mw, e.geom
    from public.energy_eia_operating_generator_capacity_flat e
    join latest_period lp on e.period = lp.period
    where e.status = 'OP' and e.geom is not null
),
energy_nearby as (
    select
        m.master_id,
        count(*) as eia_gen_count,
        count(distinct plant_id) as eia_plant_count,
        sum(nameplate_capacity_mw) as eia_capacity_mw,
        sum(nameplate_capacity_mw) filter (where energy_source_code = 'NG') as eia_capacity_ng,
        sum(nameplate_capacity_mw) filter (where energy_source_code in ('BIT','SUB','LIG','RC','ANT')) as eia_capacity_coal,
        sum(nameplate_capacity_mw) filter (where energy_source_code = 'NUC') as eia_capacity_nuclear,
        sum(nameplate_capacity_mw) filter (where energy_source_code = 'SUN') as eia_capacity_solar,
        sum(nameplate_capacity_mw) filter (where energy_source_code = 'WND') as eia_capacity_wind,
        sum(nameplate_capacity_mw) filter (where energy_source_code = 'WAT') as eia_capacity_hydro,
        sum(nameplate_capacity_mw) filter (where energy_source_code = 'GEO') as eia_capacity_geothermal
    from public.master_data_centers m
    join eia_latest e
      on st_dwithin(m.geom::geography, e.geom::geography, {RADIUS_KM} * 1000)
    where m.geom is not null
    group by m.master_id
)
select
    m.master_id, m.name, m.operator, m.city, m.state, m.country,
    m.power_mw, m.area_sqft, m.longitude, m.latitude, m.geoid,
    c.cluster_id, c.is_noise, c.nearest_neighbor_km,
    acs.population, acs.median_age, acs.households, acs.avg_household_size,
    acs.median_household_income, acs.per_capita_income,
    acs.poverty_rate, acs.unemployment_rate,
    acs.bachelor_or_higher_pct, acs.broadband_subscription_pct,
    acs.hispanic_latino_pct, acs.hispanic_latino_population,
    acs.non_hispanic_white_pct, acs.non_hispanic_white_population,
    acs.non_hispanic_black_pct, acs.non_hispanic_black_population,
    acs.non_hispanic_asian_pct, acs.non_hispanic_asian_population,
    acs.primary_industry, acs.primary_industry_pct,
    ruca.primary_ruca, ruca.primary_ruca_description,
    ruca.urban_core, ruca.urban_core_type,
    ruca.pop_density as tract_pop_density,
    ruca.land_area as tract_land_area_sqmi,
    coalesce(en.eia_gen_count, 0) as eia_gen_count,
    coalesce(en.eia_plant_count, 0) as eia_plant_count,
    coalesce(en.eia_capacity_mw, 0) as eia_capacity_mw,
    coalesce(en.eia_capacity_ng, 0) as eia_capacity_ng,
    coalesce(en.eia_capacity_coal, 0) as eia_capacity_coal,
    coalesce(en.eia_capacity_nuclear, 0) as eia_capacity_nuclear,
    coalesce(en.eia_capacity_solar, 0) as eia_capacity_solar,
    coalesce(en.eia_capacity_wind, 0) as eia_capacity_wind,
    coalesce(en.eia_capacity_hydro, 0) as eia_capacity_hydro,
    coalesce(en.eia_capacity_geothermal, 0) as eia_capacity_geothermal
from public.master_data_centers m
left join public.master_data_center_spatial_clusters c on c.master_id = m.master_id
left join public._dc_census_tract_acs_2024 acs on acs.geoid = m.geoid
left join public.ruca_codes_2020_tract ruca on ruca.tract_fips_20 = m.geoid
left join energy_nearby en on en.master_id = m.master_id
"""

# closing() releases the connection after the (potentially slow) spatial query;
# `with get_conn() as conn:` alone would leave the session open.
with closing(get_conn()) as conn:
    joined_df = pd.read_sql(JOIN_SQL, conn)

# Join-coverage diagnostics: how many DCs picked up each enrichment.
print(f'rows: {len(joined_df):,} cols: {joined_df.shape[1]}')
print('non-null geoid: ', joined_df['geoid'].notna().sum())
print('non-null cluster_id: ', joined_df['cluster_id'].notna().sum())
print('non-null primary_ruca: ', joined_df['primary_ruca'].notna().sum())
print('DCs with >=1 nearby gen: ', (joined_df['eia_gen_count'] > 0).sum())
print(f"median nearby capacity: {joined_df['eia_capacity_mw'].median():,.0f} MW")
print(f" 90th percentile: {joined_df['eia_capacity_mw'].quantile(0.9):,.0f} MW")
print(f" max: {joined_df['eia_capacity_mw'].max():,.0f} MW")
joined_df.head()
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "8",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Quick demographic analysis\n",
|
||
"\n",
|
||
"The joined dataset has one row per data center, enriched with the demographics of its containing census tract. Note that multiple DCs can share a tract, so tract-level stats are weighted by DC count in these summaries (i.e. \"the average DC sits in a tract with...\" rather than \"the average DC-hosting tract has...\").\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "9",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Top-line demographic profile of the average DC's containing tract.
candidate_demo_cols = (
    'population', 'median_age', 'avg_household_size',
    'median_household_income', 'per_capita_income',
    'poverty_rate', 'unemployment_rate',
    'bachelor_or_higher_pct', 'broadband_subscription_pct',
    'hispanic_latino_pct', 'non_hispanic_white_pct',
    'non_hispanic_black_pct', 'non_hispanic_asian_pct',
)
# Keep only the columns the join actually returned.
demo_cols = [name for name in candidate_demo_cols if name in joined_df.columns]

# One row per demographic column, one column per statistic.
demo_stats = joined_df[demo_cols].agg(['count', 'mean', 'median', 'std', 'min', 'max'])
summary = demo_stats.round(2).T.rename(columns={'count': 'n'})

# US national benchmarks (ACS 5-yr ~2024) for context
benchmarks = {
    'median_household_income': 78_538,
    'per_capita_income': 43_313,
    'poverty_rate': 12.4,
    'unemployment_rate': 5.4,
    'bachelor_or_higher_pct': 35.0,
    'broadband_subscription_pct': 89.0,
    'hispanic_latino_pct': 19.5,
    'non_hispanic_white_pct': 58.4,
    'non_hispanic_black_pct': 12.1,
    'non_hispanic_asian_pct': 6.4,
}
# Rows without a benchmark (e.g. population) get NaN in both derived columns.
summary = summary.assign(
    us_avg=pd.Series(benchmarks).reindex(summary.index),
    vs_us=lambda frame: (frame['mean'] - frame['us_avg']).round(2),
)
summary
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "10",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Geographic concentration: where are the data centers, and what do those places look like?
# Output column -> (source column, aggregation); dict order is the column order.
state_agg_spec = {
    'dc_count': ('master_id', 'count'),
    'avg_power_mw': ('power_mw', 'mean'),
    'total_power_mw': ('power_mw', 'sum'),
    'median_hh_income': ('median_household_income', 'median'),
    'median_poverty': ('poverty_rate', 'median'),
    'median_bachelor_pct': ('bachelor_or_higher_pct', 'median'),
    'median_broadband_pct': ('broadband_subscription_pct', 'median'),
    'median_pct_white': ('non_hispanic_white_pct', 'median'),
    'median_pct_hispanic': ('hispanic_latino_pct', 'median'),
    'median_pct_black': ('non_hispanic_black_pct', 'median'),
}
# dropna=False keeps DCs with a missing state as their own group.
per_state = joined_df.groupby('state', dropna=False).agg(**state_agg_spec)
state_summary = per_state.sort_values('dc_count', ascending=False).round(1)

print(f'{joined_df["state"].nunique()} states/territories represented')
state_summary.head(15)
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "11",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Cluster vs. non-cluster: do DCs in spatial clusters sit in different demographic settings
# than isolated ones? (cluster_id is null/is_noise=True for unclustered DCs.)
# Adds a boolean column to joined_df in place; re-running the cell recomputes the same values.
# A row counts as clustered only if it has a cluster_id and is not flagged as noise;
# a missing is_noise compares unequal to True and therefore does not block membership.
joined_df['in_cluster'] = joined_df['cluster_id'].notna() & (joined_df['is_noise'] != True)

# Metrics to contrast between the two groups (medians, so outliers do not dominate).
compare_cols = [
    'median_household_income', 'per_capita_income',
    'poverty_rate', 'bachelor_or_higher_pct', 'broadband_subscription_pct',
    'non_hispanic_white_pct', 'hispanic_latino_pct', 'non_hispanic_black_pct',
    'population', 'eia_gen_count',
]
# Keep only the columns the join actually returned.
compare_cols = [c for c in compare_cols if c in joined_df.columns]

# One row per metric, one column per group, after the transpose and rename.
cluster_compare = (
    joined_df.groupby('in_cluster')[compare_cols]
    .median()
    .round(1)
    .T
    .rename(columns={False: 'isolated', True: 'in_cluster'})
)
# NOTE(review): the delta line needs both groups to exist; if every DC falls in
# one group, the missing column raises a KeyError here.
cluster_compare['delta'] = (cluster_compare['in_cluster'] - cluster_compare['isolated']).round(1)
print(f"DCs in a cluster: {joined_df['in_cluster'].sum():,} isolated: {(~joined_df['in_cluster']).sum():,}")
cluster_compare
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "12",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Quick visual sweep: distribution of key demographic features for DC tracts,
# with US-average reference lines for context.
import matplotlib.pyplot as plt

# (column, US benchmark, panel title)
panels = [
    ('median_household_income', 78_538, 'Median household income (USD)'),
    ('poverty_rate', 12.4, 'Poverty rate (%)'),
    ('bachelor_or_higher_pct', 35.0, "Bachelor's degree or higher (%)"),
    ('broadband_subscription_pct', 89.0, 'Broadband subscription (%)'),
    ('non_hispanic_white_pct', 58.4, 'Non-Hispanic white (%)'),
    ('hispanic_latino_pct', 19.5, 'Hispanic/Latino (%)'),
]
# Keep only the columns the join actually returned.
panels = [(c, b, lab) for c, b, lab in panels if c in joined_df.columns]

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
grid_axes = axes.ravel()
for ax, (col, bench, label) in zip(grid_axes, panels):
    s = joined_df[col].dropna()
    ax.hist(s, bins=40, color='steelblue', edgecolor='white', alpha=0.85)
    ax.axvline(s.median(), color='darkorange', linestyle='-', lw=2, label=f'DC median = {s.median():.1f}')
    ax.axvline(bench, color='firebrick', linestyle='--', lw=2, label=f'US avg = {bench}')
    ax.set_title(label)
    ax.set_ylabel('Data centers')
    ax.legend(fontsize=8)

# If a column was filtered out above, the grid has more slots than panels;
# hide the leftovers instead of drawing empty frames.
for spare_ax in grid_axes[len(panels):]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "13",
|
||
"metadata": {},
|
||
"source": [
|
||
"## RUCA (urban / rural) analysis\n",
|
||
"\n",
|
||
"RUCA primary code key:\n",
|
||
"- **1**: Metropolitan area core\n",
|
||
"- **2**: Metropolitan area high commuting\n",
|
||
"- **3**: Metropolitan area low commuting\n",
|
||
"- **4–6**: Micropolitan area (small city + commuting tracts)\n",
|
||
"- **7–9**: Small town (core + commuting tracts)\n",
|
||
"- **10**: Rural area\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "14",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# DC distribution across RUCA codes vs. the national baseline of all US tracts.
# Short display label for each primary RUCA code 1-10; codes outside this
# range have no entry, so a .map() over them yields NaN.
ruca_buckets = {
    1: 'Metro core', 2: 'Metro high-commute', 3: 'Metro low-commute',
    4: 'Micro core', 5: 'Micro high-commute', 6: 'Micro low-commute',
    7: 'Small town core', 8: 'Small town high-commute', 9: 'Small town low-commute',
    10: 'Rural',
}
|
||
"\n",
|
def ruca_band(r):
    """Collapse a primary RUCA code into a coarse urban/rural band.

    Uses the same boundaries as the CASE expression in NATIONAL_SQL, so data
    centers and the national tract baseline are bucketed identically:
    1-3 Metropolitan, 4-6 Micropolitan, 7-9 Small town, 10 Rural, anything
    else 'Unknown'. Previously any code above 9 (for example the USDA
    "not coded" value 99) was reported as 'Rural' and any code of 3 or
    below (including 0) as 'Metropolitan', while the SQL side called both
    'Unknown'.

    Args:
        r: Primary RUCA code as int/float, or a missing value (None/NaN).

    Returns:
        One of 'Metropolitan', 'Micropolitan', 'Small town', 'Rural',
        'Unknown'.
    """
    if pd.isna(r):
        return 'Unknown'
    code = int(r)  # codes read from a nullable SQL column arrive as floats
    if 1 <= code <= 3:
        return 'Metropolitan'
    if 4 <= code <= 6:
        return 'Micropolitan'
    if 7 <= code <= 9:
        return 'Small town'
    if code == 10:
        return 'Rural'
    return 'Unknown'
|
||
"\n",
|
# Work on a copy so the band/label columns do not leak into joined_df.
dc_ruca = joined_df.copy()
dc_ruca['ruca_label'] = dc_ruca['primary_ruca'].map(ruca_buckets)
dc_ruca['ruca_band'] = dc_ruca['primary_ruca'].apply(ruca_band)

# National baseline (share of US tracts in each band).
NATIONAL_SQL = """
select
    case
        when primary_ruca between 1 and 3 then 'Metropolitan'
        when primary_ruca between 4 and 6 then 'Micropolitan'
        when primary_ruca between 7 and 9 then 'Small town'
        when primary_ruca = 10 then 'Rural'
        else 'Unknown'
    end as ruca_band,
    count(*) as tracts
from public.ruca_codes_2020_tract
group by 1
"""
# closing() releases the connection after the read; `with get_conn() as conn:`
# alone would leave the session open.
with closing(get_conn()) as conn:
    national_df = pd.read_sql(NATIONAL_SQL, conn)
national_df['tracts_pct'] = (100 * national_df['tracts'] / national_df['tracts'].sum()).round(1)

dc_by_band = (
    dc_ruca.groupby('ruca_band').size().rename('dcs').to_frame()
    .assign(dcs_pct=lambda d: (100 * d['dcs'] / d['dcs'].sum()).round(1))
)
# Outer join: a band that hosts no data centers must still keep its national
# tract share. With a left join it vanished here and the later fillna(0)
# misreported the national share as 0%.
band_compare = dc_by_band.join(
    national_df.set_index('ruca_band')[['tracts', 'tracts_pct']],
    how='outer',
)
# Bands present only on the national side genuinely have zero data centers.
band_compare['dcs'] = band_compare['dcs'].fillna(0).astype(int)
band_compare['dcs_pct'] = band_compare['dcs_pct'].fillna(0.0)
# over_index > 1 means DCs are over-represented relative to the tract share.
band_compare['over_index'] = (band_compare['dcs_pct'] / band_compare['tracts_pct']).round(2)
print('Data centers vs. all US tracts, by RUCA band:')
print(band_compare.reindex(['Metropolitan', 'Micropolitan', 'Small town', 'Rural', 'Unknown']).fillna(0))
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "15",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Fine-grained RUCA breakdown: DC count, median power, demographics, energy infra
# at each of the 10 RUCA codes.
# Output column -> (source column, aggregation); dict order is the column order.
ruca_profile_spec = {
    'dcs': ('master_id', 'count'),
    'median_power_mw': ('power_mw', 'median'),
    'total_power_mw': ('power_mw', 'sum'),
    'med_hh_income': ('median_household_income', 'median'),
    'med_poverty': ('poverty_rate', 'median'),
    'med_bachelor_pct': ('bachelor_or_higher_pct', 'median'),
    'med_pct_white': ('non_hispanic_white_pct', 'median'),
    'med_pct_black': ('non_hispanic_black_pct', 'median'),
    'med_pct_hispanic': ('hispanic_latino_pct', 'median'),
    'med_pop_density': ('tract_pop_density', 'median'),
    'med_eia_gens_50km': ('eia_gen_count', 'median'),
}
# dropna=False keeps DCs without a RUCA code as their own (NaN) row.
per_code = dc_ruca.groupby('primary_ruca', dropna=False).agg(**ruca_profile_spec)
ruca_profile = per_code.round(1)
# Human-readable label goes first; codes without a label show NaN.
code_labels = ruca_profile.index.map(ruca_buckets)
ruca_profile.insert(0, 'description', code_labels)
print('Per-RUCA-code profile of data centers:')
ruca_profile
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "16",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Plot: DC count by RUCA band vs. national tract share.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
share_ax, index_ax = axes

order = ['Metropolitan', 'Micropolitan', 'Small town', 'Rural']
plot_df = band_compare.reindex(order).fillna(0)

# Left panel: paired bars, DC share next to national tract share.
width = 0.38
positions = list(range(len(plot_df)))
dc_positions = [pos - width/2 for pos in positions]
tract_positions = [pos + width/2 for pos in positions]
share_ax.bar(dc_positions, plot_df['dcs_pct'], width, label='Data centers', color='steelblue')
share_ax.bar(tract_positions, plot_df['tracts_pct'], width, label='All US tracts', color='lightgray', edgecolor='gray')
share_ax.set_xticks(positions)
share_ax.set_xticklabels(plot_df.index, rotation=15)
share_ax.set_ylabel('% of total')
share_ax.set_title('DC share vs. national tract share, by RUCA band')
share_ax.legend()

# Right panel: over-index ratio, red where DCs are over-represented.
over_index = plot_df['over_index']
bar_colors = []
for ratio in over_index:
    bar_colors.append('firebrick' if ratio > 1 else 'steelblue')
index_ax.barh(plot_df.index, over_index, color=bar_colors)
index_ax.axvline(1.0, color='black', linestyle='--', lw=1)
index_ax.set_xlabel('Over-index (1.0 = at parity with national)')
index_ax.set_title('How much DCs over- or under-represent each RUCA band')
for row, ratio in enumerate(over_index):
    index_ax.text(ratio, row, f' {ratio:.2f}x', va='center')

plt.tight_layout()
plt.show()
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "17",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# The non-metro tail: who's building in rural / small-town / micropolitan tracts?
# These are often the most interesting builds (hyperscale greenfield, low-cost power).
# "Non-metro" here means every band except Metropolitan and Unknown.
nonmetro = dc_ruca[dc_ruca['ruca_band'].isin(['Rural', 'Small town', 'Micropolitan'])].copy()

print(f'Non-metro DCs: {len(nonmetro):,}\n')

# Top operators in non-metro tracts.
# dropna=False keeps DCs with no recorded operator as their own row.
print('Top operators in non-metro tracts:')
top_ops = (
    nonmetro.groupby('operator', dropna=False)
    .agg(dcs=('master_id', 'count'),
         total_power_mw=('power_mw', 'sum'),
         median_power_mw=('power_mw', 'median'))
    .sort_values('dcs', ascending=False)
    .head(15)
    .round(1)
)
print(top_ops, '\n')

# Top states in non-metro tracts.
print('Top states for non-metro DCs:')
top_states = (
    nonmetro.groupby('state', dropna=False)
    .agg(dcs=('master_id', 'count'),
         total_power_mw=('power_mw', 'sum'),
         med_pop_density=('tract_pop_density', 'median'))
    .sort_values('dcs', ascending=False)
    .head(10)
    .round(1)
)
print(top_states, '\n')

# The biggest non-metro builds by power.
# Rows without power_mw are dropped first, so this ranks only DCs with a stated value.
print('Largest non-metro DCs by stated power_mw:')
big_nonmetro = (
    nonmetro.dropna(subset=['power_mw'])
    .nlargest(15, 'power_mw')
    [['name', 'operator', 'city', 'state', 'power_mw',
      'primary_ruca', 'primary_ruca_description', 'tract_pop_density']]
    .reset_index(drop=True)
)
big_nonmetro
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "18",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# power_mw coverage across the DC universe — most rows are null, which is why
# "biggest non-metro by power" surfaced only a handful.
has_power = dc_ruca['power_mw'].notna()
coverage = (
    dc_ruca.assign(has_power=has_power)
    .groupby('ruca_band', dropna=False)
    .agg(dcs=('master_id', 'count'),
         with_power_mw=('has_power', 'sum'))
    # Share of DCs in each band that report a power figure.
    .assign(pct_with_power=lambda d: (100 * d['with_power_mw'] / d['dcs']).round(1))
)
print('power_mw coverage by RUCA band:')
print(coverage)
n_with_power, n_total = has_power.sum(), len(dc_ruca)
print(f"\nOverall: {n_with_power:,} / {n_total:,} DCs have power_mw "
      f"({100*has_power.mean():.1f}%)")
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "19",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Now that EIA nameplate_capacity_mw is loaded, size non-metro DCs by the
# generation capacity within RADIUS_KM of each site (instead of the sparse power_mw).
# Re-derive non-metro slice from the updated joined_df.
#
# Non-metro = primary RUCA 4-10 (micropolitan, small town, rural). Codes 2 and 3
# are metropolitan commuting tracts, so they are excluded here; the earlier
# filter started at 2 and pulled metro-area sites into the "non-metro" tables.
nonmetro = joined_df[joined_df['primary_ruca'].isin([4, 5, 6, 7, 8, 9, 10])].copy()
nonmetro['ruca_band'] = nonmetro['primary_ruca'].apply(ruca_band)

print(f'Non-metro DCs: {len(nonmetro):,}\n')

# Largest non-metro DCs ranked by nearby grid capacity.
big_by_grid = (
    nonmetro.sort_values('eia_capacity_mw', ascending=False)
    .head(20)
    [['name', 'operator', 'city', 'state',
      'primary_ruca', 'primary_ruca_description',
      'eia_capacity_mw', 'eia_capacity_nuclear', 'eia_capacity_hydro',
      'eia_capacity_ng', 'eia_capacity_coal',
      'eia_capacity_solar', 'eia_capacity_wind']]
    .round(0)
    .reset_index(drop=True)
)
# Label follows RADIUS_KM so the heading cannot drift from the radius the join used.
print(f'Largest non-metro DCs by nearby grid capacity ({RADIUS_KM} km):')
big_by_grid
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "20",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Hyperscalers' non-metro footprint, sized by surrounding grid capacity + fuel mix.
# Exact operator spellings -> canonical group; unmatched names fall through unchanged.
hyperscaler_map = {
    'Amazon Web Services': 'AWS', 'Amazon AWS': 'AWS', 'Amazon': 'AWS',
    'Microsoft': 'Microsoft',
    'Meta': 'Meta', 'Meta, Inc.': 'Meta', 'Facebook': 'Meta',
    'Google': 'Google', 'Alphabet': 'Google',
    'Apple': 'Apple',
    'Oracle': 'Oracle',
    'Yahoo': 'Yahoo',
}
# Mapped group if known, else the raw operator name, else 'Unknown' when missing.
raw_operator = nonmetro['operator']
nonmetro['op_group'] = (
    raw_operator.map(hyperscaler_map)
    .fillna(raw_operator)
    .fillna('Unknown')
)

# Groups shown in the table: the hyperscalers plus the no-operator bucket.
shown_groups = ['AWS', 'Microsoft', 'Meta', 'Google', 'Apple', 'Oracle', 'Yahoo', 'Unknown']
# Output column -> (source column, aggregation); dict order is the column order.
hyperscaler_agg_spec = {
    'dcs': ('master_id', 'count'),
    'states': ('state', 'nunique'),
    'sum_nearby_capacity_mw': ('eia_capacity_mw', 'sum'),
    'median_nearby_capacity_mw': ('eia_capacity_mw', 'median'),
    'sum_nearby_hydro_mw': ('eia_capacity_hydro', 'sum'),
    'sum_nearby_nuclear_mw': ('eia_capacity_nuclear', 'sum'),
    'sum_nearby_ng_mw': ('eia_capacity_ng', 'sum'),
    'sum_nearby_solar_mw': ('eia_capacity_solar', 'sum'),
    'sum_nearby_wind_mw': ('eia_capacity_wind', 'sum'),
}
in_shown_groups = nonmetro['op_group'].isin(shown_groups)
per_group = nonmetro[in_shown_groups].groupby('op_group').agg(**hyperscaler_agg_spec)
hyperscaler_view = per_group.sort_values('dcs', ascending=False).round(0)

print("Non-metro DCs by operator group, sized by aggregate nearby grid capacity:")
hyperscaler_view
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.14.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|