# %% [markdown]
# # Clustering Analysis

# %%
import os
from contextlib import contextmanager
from pathlib import Path

import pandas as pd
import psycopg2


def load_env_file(env_path: str = '.env') -> None:
    """Load KEY=VALUE pairs from a dotenv-style file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are skipped.
    Surrounding single/double quotes are stripped from values. Variables that
    are already set in the process environment are never overwritten.

    Args:
        env_path: Path of the env file, relative to the current working directory.
    """
    p = Path(env_path)
    if not p.exists():
        print(f'No {env_path} file found in {Path.cwd()}')
        return
    loaded = 0
    for raw_line in p.read_text(encoding='utf-8').splitlines():
        line = raw_line.strip()
        if not line or line.startswith('#') or '=' not in line:
            continue
        key, value = line.split('=', 1)
        key = key.strip()
        value = value.strip().strip('"').strip("'")
        if key and key not in os.environ:
            os.environ[key] = value
            loaded += 1
    print(f'Loaded {loaded} env var(s) from {env_path}')


def require_env(keys):
    """Raise ``EnvironmentError`` naming every key in ``keys`` that is unset or empty."""
    missing = [k for k in keys if not os.getenv(k)]
    if missing:
        raise EnvironmentError(
            'Missing required env vars: ' + ', '.join(missing) +
            '.\nSet them in this notebook, or add them to a .env file.'
        )


load_env_file('.env')

required_keys = ['PGWEB_HOST', 'PGWEB_PORT', 'PGWEB_USER', 'PGWEB_PASSWORD']
require_env(required_keys)

# Target database; override with PGDATABASE, defaults to the project database.
DB_NAME = os.getenv('PGDATABASE', 'data_centers')


@contextmanager
def get_conn():
    """Yield a psycopg2 connection to ``DB_NAME`` and always close it afterwards.

    Use as ``with get_conn() as conn:``. The transaction is committed when the
    block exits normally and rolled back when it raises (psycopg2's own
    ``with connection`` semantics). Unlike a bare psycopg2 connection used as
    a context manager, the connection itself is closed on exit, so re-running
    cells does not accumulate idle server connections.
    """
    conn = psycopg2.connect(
        host=os.environ['PGWEB_HOST'],
        port=os.environ['PGWEB_PORT'],
        user=os.environ['PGWEB_USER'],
        password=os.environ['PGWEB_PASSWORD'],
        dbname=DB_NAME,  # was hard-coded, which silently ignored PGDATABASE
    )
    try:
        with conn:  # commit on success / rollback on exception
            yield conn
    finally:
        conn.close()


with get_conn() as conn:
    with conn.cursor() as cur:
        cur.execute('select current_database(), current_user, version()')
        db, usr, ver = cur.fetchone()
        print('Connected to DB:', db)
        print('As user:', usr)
        print('Postgres:', ver.split(',')[0])

# %%
# List tables in the database (user schemas only, excluding system + PostGIS internals).
TABLES_SQL = """
select
    table_schema,
    table_name,
    table_type
from information_schema.tables
where table_schema not in ('pg_catalog', 'information_schema', 'tiger', 'tiger_data', 'topology')
order by table_schema, table_name
"""

with get_conn() as conn:
    tables_df = pd.read_sql(TABLES_SQL, conn)

print(f'{len(tables_df):,} tables/views found')
tables_df

# %%
# Inspect columns for the tables we want to join.
INSPECT_TABLES = [
    'master_data_centers',
    'master_data_center_spatial_clusters',
    'data_center_census_tracts_2024',
    '_dc_census_tract_acs_2024',
    'energy_eia_operating_generator_capacity_flat',
]

COLS_SQL = """
select table_name, column_name, data_type
from information_schema.columns
where table_schema = 'public' and table_name = any(%s)
order by table_name, ordinal_position
"""

with get_conn() as conn:
    # The list is passed as a single bound parameter and adapted to a SQL array.
    cols_df = pd.read_sql(COLS_SQL, conn, params=(INSPECT_TABLES,))

for t in INSPECT_TABLES:
    sub = cols_df[cols_df['table_name'] == t]
    print(f'\n=== {t} ({len(sub)} cols) ===')
    print(sub[['column_name', 'data_type']].to_string(index=False))
# %% [markdown]
# ## Ingest RUCA codes
#
# USDA Rural-Urban Commuting Area (RUCA) codes classify each census tract on a 1–10 scale from "Metropolitan area core" (1) to "Rural area" (10), based on population density and commuting flows. Source file: `new/RUCA-codes-2020-tract.csv` (~85K tracts).

# %%
# Load the RUCA CSV into public.ruca_codes_2020_tract.
# Safe to re-run: the table is dropped and rebuilt on every execution.
from psycopg2.extras import execute_values

RUCA_CSV = Path('new/RUCA-codes-2020-tract.csv')
RUCA_TABLE = 'public.ruca_codes_2020_tract'

# CSV header name -> snake_case database column (insertion order = column order).
COL_MAP = dict(
    TractFIPS23='tract_fips_23',
    CountyFIPS23='county_fips_23',
    CountyCode23='county_code_23',
    CountyName23='county_name_23',
    TractFIPS20='tract_fips_20',
    TractCode20='tract_code_20',
    TractName20='tract_name_20',
    CountyFIPS20='county_fips_20',
    CountyCode20='county_code_20',
    CountyName20='county_name_20',
    StateFIPS20='state_fips_20',
    StateName20='state_name_20',
    UrbanAreaCode20='urban_area_code_20',
    UrbanAreaName20='urban_area_name_20',
    UrbanCore='urban_core',
    UrbanCoreType='urban_core_type',
    PrimaryRUCA='primary_ruca',
    PrimaryRUCADescription='primary_ruca_description',
    PrimaryDestinationCode='primary_destination_code',
    PrimaryDestinationName='primary_destination_name',
    SecondaryRUCA='secondary_ruca',
    SecondaryRUCADescription='secondary_ruca_description',
    SecondaryDestinationCode='secondary_destination_code',
    SecondaryDestinationName='secondary_destination_name',
    Population='population',
    LandArea='land_area',
    PopDensity='pop_density',
)

# The file is Latin-1 encoded (e.g. byte 0xf1 = ñ in Spanish place names).
# Identifier-like columns are read as strings so leading zeros survive.
fips_str_cols = [name for name in COL_MAP if 'FIPS' in name or 'Code' in name]
ruca_df = pd.read_csv(
    RUCA_CSV,
    dtype=dict.fromkeys(fips_str_cols, str),
    encoding='latin-1',
).rename(columns=COL_MAP)
print(f'CSV rows: {len(ruca_df):,} cols: {ruca_df.shape[1]}')

# tract_fips_20 is the primary key (always populated). tract_fips_23 may be null:
# some 2020 tracts no longer exist in 2023 (water-only tracts, dissolves).
DDL = f"""
drop table if exists {RUCA_TABLE};
create table {RUCA_TABLE} (
    tract_fips_23 text,
    county_fips_23 text,
    county_code_23 text,
    county_name_23 text,
    tract_fips_20 text primary key,
    tract_code_20 text,
    tract_name_20 text,
    county_fips_20 text,
    county_code_20 text,
    county_name_20 text,
    state_fips_20 text,
    state_name_20 text,
    urban_area_code_20 text,
    urban_area_name_20 text,
    urban_core smallint,
    urban_core_type text,
    primary_ruca smallint,
    primary_ruca_description text,
    primary_destination_code text,
    primary_destination_name text,
    secondary_ruca text,
    secondary_ruca_description text,
    secondary_destination_code text,
    secondary_destination_name text,
    population integer,
    land_area double precision,
    pop_density double precision
);
create index ruca_codes_2020_tract_state_idx on {RUCA_TABLE} (state_fips_20);
create index ruca_codes_2020_tract_primary_ruca_idx on {RUCA_TABLE} (primary_ruca);
create index ruca_codes_2020_tract_fips_23_idx on {RUCA_TABLE} (tract_fips_23);
"""

cols = [db_col for db_col in COL_MAP.values()]

# One tuple per CSV row, with pandas missing values turned into SQL NULLs.
records = []
for row in ruca_df[cols].itertuples(index=False, name=None):
    records.append(tuple(None if pd.isna(cell) else cell for cell in row))

insert_sql = f"insert into {RUCA_TABLE} ({', '.join(cols)}) values %s"
distribution_sql = f"""
    select primary_ruca, count(*) as n
    from {RUCA_TABLE}
    group by 1 order by 1
"""

with get_conn() as conn:
    with conn.cursor() as cur:
        # Drop/create and the bulk insert share one transaction.
        cur.execute(DDL)
        execute_values(cur, insert_sql, records, page_size=2000)

        cur.execute(f'select count(*) from {RUCA_TABLE}')
        (row_count,) = cur.fetchone()
        print(f'{RUCA_TABLE}: {row_count:,} rows')

        cur.execute(distribution_sql)
        print('\nRUCA distribution (all US tracts):')
        for code, tract_count in cur.fetchall():
            print(f' {code}: {tract_count:>6,}')

# %%
# Sanity-check EIA generator coordinate ranges.
# Prior note: EIA source data had a longitude sign error. Verify before spatial joining.
EIA_COORD_SQL = """
select
    min(longitude) as lon_min, max(longitude) as lon_max,
    min(latitude) as lat_min, max(latitude) as lat_max,
    count(*) filter (where longitude > 0) as pos_lon_rows,
    count(*) filter (where longitude < 0) as neg_lon_rows,
    count(*) as total_rows
from public.energy_eia_operating_generator_capacity_flat
where longitude is not null and latitude is not null
"""

with get_conn() as conn:
    eia_coord_df = pd.read_sql(EIA_COORD_SQL, conn)

print(eia_coord_df.transpose())
# For US plants we expect longitude in roughly [-180, -65].
# If pos_lon_rows is large, the sign-flip correction is still needed when spatial-joining.

# %%
# Build the joined analysis dataset.
#
# Joins:
#   master_data_centers (m)
#   LEFT JOIN master_data_center_spatial_clusters (c) ON master_id
#   LEFT JOIN _dc_census_tract_acs_2024 (acs) ON m.geoid = acs.geoid
#   LEFT JOIN ruca_codes_2020_tract (ruca) ON m.geoid = ruca.tract_fips_20
#   LEFT JOIN (EIA operating generators within RADIUS_KM, latest period) aggregated per DC
#
# Energy aggregation: latest period, status='OP', sum of nameplate_capacity_mw
# (and counts) within RADIUS_KM, broken out by fuel.
#
# Caveat: energy_nearby only considers DCs with a non-null geom, and the final
# select coalesces missing energy rows to 0. A DC without geometry is therefore
# reported with 0 generators / 0 MW, indistinguishable from a DC that genuinely
# has nothing within RADIUS_KM.

RADIUS_KM = 50

JOIN_SQL = f"""
with latest_period as (
    select max(period) as period
    from public.energy_eia_operating_generator_capacity_flat
),
eia_latest as (
    select e.plant_id, e.generator_id, e.energy_source_code,
           e.nameplate_capacity_mw, e.geom
    from public.energy_eia_operating_generator_capacity_flat e
    join latest_period lp on e.period = lp.period
    where e.status = 'OP' and e.geom is not null
),
energy_nearby as (
    select
        m.master_id,
        count(*) as eia_gen_count,
        count(distinct plant_id) as eia_plant_count,
        sum(nameplate_capacity_mw) as eia_capacity_mw,
        sum(nameplate_capacity_mw) filter (where energy_source_code = 'NG') as eia_capacity_ng,
        sum(nameplate_capacity_mw) filter (where energy_source_code in ('BIT','SUB','LIG','RC','ANT')) as eia_capacity_coal,
        sum(nameplate_capacity_mw) filter (where energy_source_code = 'NUC') as eia_capacity_nuclear,
        sum(nameplate_capacity_mw) filter (where energy_source_code = 'SUN') as eia_capacity_solar,
        sum(nameplate_capacity_mw) filter (where energy_source_code = 'WND') as eia_capacity_wind,
        sum(nameplate_capacity_mw) filter (where energy_source_code = 'WAT') as eia_capacity_hydro,
        sum(nameplate_capacity_mw) filter (where energy_source_code = 'GEO') as eia_capacity_geothermal
    from public.master_data_centers m
    join eia_latest e
      on st_dwithin(m.geom::geography, e.geom::geography, {RADIUS_KM} * 1000)
    where m.geom is not null
    group by m.master_id
)
select
    m.master_id, m.name, m.operator, m.city, m.state, m.country,
    m.power_mw, m.area_sqft, m.longitude, m.latitude, m.geoid,
    c.cluster_id, c.is_noise, c.nearest_neighbor_km,
    acs.population, acs.median_age, acs.households, acs.avg_household_size,
    acs.median_household_income, acs.per_capita_income,
    acs.poverty_rate, acs.unemployment_rate,
    acs.bachelor_or_higher_pct, acs.broadband_subscription_pct,
    acs.hispanic_latino_pct, acs.hispanic_latino_population,
    acs.non_hispanic_white_pct, acs.non_hispanic_white_population,
    acs.non_hispanic_black_pct, acs.non_hispanic_black_population,
    acs.non_hispanic_asian_pct, acs.non_hispanic_asian_population,
    acs.primary_industry, acs.primary_industry_pct,
    ruca.primary_ruca, ruca.primary_ruca_description,
    ruca.urban_core, ruca.urban_core_type,
    ruca.pop_density as tract_pop_density,
    ruca.land_area as tract_land_area_sqmi,
    coalesce(en.eia_gen_count, 0) as eia_gen_count,
    coalesce(en.eia_plant_count, 0) as eia_plant_count,
    coalesce(en.eia_capacity_mw, 0) as eia_capacity_mw,
    coalesce(en.eia_capacity_ng, 0) as eia_capacity_ng,
    coalesce(en.eia_capacity_coal, 0) as eia_capacity_coal,
    coalesce(en.eia_capacity_nuclear, 0) as eia_capacity_nuclear,
    coalesce(en.eia_capacity_solar, 0) as eia_capacity_solar,
    coalesce(en.eia_capacity_wind, 0) as eia_capacity_wind,
    coalesce(en.eia_capacity_hydro, 0) as eia_capacity_hydro,
    coalesce(en.eia_capacity_geothermal, 0) as eia_capacity_geothermal
from public.master_data_centers m
left join public.master_data_center_spatial_clusters c on c.master_id = m.master_id
left join public._dc_census_tract_acs_2024 acs on acs.geoid = m.geoid
left join public.ruca_codes_2020_tract ruca on ruca.tract_fips_20 = m.geoid
left join energy_nearby en on en.master_id = m.master_id
"""

with get_conn() as conn:
    joined_df = pd.read_sql(JOIN_SQL, conn)

# Every downstream count/median assumes exactly one row per data center. A
# duplicate key in any of the left-joined tables would silently fan rows out,
# so fail loudly here instead of producing inflated statistics.
assert joined_df['master_id'].is_unique, (
    'JOIN_SQL produced duplicate master_id rows: one of the joined tables '
    '(clusters / ACS / RUCA) has a non-unique join key'
)

print(f'rows: {len(joined_df):,} cols: {joined_df.shape[1]}')
print('non-null geoid: ', joined_df['geoid'].notna().sum())
print('non-null cluster_id: ', joined_df['cluster_id'].notna().sum())
print('non-null primary_ruca: ', joined_df['primary_ruca'].notna().sum())
print('DCs with >=1 nearby gen: ', (joined_df['eia_gen_count'] > 0).sum())
print(f"median nearby capacity: {joined_df['eia_capacity_mw'].median():,.0f} MW")
print(f" 90th percentile: {joined_df['eia_capacity_mw'].quantile(0.9):,.0f} MW")
print(f" max: {joined_df['eia_capacity_mw'].max():,.0f} MW")
joined_df.head()

# %% [markdown]
# ## Quick demographic analysis
#
# The joined dataset has one row per data center, enriched with the demographics of its containing census tract. Note that multiple DCs can share a tract, so tract-level stats are weighted by DC count in these summaries (i.e. "the average DC sits in a tract with..." rather than "the average DC-hosting tract has...").

# %%
# Top-line demographic profile of the average DC's containing tract.
demo_cols = [
    'population', 'median_age', 'avg_household_size',
    'median_household_income', 'per_capita_income',
    'poverty_rate', 'unemployment_rate',
    'bachelor_or_higher_pct', 'broadband_subscription_pct',
    'hispanic_latino_pct', 'non_hispanic_white_pct',
    'non_hispanic_black_pct', 'non_hispanic_asian_pct',
]
demo_cols = [c for c in demo_cols if c in joined_df.columns]

summary = joined_df[demo_cols].agg(['count', 'mean', 'median', 'std', 'min', 'max']).round(2).T
summary.columns = ['n', 'mean', 'median', 'std', 'min', 'max']

# US national benchmarks (ACS 5-yr ~2024) for context.
# Single source of truth: the histogram cell below reads its reference lines
# from this dict instead of repeating the numbers.
benchmarks = {
    'median_household_income': 78_538,
    'per_capita_income': 43_313,
    'poverty_rate': 12.4,
    'unemployment_rate': 5.4,
    'bachelor_or_higher_pct': 35.0,
    'broadband_subscription_pct': 89.0,
    'hispanic_latino_pct': 19.5,
    'non_hispanic_white_pct': 58.4,
    'non_hispanic_black_pct': 12.1,
    'non_hispanic_asian_pct': 6.4,
}
# Metrics without a benchmark (population, median_age, ...) get NaN.
summary['us_avg'] = pd.Series(benchmarks).reindex(summary.index)
summary['vs_us'] = (summary['mean'] - summary['us_avg']).round(2)
summary

# %%
# Geographic concentration: where are the data centers, and what do those places look like?
state_summary = (
    joined_df.groupby('state', dropna=False)
    .agg(
        dc_count=('master_id', 'count'),
        avg_power_mw=('power_mw', 'mean'),
        total_power_mw=('power_mw', 'sum'),
        median_hh_income=('median_household_income', 'median'),
        median_poverty=('poverty_rate', 'median'),
        median_bachelor_pct=('bachelor_or_higher_pct', 'median'),
        median_broadband_pct=('broadband_subscription_pct', 'median'),
        median_pct_white=('non_hispanic_white_pct', 'median'),
        median_pct_hispanic=('hispanic_latino_pct', 'median'),
        median_pct_black=('non_hispanic_black_pct', 'median'),
    )
    .sort_values('dc_count', ascending=False)
    .round(1)
)
print(f'{joined_df["state"].nunique()} states/territories represented')
state_summary.head(15)

# %%
# Cluster vs. non-cluster: do DCs in spatial clusters sit in different demographic settings
# than isolated ones? (cluster_id is null/is_noise=True for unclustered DCs.)
# `!= True` is deliberate: a null is_noise must count as "not noise".
joined_df['in_cluster'] = joined_df['cluster_id'].notna() & (joined_df['is_noise'] != True)  # noqa: E712

compare_cols = [
    'median_household_income', 'per_capita_income',
    'poverty_rate', 'bachelor_or_higher_pct', 'broadband_subscription_pct',
    'non_hispanic_white_pct', 'hispanic_latino_pct', 'non_hispanic_black_pct',
    'population', 'eia_gen_count',
]
compare_cols = [c for c in compare_cols if c in joined_df.columns]

cluster_compare = (
    joined_df.groupby('in_cluster')[compare_cols]
    .median()
    .round(1)
    .T
    .rename(columns={False: 'isolated', True: 'in_cluster'})
)
cluster_compare['delta'] = (cluster_compare['in_cluster'] - cluster_compare['isolated']).round(1)
print(f"DCs in a cluster: {joined_df['in_cluster'].sum():,} isolated: {(~joined_df['in_cluster']).sum():,}")
cluster_compare

# %%
# Quick visual sweep: distribution of key demographic features for DC tracts,
# with US-average reference lines for context.
import matplotlib.pyplot as plt

panel_labels = [
    ('median_household_income', 'Median household income (USD)'),
    ('poverty_rate', 'Poverty rate (%)'),
    ('bachelor_or_higher_pct', "Bachelor's degree or higher (%)"),
    ('broadband_subscription_pct', 'Broadband subscription (%)'),
    ('non_hispanic_white_pct', 'Non-Hispanic white (%)'),
    ('hispanic_latino_pct', 'Hispanic/Latino (%)'),
]
# (column, US benchmark, title); the benchmark comes from the `benchmarks`
# dict defined with the summary table so the two cells cannot drift apart.
panels = [
    (col, benchmarks[col], label)
    for col, label in panel_labels
    if col in joined_df.columns
]

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
for ax, (col, bench, label) in zip(axes.ravel(), panels):
    s = joined_df[col].dropna()
    ax.hist(s, bins=40, color='steelblue', edgecolor='white', alpha=0.85)
    ax.axvline(s.median(), color='darkorange', linestyle='-', lw=2, label=f'DC median = {s.median():.1f}')
    ax.axvline(bench, color='firebrick', linestyle='--', lw=2, label=f'US avg = {bench}')
    ax.set_title(label)
    ax.legend(fontsize=8)

plt.tight_layout()
plt.show()

# %% [markdown]
# ## RUCA (urban / rural) analysis
#
# RUCA primary code key:
# - **1**: Metropolitan area core
# - **2**: Metropolitan area high commuting
# - **3**: Metropolitan area low commuting
# - **4–6**: Micropolitan area (small city + commuting tracts)
# - **7–9**: Small town (core + commuting tracts)
# - **10**: Rural area

# %%
# DC distribution across RUCA codes vs.
# the national baseline of all US tracts.
ruca_buckets = {
    1: 'Metro core', 2: 'Metro high-commute', 3: 'Metro low-commute',
    4: 'Micro core', 5: 'Micro high-commute', 6: 'Micro low-commute',
    7: 'Small town core', 8: 'Small town high-commute', 9: 'Small town low-commute',
    10: 'Rural',
}


def ruca_band(r):
    """Collapse a primary RUCA code into one of the four broad bands.

    Mirrors the CASE expression in NATIONAL_SQL exactly: 1-3 Metropolitan,
    4-6 Micropolitan, 7-9 Small town, 10 Rural, anything else 'Unknown'.
    Missing values and codes outside 1-10 map to 'Unknown', so the DC side
    and the national baseline of the over-index ratio use the same banding.
    (The previous version fell through to 'Rural' for every code above 9.)

    Args:
        r: Primary RUCA code (int, float, or a pandas missing value).

    Returns:
        The band label as a string.
    """
    if pd.isna(r):
        return 'Unknown'
    code = int(r)
    if 1 <= code <= 3:
        return 'Metropolitan'
    if 4 <= code <= 6:
        return 'Micropolitan'
    if 7 <= code <= 9:
        return 'Small town'
    if code == 10:
        return 'Rural'
    return 'Unknown'


dc_ruca = joined_df.copy()
dc_ruca['ruca_label'] = dc_ruca['primary_ruca'].map(ruca_buckets)
dc_ruca['ruca_band'] = dc_ruca['primary_ruca'].apply(ruca_band)

# National baseline (share of US tracts in each band).
NATIONAL_SQL = """
select
    case
        when primary_ruca between 1 and 3 then 'Metropolitan'
        when primary_ruca between 4 and 6 then 'Micropolitan'
        when primary_ruca between 7 and 9 then 'Small town'
        when primary_ruca = 10 then 'Rural'
        else 'Unknown'
    end as ruca_band,
    count(*) as tracts
from public.ruca_codes_2020_tract
group by 1
"""
with get_conn() as conn:
    national_df = pd.read_sql(NATIONAL_SQL, conn)
national_df['tracts_pct'] = (100 * national_df['tracts'] / national_df['tracts'].sum()).round(1)

dc_by_band = (
    dc_ruca.groupby('ruca_band').size().rename('dcs').to_frame()
    .assign(dcs_pct=lambda d: (100 * d['dcs'] / d['dcs'].sum()).round(1))
)
band_compare = dc_by_band.join(national_df.set_index('ruca_band')[['tracts', 'tracts_pct']])
# over_index > 1 means DCs are over-represented in the band relative to US tracts.
band_compare['over_index'] = (band_compare['dcs_pct'] / band_compare['tracts_pct']).round(2)
print('Data centers vs. all US tracts, by RUCA band:')
print(band_compare.reindex(['Metropolitan', 'Micropolitan', 'Small town', 'Rural', 'Unknown']).fillna(0))

# %%
# Fine-grained RUCA breakdown: DC count, median power, demographics, energy infra
# at each of the 10 RUCA codes.
ruca_profile = (
    dc_ruca.groupby('primary_ruca', dropna=False)
    .agg(
        dcs=('master_id', 'count'),
        median_power_mw=('power_mw', 'median'),
        total_power_mw=('power_mw', 'sum'),
        med_hh_income=('median_household_income', 'median'),
        med_poverty=('poverty_rate', 'median'),
        med_bachelor_pct=('bachelor_or_higher_pct', 'median'),
        med_pct_white=('non_hispanic_white_pct', 'median'),
        med_pct_black=('non_hispanic_black_pct', 'median'),
        med_pct_hispanic=('hispanic_latino_pct', 'median'),
        med_pop_density=('tract_pop_density', 'median'),
        med_eia_gens_50km=('eia_gen_count', 'median'),
    )
    .round(1)
)
ruca_profile.insert(0, 'description', ruca_profile.index.map(ruca_buckets))
print('Per-RUCA-code profile of data centers:')
ruca_profile

# %%
# Plot: DC count by RUCA band vs.
# national tract share.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
share_ax, index_ax = axes

order = ['Metropolitan', 'Micropolitan', 'Small town', 'Rural']
plot_df = band_compare.reindex(order).fillna(0)

# Left panel: grouped bars, DC share next to the national tract share.
width = 0.38
half = width / 2
positions = list(range(len(plot_df)))
share_ax.bar([p - half for p in positions], plot_df['dcs_pct'], width,
             label='Data centers', color='steelblue')
share_ax.bar([p + half for p in positions], plot_df['tracts_pct'], width,
             label='All US tracts', color='lightgray', edgecolor='gray')
share_ax.set_xticks(positions)
share_ax.set_xticklabels(plot_df.index, rotation=15)
share_ax.set_ylabel('% of total')
share_ax.set_title('DC share vs. national tract share, by RUCA band')
share_ax.legend()

# Right panel: over-index ratio; red bars mark bands where DCs are over-represented.
bar_colors = []
for ratio in plot_df['over_index']:
    bar_colors.append('firebrick' if ratio > 1 else 'steelblue')
index_ax.barh(plot_df.index, plot_df['over_index'], color=bar_colors)
index_ax.axvline(1.0, color='black', linestyle='--', lw=1)
index_ax.set_xlabel('Over-index (1.0 = at parity with national)')
index_ax.set_title('How much DCs over- or under-represent each RUCA band')
for row_pos, ratio in enumerate(plot_df['over_index']):
    index_ax.text(ratio, row_pos, f' {ratio:.2f}x', va='center')

plt.tight_layout()
plt.show()

# %%
# The non-metro tail: who's building in rural / small-town / micropolitan tracts?
# These are often the most interesting builds (hyperscale greenfield, low-cost power).
is_nonmetro = dc_ruca['ruca_band'].isin(['Rural', 'Small town', 'Micropolitan'])
nonmetro = dc_ruca[is_nonmetro].copy()

print(f'Non-metro DCs: {len(nonmetro):,}\n')

# Top operators in non-metro tracts.
print('Top operators in non-metro tracts:')
ops_by_count = nonmetro.groupby('operator', dropna=False).agg(
    dcs=('master_id', 'count'),
    total_power_mw=('power_mw', 'sum'),
    median_power_mw=('power_mw', 'median'),
)
top_ops = ops_by_count.sort_values('dcs', ascending=False).head(15).round(1)
print(top_ops, '\n')

# Top states in non-metro tracts.
print('Top states for non-metro DCs:')
states_by_count = nonmetro.groupby('state', dropna=False).agg(
    dcs=('master_id', 'count'),
    total_power_mw=('power_mw', 'sum'),
    med_pop_density=('tract_pop_density', 'median'),
)
top_states = states_by_count.sort_values('dcs', ascending=False).head(10).round(1)
print(top_states, '\n')

# The biggest non-metro builds by power.
print('Largest non-metro DCs by stated power_mw:')
big_cols = ['name', 'operator', 'city', 'state', 'power_mw',
            'primary_ruca', 'primary_ruca_description', 'tract_pop_density']
with_power = nonmetro.dropna(subset=['power_mw'])
big_nonmetro = with_power.nlargest(15, 'power_mw')[big_cols].reset_index(drop=True)
big_nonmetro

# %%
# power_mw coverage across the DC universe — most rows are null, which is why
# "biggest non-metro by power" surfaced only a handful.
has_power = dc_ruca['power_mw'].notna()
coverage = (
    dc_ruca.assign(has_power=has_power)
    .groupby('ruca_band', dropna=False)
    .agg(dcs=('master_id', 'count'),
         with_power_mw=('has_power', 'sum'))
)
coverage['pct_with_power'] = (100 * coverage['with_power_mw'] / coverage['dcs']).round(1)
print('power_mw coverage by RUCA band:')
print(coverage)
n_with_power = has_power.sum()
pct_with_power = 100 * has_power.mean()
print(f"\nOverall: {n_with_power:,} / {len(dc_ruca):,} DCs have power_mw "
      f"({pct_with_power:.1f}%)")

# %%
# Now that EIA
# nameplate_capacity_mw is loaded, size non-metro DCs by the
# generation capacity within 50 km of each site (instead of the sparse power_mw).
# Re-derive non-metro slice from the updated joined_df.
#
# Non-metro means primary RUCA 4-10 (Micropolitan, Small town, Rural), the same
# definition as ruca_band() and the earlier non-metro cell. The previous filter
# also admitted codes 2 and 3, which are metropolitan commuting tracts, so
# metro-area campuses were ranked under a "non-metro" heading.
NONMETRO_RUCA_CODES = list(range(4, 11))

nonmetro = joined_df[joined_df['primary_ruca'].isin(NONMETRO_RUCA_CODES)].copy()
nonmetro['ruca_band'] = nonmetro['primary_ruca'].apply(ruca_band)

print(f'Non-metro DCs: {len(nonmetro):,}\n')

# Largest non-metro DCs ranked by nearby grid capacity.
big_by_grid = (
    nonmetro.sort_values('eia_capacity_mw', ascending=False)
    .head(20)
    [['name', 'operator', 'city', 'state',
      'primary_ruca', 'primary_ruca_description',
      'eia_capacity_mw', 'eia_capacity_nuclear', 'eia_capacity_hydro',
      'eia_capacity_ng', 'eia_capacity_coal',
      'eia_capacity_solar', 'eia_capacity_wind']]
    .round(0)
    .reset_index(drop=True)
)
print('Largest non-metro DCs by nearby grid capacity (50 km):')
big_by_grid

# %%
# Hyperscalers' non-metro footprint, sized by surrounding grid capacity + fuel mix.
# Operator spellings are matched exactly; anything not listed keeps its own name.
hyperscaler_map = {
    'Amazon Web Services': 'AWS', 'Amazon AWS': 'AWS', 'Amazon': 'AWS',
    'Microsoft': 'Microsoft',
    'Meta': 'Meta', 'Meta, Inc.': 'Meta', 'Facebook': 'Meta',
    'Google': 'Google', 'Alphabet': 'Google',
    'Apple': 'Apple',
    'Oracle': 'Oracle',
    'Yahoo': 'Yahoo',
}
# Mapped group if known, else the raw operator name, else 'Unknown' for nulls.
nonmetro['op_group'] = nonmetro['operator'].map(hyperscaler_map).fillna(
    nonmetro['operator'].where(nonmetro['operator'].notna(), 'Unknown')
)

# 'Unknown' (null operator) is kept in the view alongside the named hyperscalers.
hyperscaler_view = (
    nonmetro[nonmetro['op_group'].isin(['AWS', 'Microsoft', 'Meta', 'Google', 'Apple', 'Oracle', 'Yahoo', 'Unknown'])]
    .groupby('op_group')
    .agg(
        dcs=('master_id', 'count'),
        states=('state', 'nunique'),
        sum_nearby_capacity_mw=('eia_capacity_mw', 'sum'),
        median_nearby_capacity_mw=('eia_capacity_mw', 'median'),
        sum_nearby_hydro_mw=('eia_capacity_hydro', 'sum'),
        sum_nearby_nuclear_mw=('eia_capacity_nuclear', 'sum'),
        sum_nearby_ng_mw=('eia_capacity_ng', 'sum'),
        sum_nearby_solar_mw=('eia_capacity_solar', 'sum'),
        sum_nearby_wind_mw=('eia_capacity_wind', 'sum'),
    )
    .sort_values('dcs', ascending=False)
    .round(0)
)
print("Non-metro DCs by operator group, sized by aggregate nearby grid capacity:")
hyperscaler_view

# ---------------------------------------------------------------------------
# End of cluster_analysis.ipynb (kernelspec "python3", nbformat 4.5).
# The patch continues with ingest_eia_energy_layers.py, hunk @@ -78,7 +78,13 @@,
# whose leading context comment reads: "values must be requested explicitly.
# seds returns only id columns; the numeric value column must be requested explicitly."
EIA_DATASET_DATA_FIELDS = { - "electricity/operating-generator-capacity": ["latitude", "longitude"], + "electricity/operating-generator-capacity": [ + "latitude", + "longitude", + "nameplate-capacity-mw", + "net-summer-capacity-mw", + "net-winter-capacity-mw", + ], "electricity/facility-fuel": ["generation", "gross-generation"], "seds": ["value"], } @@ -897,6 +903,9 @@ def build_flat_tables(conn): properties->>'balancing-authority-name' as balancing_authority_name, latitude_raw as latitude, longitude_fixed as longitude, + nullif(properties->>'nameplate-capacity-mw', '')::double precision as nameplate_capacity_mw, + nullif(properties->>'net-summer-capacity-mw', '')::double precision as net_summer_capacity_mw, + nullif(properties->>'net-winter-capacity-mw', '')::double precision as net_winter_capacity_mw, properties as raw_properties from fixed """ diff --git a/output/data_center_demographic_ruca_energy_summary.md b/output/data_center_demographic_ruca_energy_summary.md new file mode 100644 index 0000000..d7d6fad --- /dev/null +++ b/output/data_center_demographic_ruca_energy_summary.md @@ -0,0 +1,222 @@ +# US Data Centers — Demographic, Urban-Rural & Energy Context Analysis + +**Date:** 2026-05-18 +**Notebook:** [cluster_analysis.ipynb](../cluster_analysis.ipynb) +**Universe:** 1,833 data centers in `public.master_data_centers`, joined to ACS-2024 demographics, USDA RUCA-2020 codes, and EIA operating-generator capacity (50 km radius, latest period 2026-02, status=OP). + +> **Update 2026-05-18**: 196 previously-null `state` values were backfilled from `geoid` (first 2 chars = state FIPS). All 1,833 DCs now have a state; all state-level numbers below reflect the corrected attribution. + +--- + +## Headline findings + +1. **DC tracts are richer, more educated, and more diverse than the US average.** Median household income $103,623 vs. national $78,538 (+32%); 49% bachelor's+ vs. 35% (+14 pp); poverty rate 7.2% vs. 12.4%. 
Non-Hispanic white share is *below* national (50% vs. 58%), driven mainly by Asian-heavy tracts (mean 13% vs. 6%); the mean Hispanic share (19.5%) matches the national average. +2. **The metro skew is more modest than expected: 1.11×.** 89% of DCs sit in metropolitan tracts, but 80% of *all* US tracts are metropolitan — so DCs are only slightly more concentrated than the underlying tract distribution would predict. +3. **The non-metro tail is overwhelmingly hyperscale and Pacific Northwest.** Of 190 DCs outside metropolitan tracts (RUCA 4–10), AWS owns 67, Meta 22, Microsoft 10, Google 4, Yahoo 2 — combined 55% of the non-metro footprint. Oregon (86) and Washington (40) alone hold 66% of non-metro DCs, anchored to the Columbia River hydropower corridor. +4. **Clustered DCs are demographically distinct from isolated ones.** DCs in DBSCAN clusters (n=1,583) sit in tracts with $108K median income vs. $73K for isolated DCs (n=250) — a $35K gap. Clustered DCs are more educated (+18 pp bachelor's), more diverse (−25 pp non-Hispanic white), and embedded in much denser energy infrastructure (89 vs. 40 generators within 50 km). +5. **Microsoft co-locates with the largest US nuclear plant.** Microsoft's Goodyear, AZ campus has 14.6 GW of generation within 50 km — including 4.2 GW from Palo Verde Nuclear, the largest in the US. Despite the campus being in a RUCA-2 "Metro high-commute" tract (not strictly metro core), the surrounding grid is the densest by capacity among the RUCA 2–10 sites ranked in §6. 
+ +--- + +## Dataset coverage and joins + +| Source table | Rows | Join key | Coverage | +|---|---|---|---| +| `master_data_centers` | 1,833 | base | — | +| `master_data_center_spatial_clusters` | 1,831 | `master_id` | 99.9% | +| `_dc_census_tract_acs_2024` | ~73,000 tracts | `geoid` | 1,807 matched (98.6%) | +| `ruca_codes_2020_tract` | 85,528 tracts | `tract_fips_20 = geoid` | 1,826 matched (99.6%) | +| `energy_eia_operating_generator_capacity_flat` | 4.7M rows | `ST_DWithin(geom, 50km)` | 1,831 DCs have ≥1 nearby gen | + +Energy aggregation uses period `2026-02` only with `status='OP'`, summing `nameplate_capacity_mw` for operating generators within 50 km of each DC. Note: EIA capacity columns were added to this table on 2026-05-17 — prior to that the `_flat` table had no MW values despite its name. + +--- + +## 1. Demographic profile of DC tracts (n=1,807 with non-null ACS) + +| Metric | DC tract (median) | DC tract (mean) | US avg | Δ mean vs. US | +|---|---:|---:|---:|---:| +| Median household income | $103,623 | $114,543 | $78,538 | **+$36,005** | +| Per-capita income | $51,283 | $55,725 | $43,313 | +$12,412 | +| Poverty rate | 7.2% | 10.1% | 12.4% | −2.3 pp | +| Unemployment rate | 3.5% | 4.4% | 5.4% | −1.0 pp | +| Bachelor's+ % | 49.3% | 46.2% | 35.0% | **+11.2 pp** | +| Broadband subscription % | 94.9% | 93.5% | 89.0% | +4.5 pp | +| Non-Hispanic white % | 50.2% | 51.0% | 58.4% | **−7.4 pp** | +| Hispanic / Latino % | 12.8% | 19.5% | 19.5% | 0.0 pp | +| Non-Hispanic Black % | 5.9% | 10.6% | 12.1% | −1.5 pp | +| Non-Hispanic Asian % | 6.4% | 13.4% | 6.4% | **+7.0 pp** | + +**Interpretation.** DC tracts skew toward high-income, highly-educated, technically connected, and racially diverse (specifically Asian-heavy). The race composition is interesting: DC tracts are *less* non-Hispanic white than national average, not more. 
This reflects DC siting in mixed-race coastal/exurban tech corridors (Bay Area, Northern Virginia, Seattle) rather than in homogeneous suburbs. + +**Data quality note.** `avg_household_size` contains sentinel-value pollution (min: −666,666,666), so the mean is unusable; the median (2.55) is sensible. + +--- + +## 2. Geographic concentration (top 15 states) + +| State | DC count | Total power_mw (where known) | Median HH income | Median bachelor's % | Median % white | Notes | +|---|---:|---:|---:|---:|---:|---| +| **VA** | **378** | 255 | $141,250 | 62.6% | 42.5% | Loudoun / DC-Alley dominance (20.6% of all US DCs) | +| TX | 162 | 597 | $88,228 | 46.2% | 32.0% | DFW + Austin + San Antonio | +| CA | 147 | 130 | $164,928 | 56.4% | 22.4% | Bay Area + LA basin | +| OR | 145 | 125 | $72,719 | 20.0% | 63.2% | Columbia River hydro corridor (rural) | +| OH | 103 | 135 | $128,875 | 47.0% | 74.5% | Columbus boom — fastest-rising market | +| WA | 93 | 70 | $91,623 | 21.9% | 40.3% | Quincy/Wenatchee + Seattle | +| AZ | 69 | 54 | $85,335 | 35.2% | 51.6% | Phoenix/Goodyear hyperscale | +| IA | 65 | 0 | $93,393 | 34.3% | 88.1% | 88% white (rural Midwest) | +| NJ | 62 | 98 | $147,321 | 59.4% | 32.9% | NYC-metro carrier hotels | +| IL | 61 | 128 | $96,191 | 52.9% | 52.0% | Chicago metro | +| GA | 50 | 241 | $101,176 | 51.4% | 31.6% | Atlanta + high-power rural builds | +| NY | 48 | 47 | $77,465 | 47.6% | 74.8% | NYC + upstate | +| NV | 41 | 0 | $93,409 | 31.2% | 34.6% | Reno + Las Vegas | +| TN | 32 | 0 | — | — | 54.8% | Nashville + Memphis (newly visible after state backfill) | +| NC | 31 | 56 | $82,708 | 44.7% | 59.6% | Charlotte + Catawba (nuclear-adjacent) | + +**Virginia alone holds 20.6% of all US DCs** (378 of 1,833), with the most educated tract profile in the top 15 (62.6% median bachelor's+) and the third-highest median household income ($141,250, behind CA and NJ) — a Loudoun County effect. The state backfill substantially elevated **Ohio (76 → 103)** and **Texas (135 → 162)**, pushing TX into the #2 slot. 
The previously-uncounted **Tennessee (32) now appears in the top 15**. + +Oregon and Washington tracts look notably different from the urban-heavy states (lower income, lower education, lower broadband, higher Hispanic share), reflecting their rural Columbia River siting. + +--- + +## 3. Spatially clustered DCs vs. isolated DCs + +DBSCAN cluster assignment from `master_data_center_spatial_clusters` (1,583 clustered, 250 isolated): + +| Metric (median) | Isolated | In cluster | Δ | +|---|---:|---:|---:| +| Median household income | $73,500 | $108,359 | **+$34,859** | +| Bachelor's+ % | 33.2 | 51.2 | **+18.0 pp** | +| Poverty rate | 11.6 | 6.9 | −4.7 pp | +| Non-Hispanic white % | 71.0 | 45.9 | **−25.1 pp** | +| EIA generators within 50 km | 40 | 89 | +49 | +| EIA capacity within 50 km (MW) | 2,176 | 3,300 | +1,125 | + +**Reading.** A clustered data center sits, at the median, in a tract that is ~$35K richer, 18 pp more educated, and 25 pp less non-Hispanic white than an isolated one — and is surrounded by twice as much energy infrastructure (and 50% more generation capacity). The isolated set looks like rural / small-town America (whiter, poorer, less educated); the clustered set looks like coastal exurban tech corridors. + +--- + +## 4. RUCA (urban-rural) distribution + +National baseline of all US tracts: 80% Metropolitan, 9% Micropolitan, 3% Small town, 8% Rural. + +| RUCA band | DCs | DC % | US tract % | Over-index | +|---|---:|---:|---:|---:| +| Metropolitan (1–3) | 1,636 | 89.3% | 80.1% | **1.11×** | +| Micropolitan (4–6) | 98 | 5.3% | 9.0% | 0.59× | +| Small town (7–9) | 15 | 0.8% | 2.9% | 0.28× | +| Rural (10) | 77 | 4.2% | 7.6% | 0.55× | +| Unknown / missed match | 7 | 0.4% | — | — | + +**Reading.** The metro skew is real but only mild — 1.11×. 
The eye-catching pattern is that **rural tracts (RUCA 10) hold five times as many DCs as small-town tracts (77 vs. 15) and nearly as many as micropolitan tracts (98)**, because the hyperscale greenfield model deliberately bypasses small-city economies in favor of remote, cheap-power, low-population sites. + +### Per-RUCA-code drilldown + +| RUCA | Description | DCs | Median HH income | Median pop density | Median EIA gens (50km) | +|---:|---|---:|---:|---:|---:| +| 1 | Metro core | 1,425 | $110,333 | 1,859 / sq mi | 97 | +| 2 | Metro high-commute | 206 | $105,404 | 96 | 49 | +| 3 | Metro low-commute | 5 | $119,495 | 22 | 23 | +| 4 | Micropolitan core | 54 | $63,698 | 312 | 53 | +| 5 | Micropolitan high-commute | 22 | $72,465 | 191 | 51 | +| 6 | Micropolitan low-commute | 22 | $72,719 | 69 | 59 | +| 7 | Small town core | 14 | $87,522 | 2,336 | 40 | +| 8 | Small town high-commute | 1 | $69,074 | 36 | 41 | +| 10 | Rural area | 77 | $93,820 | 12 | 42 | + +**Two surprises:** +- Rural DCs (RUCA 10) sit in tracts with $93.8K median income — *higher* than micropolitan DCs ($63.7K–$72.7K). The rural DC sites are not poor rural America; they are wealthy-by-rural-standards counties chosen for power and water access. +- Micropolitan-core DCs (RUCA 4) have the *lowest* median income at $63.7K — the closest thing to "economic-development DC siting" in the dataset. + +--- + +## 5. Non-metro deep dive (190 DCs, RUCA 4–10) + +### Operators + +| Operator | Non-metro DCs | +|---|---:| +| Amazon Web Services | 67 | +| *(null operator)* | 50 | +| Meta | 20 (+ 2 as "Meta, Inc.") | +| Microsoft | 10 | +| Google | 4 | +| Rowan Green Data | 4 | +| NTT | 2 | +| Yahoo | 2 | +| Amazon AWS *(dupe)* | 2 | + +**The five hyperscalers (AWS, Meta, Microsoft, Google, Yahoo) account for 105 of 190 non-metro DCs (55%).** This count folds in the 2 "Meta, Inc." rows but not the 2 "Amazon AWS" duplicate rows; including those gives 107 (56%). If the 50 null-operator rows skew similarly hyperscale (likely — they're disproportionately in OR/WA), the share is probably closer to 75%. 
+ +### States (post-backfill) + +| State | Non-metro DCs | +|---|---:| +| Oregon | 86 | +| Washington | 40 | +| Texas | 9 | +| New Mexico | 7 | +| North Carolina | 6 | +| Pennsylvania | 5 | +| Wisconsin | 4 | +| New York | 3 | +| Tennessee | 3 | +| Georgia | 3 | + +**Oregon + Washington = 126 (66%) of all non-metro DCs.** This is the Columbia River basin: Prineville / Hermiston / Boardman / The Dalles (OR) and Quincy / East Wenatchee / Moses Lake (WA). The pull is hydroelectric power (cheap, low-carbon, abundant) and cool dry climate (free-cooling). + +The state backfill clarified the rest of the non-metro tail: **Texas (9)** and **Pennsylvania (5)** were previously hidden in the null bucket. These likely represent shale-gas-adjacent builds (Permian and Marcellus respectively). + +--- + +## 6. Energy footprint by operator (using EIA capacity within 50 km) + +Aggregated across DCs in RUCA 2–10 (i.e. anything outside dense metro core, n=401). Totals sum each DC's own 50 km capacity, so a generator near several DCs is counted once per DC — read them as aggregate exposure, not unique capacity: + +| Operator | DCs | States | Total nearby capacity (GW) | Median per site (GW) | Hydro (GW) | Nuclear (GW) | NG (GW) | Solar (GW) | Wind (GW) | +|---|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| AWS | 93 | 5 | 397 | 4.8 | 66 | 2.5 | 201 | 4.6 | **114** | +| *(Unknown)* | 118 | 26 | 339 | 2.3 | 86 | 35 | 135 | 23 | 19 | +| Meta | 51 | 11 | 120 | 2.0 | 4.9 | 0 | 61 | 16 | 0.3 | +| Microsoft | 26 | 6 | 113 | 3.4 | 28 | **13** | 39 | 9.1 | 8.1 | +| Google | 31 | 5 | 100 | 3.9 | 14 | 0 | 43 | 3.6 | 4.7 | +| Apple | 5 | 2 | 4 | 0.6 | 1.6 | 0 | 1.1 | 0.9 | 0.4 | +| Yahoo | 2 | 1 | 7 | 3.5 | 6.4 | 0 | 0 | 0 | 0.7 | + +**Distinct hyperscaler strategies, visible in the fuel mix:** +- **AWS** has aggregated 114 GW of *wind* exposure across its 93 sites — by far the most renewable-coupled portfolio. Also heavy hydro (66 GW) from its OR/WA footprint and 201 GW of natural gas as baseline. +- **Microsoft** has the highest *nuclear* exposure among the named hyperscalers (12.6 GW; the *(Unknown)* bucket has 35 GW) — almost entirely from its Goodyear, AZ campuses near Palo Verde Nuclear, whose 4.2 GW is counted once per campus. 
+- **Meta** has the most *solar* (16 GW) among the named hyperscalers, but minimal nuclear or wind — consistent with its New Mexico (Los Lunas) and Iowa builds. +- **Google** is split — moderate hydro and natural gas, modest renewables. + +### Largest grid neighborhoods outside the metro core (RUCA 2–10; top sites by surrounding capacity) + +| DC | Operator | Location | Nearby capacity | Fuel highlight | +|---|---|---|---:|---| +| PHX70 / PHX-10 / PHX-11 | Microsoft (Azure) | Goodyear, AZ (RUCA 2) | 14.0–14.6 GW | **4.2 GW nuclear (Palo Verde)** + 6.4 GW gas + 2.2 GW solar | +| Stream PHX-1 | Stream Data Centers | Goodyear, AZ | 13.4 GW | Same Palo Verde / gas mix | +| T5 Charlotte Campus | T5 | Kings Mountain, NC (RUCA 6) | 12.9 GW | **4.9 GW nuclear** (Catawba) + 5.5 GW gas + 1.5 GW coal | +| Apple Maiden | Apple | Maiden, NC (RUCA 2) | 9.1 GW | 2.4 GW nuclear + 4.6 GW gas | +| Percheron DC | Rowan Green Data | (Texas, RUCA 10) | 6.7 GW | **3.0 GW wind** + 0.9 GW hydro + 2.4 GW gas | + +--- + +## Data quality flags + +1. ~~196 of 1,833 DCs (10.7%) have null `state`~~ **Resolved 2026-05-18** by backfilling from `geoid` first-2-chars (state FIPS). +2. **`master_data_centers.power_mw` is populated for only 108 / 1,833 DCs (5.9%).** Useless as a sizing metric without imputation or alternative source. Nearby EIA capacity is the more reliable proxy (used as the per-DC scale in this analysis). A grant-funded scrape of Baxtel / Data Center Map would close this gap. +3. **50 of 190 non-metro DCs (26%) have null `operator`.** Likely hyperscalers based on geography (OR/WA) but unconfirmed. +4. **Operator-string fragmentation**: "Meta" vs. "Meta, Inc."; "Amazon Web Services" vs. "Amazon AWS" vs. "amazon web services"; "Microsoft" vs. "Microsoft Azure". Inflates distinct-operator counts and fragments per-operator totals. +5. **`avg_household_size` column has sentinel pollution** (min: −666,666,666). Use median or filter before using. +6. 
**7 DCs failed RUCA join** — Puerto Rico tracts or non-US locations; trivial. +7. **EIA generator coordinates had a longitude sign error for 2008-01 through 2010-11** (~11K rows with positive lower-48 longitudes). The flat-table build at [ingest_eia_energy_layers.py:839-870](../ingest_eia_energy_layers.py#L839-L870) corrects this in `longitude` and `geom`, so spatial joins are unaffected. + +--- + +## Suggested next steps + +1. **Backfill `power_mw`** from Baxtel / Data Center Map (paid scrape — grant work). +2. **Operator-string deduplication** — collapse "Meta"/"Meta, Inc.", "AWS" variants, etc., before any per-operator analysis. +3. **Watershed (HUC8) join** — `public.watershed_huc8` is loaded but unused; would let us look at water stress overlap, particularly for the 190 non-metro DCs. +4. **State-level energy demand context** — `im3_state_projected_moderate_50` and `seds_state_msn_year` are loaded; joining these would let us compute "DC nearby capacity as share of state grid" rather than absolute MW. +5. **Opposition cases overlay** — `opposition_cases_geocoded` is loaded but unused; could test whether cluster-vs-isolated demographic differences predict community opposition.