{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {},
   "source": [
    "# Clustering Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "import psycopg2\n",
    "import psycopg2.extensions\n",
    "\n",
    "\n",
    "def load_env_file(env_path: str = '.env') -> None:\n",
    "    \"\"\"Copy KEY=VALUE pairs from a dotenv-style file into ``os.environ``.\n",
    "\n",
    "    Blank lines, ``#`` comment lines and lines without ``=`` are skipped, and\n",
    "    surrounding quotes are stripped from values. Variables already present in\n",
    "    the environment are never overwritten. Prints the number of variables\n",
    "    loaded, or a notice when ``env_path`` does not exist.\n",
    "    \"\"\"\n",
    "    p = Path(env_path)\n",
    "    if not p.exists():\n",
    "        print(f'No {env_path} file found in {Path.cwd()}')\n",
    "        return\n",
    "    loaded = 0\n",
    "    for raw_line in p.read_text(encoding='utf-8').splitlines():\n",
    "        line = raw_line.strip()\n",
    "        if not line or line.startswith('#') or '=' not in line:\n",
    "            continue\n",
    "        key, value = line.split('=', 1)\n",
    "        key = key.strip()\n",
    "        value = value.strip().strip('\"').strip(\"'\")\n",
    "        if key and key not in os.environ:\n",
    "            os.environ[key] = value\n",
    "            loaded += 1\n",
    "    print(f'Loaded {loaded} env var(s) from {env_path}')\n",
    "\n",
    "\n",
    "def require_env(keys):\n",
    "    \"\"\"Raise ``EnvironmentError`` listing every key in ``keys`` that is unset or empty.\"\"\"\n",
    "    missing = [k for k in keys if not os.getenv(k)]\n",
    "    if missing:\n",
    "        raise EnvironmentError(\n",
    "            'Missing required env vars: ' + ', '.join(missing) +\n",
    "            '.\\nSet them in this notebook, or add them to a .env file.'\n",
    "        )\n",
    "\n",
    "\n",
    "load_env_file('.env')\n",
    "\n",
    "required_keys = ['PGWEB_HOST', 'PGWEB_PORT', 'PGWEB_USER', 'PGWEB_PASSWORD']\n",
    "require_env(required_keys)\n",
    "\n",
    "# Target database; override by setting PGDATABASE (environment or .env).\n",
    "DB_NAME = os.getenv('PGDATABASE', 'data_centers')\n",
    "\n",
    "\n",
    "class _ClosingConnection(psycopg2.extensions.connection):\n",
    "    \"\"\"Connection whose ``with`` block closes it after commit/rollback.\n",
    "\n",
    "    A stock psycopg2 connection used as a context manager only ends the\n",
    "    transaction on exit and stays open. Every cell here opens a fresh\n",
    "    connection via ``with get_conn() as conn:``, so without the explicit\n",
    "    close each run of a cell would leave one more session open on the server.\n",
    "    \"\"\"\n",
    "\n",
    "    def __exit__(self, exc_type, exc_value, traceback):\n",
    "        try:\n",
    "            # Stock behaviour: commit on success, roll back on exception.\n",
    "            return super().__exit__(exc_type, exc_value, traceback)\n",
    "        finally:\n",
    "            self.close()\n",
    "\n",
    "\n",
    "def get_conn():\n",
    "    \"\"\"Open a new connection to ``DB_NAME`` using the PGWEB_* credentials.\n",
    "\n",
    "    Intended use is ``with get_conn() as conn:``; the transaction is committed\n",
    "    (or rolled back on error) and the connection is closed when the block exits.\n",
    "    \"\"\"\n",
    "    return psycopg2.connect(\n",
    "        host=os.environ['PGWEB_HOST'],\n",
    "        port=os.environ['PGWEB_PORT'],\n",
    "        user=os.environ['PGWEB_USER'],\n",
    "        password=os.environ['PGWEB_PASSWORD'],\n",
    "        # Previously hard-coded, which silently ignored the PGDATABASE override.\n",
    "        dbname=DB_NAME,\n",
    "        connection_factory=_ClosingConnection,\n",
    "    )\n",
    "\n",
    "\n",
    "# Smoke test: confirm the credentials work and show what we connected to.\n",
    "with get_conn() as conn:\n",
    "    with conn.cursor() as cur:\n",
    "        cur.execute('select current_database(), current_user, version()')\n",
    "        db, usr, ver = cur.fetchone()\n",
    "        print('Connected to DB:', db)\n",
    "        print('As user:', usr)\n",
    "        print('Postgres:', ver.split(',')[0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List tables in the database (user schemas only, excluding system + PostGIS internals).\n",
    "TABLES_SQL = \"\"\"\n",
    "select\n",
    " table_schema,\n",
    " table_name,\n",
    " table_type\n",
    "from information_schema.tables\n",
    "where table_schema not in ('pg_catalog', 'information_schema', 'tiger', 'tiger_data', 'topology')\n",
    "order by table_schema, table_name\n",
    "\"\"\"\n",
    "\n",
    "with get_conn() as conn:\n",
    "    tables_df = pd.read_sql(TABLES_SQL, conn)\n",
    "\n",
    "print(f'{len(tables_df):,} tables/views found')\n",
    "tables_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect columns for the tables we want to join.\n",
    "INSPECT_TABLES = [\n",
    "    'master_data_centers',\n",
    "    'master_data_center_spatial_clusters',\n",
    "    'data_center_census_tracts_2024',\n",
    "    '_dc_census_tract_acs_2024',\n",
    "    'energy_eia_operating_generator_capacity_flat',\n",
    "]\n",
    "\n",
    "# The table list is passed as a single array parameter for ``any(%s)``.\n",
    "COLS_SQL = \"\"\"\n",
    "select table_name, column_name, data_type\n",
    "from information_schema.columns\n",
    "where table_schema = 'public' and table_name = any(%s)\n",
    "order by table_name, ordinal_position\n",
    "\"\"\"\n",
    "\n",
    "with get_conn() as conn:\n",
    "    cols_df = pd.read_sql(COLS_SQL, conn, params=(INSPECT_TABLES,))\n",
    "\n",
    "# One section per requested table; a table with no match prints an empty listing.\n",
    "for t in INSPECT_TABLES:\n",
    "    sub = cols_df[cols_df['table_name'] == t]\n",
    "    print(f'\\n=== {t} ({len(sub)} cols) ===')\n",
    "    print(sub[['column_name', 'data_type']].to_string(index=False))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4",
   "metadata": {},
   "source": [
    "## Ingest RUCA codes\n",
    "\n",
    "USDA Rural-Urban Commuting Area (RUCA) codes classify each census tract on a 1–10 scale from \"Metropolitan area core\" (1) to \"Rural area\" (10), based on population density and commuting flows. 
Source file: `new/RUCA-codes-2020-tract.csv` (~85K tracts).\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the RUCA CSV into public.ruca_codes_2020_tract.\n",
    "# Re-runnable: the table is dropped and rebuilt on every execution.\n",
    "from psycopg2.extras import execute_values\n",
    "\n",
    "RUCA_CSV = Path('new/RUCA-codes-2020-tract.csv')\n",
    "RUCA_TABLE = 'public.ruca_codes_2020_tract'\n",
    "\n",
    "# CSV header -> snake_case column name in the database table.\n",
    "COL_MAP = {\n",
    "    'TractFIPS23': 'tract_fips_23',\n",
    "    'CountyFIPS23': 'county_fips_23',\n",
    "    'CountyCode23': 'county_code_23',\n",
    "    'CountyName23': 'county_name_23',\n",
    "    'TractFIPS20': 'tract_fips_20',\n",
    "    'TractCode20': 'tract_code_20',\n",
    "    'TractName20': 'tract_name_20',\n",
    "    'CountyFIPS20': 'county_fips_20',\n",
    "    'CountyCode20': 'county_code_20',\n",
    "    'CountyName20': 'county_name_20',\n",
    "    'StateFIPS20': 'state_fips_20',\n",
    "    'StateName20': 'state_name_20',\n",
    "    'UrbanAreaCode20': 'urban_area_code_20',\n",
    "    'UrbanAreaName20': 'urban_area_name_20',\n",
    "    'UrbanCore': 'urban_core',\n",
    "    'UrbanCoreType': 'urban_core_type',\n",
    "    'PrimaryRUCA': 'primary_ruca',\n",
    "    'PrimaryRUCADescription': 'primary_ruca_description',\n",
    "    'PrimaryDestinationCode': 'primary_destination_code',\n",
    "    'PrimaryDestinationName': 'primary_destination_name',\n",
    "    'SecondaryRUCA': 'secondary_ruca',\n",
    "    'SecondaryRUCADescription': 'secondary_ruca_description',\n",
    "    'SecondaryDestinationCode': 'secondary_destination_code',\n",
    "    'SecondaryDestinationName': 'secondary_destination_name',\n",
    "    'Population': 'population',\n",
    "    'LandArea': 'land_area',\n",
    "    'PopDensity': 'pop_density',\n",
    "}\n",
    "\n",
    "# Every FIPS / Code column is read as a string rather than a number.\n",
    "# File is Latin-1 (has bytes like 0xf1 = ñ from Spanish place names).\n",
    "fips_str_cols = [src for src in COL_MAP if ('FIPS' in src) or ('Code' in src)]\n",
    "ruca_df = pd.read_csv(\n",
    "    RUCA_CSV,\n",
    "    dtype=dict.fromkeys(fips_str_cols, str),\n",
    "    encoding='latin-1',\n",
    ").rename(columns=COL_MAP)\n",
    "n_csv_rows, n_csv_cols = ruca_df.shape\n",
    "print(f'CSV rows: {n_csv_rows:,} cols: {n_csv_cols}')\n",
    "\n",
    "# PK is tract_fips_20 (always populated). Some tracts that existed in 2020 are gone\n",
    "# in 2023 (water-only tracts, dissolves), so tract_fips_23 can be null.\n",
    "DDL = f\"\"\"\n",
    "drop table if exists {RUCA_TABLE};\n",
    "create table {RUCA_TABLE} (\n",
    " tract_fips_23 text,\n",
    " county_fips_23 text,\n",
    " county_code_23 text,\n",
    " county_name_23 text,\n",
    " tract_fips_20 text primary key,\n",
    " tract_code_20 text,\n",
    " tract_name_20 text,\n",
    " county_fips_20 text,\n",
    " county_code_20 text,\n",
    " county_name_20 text,\n",
    " state_fips_20 text,\n",
    " state_name_20 text,\n",
    " urban_area_code_20 text,\n",
    " urban_area_name_20 text,\n",
    " urban_core smallint,\n",
    " urban_core_type text,\n",
    " primary_ruca smallint,\n",
    " primary_ruca_description text,\n",
    " primary_destination_code text,\n",
    " primary_destination_name text,\n",
    " secondary_ruca text,\n",
    " secondary_ruca_description text,\n",
    " secondary_destination_code text,\n",
    " secondary_destination_name text,\n",
    " population integer,\n",
    " land_area double precision,\n",
    " pop_density double precision\n",
    ");\n",
    "create index ruca_codes_2020_tract_state_idx on {RUCA_TABLE} (state_fips_20);\n",
    "create index ruca_codes_2020_tract_primary_ruca_idx on {RUCA_TABLE} (primary_ruca);\n",
    "create index ruca_codes_2020_tract_fips_23_idx on {RUCA_TABLE} (tract_fips_23);\n",
    "\"\"\"\n",
    "\n",
    "cols = [*COL_MAP.values()]\n",
    "\n",
    "\n",
    "def _none_if_missing(value):\n",
    "    \"\"\"Return None for NaN/NA cells so they are sent to the database as NULL.\"\"\"\n",
    "    return None if pd.isna(value) else value\n",
    "\n",
    "\n",
    "records = [\n",
    "    tuple(map(_none_if_missing, row))\n",
    "    for row in ruca_df[cols].itertuples(index=False, name=None)\n",
    "]\n",
    "\n",
    "insert_sql = f\"insert into {RUCA_TABLE} ({', '.join(cols)}) values %s\"\n",
    "\n",
    "with get_conn() as conn:\n",
    "    with conn.cursor() as cur:\n",
    "        # Rebuild the table, then bulk-insert in pages of 2000 rows.\n",
    "        cur.execute(DDL)\n",
    "        execute_values(cur, insert_sql, records, page_size=2000)\n",
    "\n",
    "        cur.execute(f'select count(*) from {RUCA_TABLE}')\n",
    "        (n_loaded,) = cur.fetchone()\n",
    "        print(f'{RUCA_TABLE}: {n_loaded:,} rows')\n",
    "\n",
    "        cur.execute(f\"\"\"\n",
    " select primary_ruca, count(*) as n\n",
    " from {RUCA_TABLE}\n",
    " group by 1 order by 1\n",
    " \"\"\")\n",
    "        print('\\nRUCA distribution (all US tracts):')\n",
    "        for ruca_code, n_tracts in cur.fetchall():\n",
    "            print(f' {ruca_code}: {n_tracts:>6,}')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Range check on the EIA generator coordinates.\n",
    "# Prior note: EIA source data had a longitude sign error. Verify before spatial joining.\n",
    "EIA_COORD_SQL = \"\"\"\n",
    "select\n",
    " min(longitude) as lon_min, max(longitude) as lon_max,\n",
    " min(latitude) as lat_min, max(latitude) as lat_max,\n",
    " count(*) filter (where longitude > 0) as pos_lon_rows,\n",
    " count(*) filter (where longitude < 0) as neg_lon_rows,\n",
    " count(*) as total_rows\n",
    "from public.energy_eia_operating_generator_capacity_flat\n",
    "where longitude is not null and latitude is not null\n",
    "\"\"\"\n",
    "\n",
    "with get_conn() as conn:\n",
    "    eia_coord_df = pd.read_sql(sql=EIA_COORD_SQL, con=conn)\n",
    "\n",
    "# Single-row result, transposed so each metric prints on its own line.\n",
    "print(eia_coord_df.transpose())\n",
    "# For US plants we expect longitude in roughly [-180, -65]. 
If pos_lon_rows is large,\n",
    "# the sign-flip correction is still needed when spatial-joining.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assemble the per-data-center analysis frame.\n",
    "#\n",
    "# Joins:\n",
    "#   master_data_centers (m)\n",
    "#     LEFT JOIN master_data_center_spatial_clusters (c) ON master_id\n",
    "#     LEFT JOIN _dc_census_tract_acs_2024 (acs) ON m.geoid = acs.geoid\n",
    "#     LEFT JOIN ruca_codes_2020_tract (ruca) ON m.geoid = ruca.tract_fips_20\n",
    "#     LEFT JOIN (EIA operating generators within RADIUS_KM, latest period) aggregated per DC\n",
    "#\n",
    "# Energy aggregation: latest period, status='OP', sum of nameplate_capacity_mw\n",
    "# (and counts) within RADIUS_KM, broken out by fuel.\n",
    "\n",
    "RADIUS_KM = 50\n",
    "\n",
    "# RADIUS_KM is interpolated into the st_dwithin distance (km * 1000).\n",
    "JOIN_SQL = f\"\"\"\n",
    "with latest_period as (\n",
    " select max(period) as period\n",
    " from public.energy_eia_operating_generator_capacity_flat\n",
    "),\n",
    "eia_latest as (\n",
    " select e.plant_id, e.generator_id, e.energy_source_code,\n",
    " e.nameplate_capacity_mw, e.geom\n",
    " from public.energy_eia_operating_generator_capacity_flat e\n",
    " join latest_period lp on e.period = lp.period\n",
    " where e.status = 'OP' and e.geom is not null\n",
    "),\n",
    "energy_nearby as (\n",
    " select\n",
    " m.master_id,\n",
    " count(*) as eia_gen_count,\n",
    " count(distinct plant_id) as eia_plant_count,\n",
    " sum(nameplate_capacity_mw) as eia_capacity_mw,\n",
    " sum(nameplate_capacity_mw) filter (where energy_source_code = 'NG') as eia_capacity_ng,\n",
    " sum(nameplate_capacity_mw) filter (where energy_source_code in ('BIT','SUB','LIG','RC','ANT')) as eia_capacity_coal,\n",
    " sum(nameplate_capacity_mw) filter (where energy_source_code = 'NUC') as eia_capacity_nuclear,\n",
    " sum(nameplate_capacity_mw) filter (where energy_source_code = 'SUN') as eia_capacity_solar,\n",
    " sum(nameplate_capacity_mw) filter (where energy_source_code = 'WND') as eia_capacity_wind,\n",
    " sum(nameplate_capacity_mw) filter (where energy_source_code = 'WAT') as eia_capacity_hydro,\n",
    " sum(nameplate_capacity_mw) filter (where energy_source_code = 'GEO') as eia_capacity_geothermal\n",
    " from public.master_data_centers m\n",
    " join eia_latest e\n",
    " on st_dwithin(m.geom::geography, e.geom::geography, {RADIUS_KM} * 1000)\n",
    " where m.geom is not null\n",
    " group by m.master_id\n",
    ")\n",
    "select\n",
    " m.master_id, m.name, m.operator, m.city, m.state, m.country,\n",
    " m.power_mw, m.area_sqft, m.longitude, m.latitude, m.geoid,\n",
    " c.cluster_id, c.is_noise, c.nearest_neighbor_km,\n",
    " acs.population, acs.median_age, acs.households, acs.avg_household_size,\n",
    " acs.median_household_income, acs.per_capita_income,\n",
    " acs.poverty_rate, acs.unemployment_rate,\n",
    " acs.bachelor_or_higher_pct, acs.broadband_subscription_pct,\n",
    " acs.hispanic_latino_pct, acs.hispanic_latino_population,\n",
    " acs.non_hispanic_white_pct, acs.non_hispanic_white_population,\n",
    " acs.non_hispanic_black_pct, acs.non_hispanic_black_population,\n",
    " acs.non_hispanic_asian_pct, acs.non_hispanic_asian_population,\n",
    " acs.primary_industry, acs.primary_industry_pct,\n",
    " ruca.primary_ruca, ruca.primary_ruca_description,\n",
    " ruca.urban_core, ruca.urban_core_type,\n",
    " ruca.pop_density as tract_pop_density,\n",
    " ruca.land_area as tract_land_area_sqmi,\n",
    " coalesce(en.eia_gen_count, 0) as eia_gen_count,\n",
    " coalesce(en.eia_plant_count, 0) as eia_plant_count,\n",
    " coalesce(en.eia_capacity_mw, 0) as eia_capacity_mw,\n",
    " coalesce(en.eia_capacity_ng, 0) as eia_capacity_ng,\n",
    " coalesce(en.eia_capacity_coal, 0) as eia_capacity_coal,\n",
    " coalesce(en.eia_capacity_nuclear, 0) as eia_capacity_nuclear,\n",
    " coalesce(en.eia_capacity_solar, 0) as eia_capacity_solar,\n",
    " coalesce(en.eia_capacity_wind, 0) as eia_capacity_wind,\n",
    " coalesce(en.eia_capacity_hydro, 0) as eia_capacity_hydro,\n",
    " coalesce(en.eia_capacity_geothermal, 0) as eia_capacity_geothermal\n",
    "from public.master_data_centers m\n",
    "left join public.master_data_center_spatial_clusters c on c.master_id = m.master_id\n",
    "left join public._dc_census_tract_acs_2024 acs on acs.geoid = m.geoid\n",
    "left join public.ruca_codes_2020_tract ruca on ruca.tract_fips_20 = m.geoid\n",
    "left join energy_nearby en on en.master_id = m.master_id\n",
    "\"\"\"\n",
    "\n",
    "with get_conn() as conn:\n",
    "    joined_df = pd.read_sql(sql=JOIN_SQL, con=conn)\n",
    "\n",
    "# Join coverage: how many rows picked up a tract, a cluster and a RUCA code.\n",
    "n_rows, n_cols = joined_df.shape\n",
    "print(f'rows: {n_rows:,} cols: {n_cols}')\n",
    "coverage_checks = [\n",
    "    ('non-null geoid: ', 'geoid'),\n",
    "    ('non-null cluster_id: ', 'cluster_id'),\n",
    "    ('non-null primary_ruca: ', 'primary_ruca'),\n",
    "]\n",
    "for check_label, check_col in coverage_checks:\n",
    "    print(check_label, joined_df[check_col].notna().sum())\n",
    "print('DCs with >=1 nearby gen: ', joined_df['eia_gen_count'].gt(0).sum())\n",
    "\n",
    "# Spread of generation capacity found within RADIUS_KM of each DC.\n",
    "nearby_mw = joined_df['eia_capacity_mw']\n",
    "print(f\"median nearby capacity: {nearby_mw.median():,.0f} MW\")\n",
    "print(f\" 90th percentile: {nearby_mw.quantile(0.9):,.0f} MW\")\n",
    "print(f\" max: {nearby_mw.max():,.0f} MW\")\n",
    "joined_df.head()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8",
   "metadata": {},
   "source": [
    "## Quick demographic analysis\n",
    "\n",
    "The joined dataset has one row per data center, enriched with the demographics of its containing census tract. Note that multiple DCs can share a tract, so tract-level stats are weighted by DC count in these summaries (i.e. \"the average DC sits in a tract with...\" rather than \"the average DC-hosting tract has...\").\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Demographic snapshot of the tract around the average data center.\n",
    "demo_cols = [\n",
    "    'population', 'median_age', 'avg_household_size',\n",
    "    'median_household_income', 'per_capita_income',\n",
    "    'poverty_rate', 'unemployment_rate',\n",
    "    'bachelor_or_higher_pct', 'broadband_subscription_pct',\n",
    "    'hispanic_latino_pct', 'non_hispanic_white_pct',\n",
    "    'non_hispanic_black_pct', 'non_hispanic_asian_pct',\n",
    "]\n",
    "# Keep only the columns the join actually returned.\n",
    "available_cols = set(joined_df.columns)\n",
    "demo_cols = [name for name in demo_cols if name in available_cols]\n",
    "\n",
    "summary = (\n",
    "    joined_df[demo_cols]\n",
    "    .agg(['count', 'mean', 'median', 'std', 'min', 'max'])\n",
    "    .round(2)\n",
    "    .T\n",
    "    .rename(columns={'count': 'n'})\n",
    ")\n",
    "\n",
    "# US national benchmarks (ACS 5-yr ~2024) for context\n",
    "benchmarks = {\n",
    "    'median_household_income': 78_538,\n",
    "    'per_capita_income': 43_313,\n",
    "    'poverty_rate': 12.4,\n",
    "    'unemployment_rate': 5.4,\n",
    "    'bachelor_or_higher_pct': 35.0,\n",
    "    'broadband_subscription_pct': 89.0,\n",
    "    'hispanic_latino_pct': 19.5,\n",
    "    'non_hispanic_white_pct': 58.4,\n",
    "    'non_hispanic_black_pct': 12.1,\n",
    "    'non_hispanic_asian_pct': 6.4,\n",
    "}\n",
    "# Rows without a benchmark get NaN in both derived columns.\n",
    "us_avg = pd.Series(benchmarks).reindex(summary.index)\n",
    "summary['us_avg'] = us_avg\n",
    "summary['vs_us'] = (summary['mean'] - us_avg).round(2)\n",
    "summary\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-state rollup: DC counts, stated power, and the median tract profile.\n",
    "state_aggs = {\n",
    "    'dc_count': ('master_id', 'count'),\n",
    "    'avg_power_mw': ('power_mw', 'mean'),\n",
    "    'total_power_mw': ('power_mw', 'sum'),\n",
    "    'median_hh_income': ('median_household_income', 'median'),\n",
    "    'median_poverty': ('poverty_rate', 'median'),\n",
    "    'median_bachelor_pct': ('bachelor_or_higher_pct', 'median'),\n",
    "    'median_broadband_pct': ('broadband_subscription_pct', 'median'),\n",
    "    'median_pct_white': ('non_hispanic_white_pct', 'median'),\n",
    "    'median_pct_hispanic': ('hispanic_latino_pct', 'median'),\n",
    "    'median_pct_black': ('non_hispanic_black_pct', 'median'),\n",
    "}\n",
    "by_state = joined_df.groupby('state', dropna=False).agg(**state_aggs)\n",
    "state_summary = by_state.sort_values('dc_count', ascending=False).round(1)\n",
    "\n",
    "n_states = joined_df['state'].nunique()\n",
    "print(f'{n_states} states/territories represented')\n",
    "state_summary.head(15)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clustered vs. isolated DCs: do they sit in different demographic settings?\n",
    "# (cluster_id is null/is_noise=True for unclustered DCs.)\n",
    "has_cluster_id = joined_df['cluster_id'].notna()\n",
    "not_noise = joined_df['is_noise'] != True\n",
    "joined_df['in_cluster'] = has_cluster_id & not_noise\n",
    "\n",
    "compare_cols = [\n",
    "    'median_household_income', 'per_capita_income',\n",
    "    'poverty_rate', 'bachelor_or_higher_pct', 'broadband_subscription_pct',\n",
    "    'non_hispanic_white_pct', 'hispanic_latino_pct', 'non_hispanic_black_pct',\n",
    "    'population', 'eia_gen_count',\n",
    "]\n",
    "compare_cols = [name for name in compare_cols if name in joined_df.columns]\n",
    "\n",
    "# Medians per group, one row per metric, one column per group.\n",
    "group_medians = joined_df.groupby('in_cluster')[compare_cols].median()\n",
    "cluster_compare = group_medians.round(1).T.rename(\n",
    "    columns={False: 'isolated', True: 'in_cluster'}\n",
    ")\n",
    "cluster_compare['delta'] = (cluster_compare['in_cluster'] - cluster_compare['isolated']).round(1)\n",
    "\n",
    "n_in_cluster = joined_df['in_cluster'].sum()\n",
    "n_isolated = (~joined_df['in_cluster']).sum()\n",
    "print(f\"DCs in a cluster: {n_in_cluster:,} isolated: {n_isolated:,}\")\n",
    "cluster_compare\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Histograms of key tract-level features across DCs, each with the DC median\n",
    "# and the US-average benchmark drawn as reference lines.\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# (column, US benchmark, panel title)\n",
    "panels = [\n",
    "    ('median_household_income', 78_538, 'Median household income (USD)'),\n",
    "    ('poverty_rate', 12.4, 'Poverty rate (%)'),\n",
    "    ('bachelor_or_higher_pct', 35.0, \"Bachelor's degree or higher (%)\"),\n",
    "    ('broadband_subscription_pct', 89.0, 'Broadband subscription (%)'),\n",
    "    ('non_hispanic_white_pct', 58.4, 'Non-Hispanic white (%)'),\n",
    "    ('hispanic_latino_pct', 19.5, 'Hispanic/Latino (%)'),\n",
    "]\n",
    "panels = [panel for panel in panels if panel[0] in joined_df.columns]\n",
    "\n",
    "fig, axes = plt.subplots(2, 3, figsize=(15, 8))\n",
    "for panel_ax, (column, us_avg_value, title) in zip(axes.flat, panels):\n",
    "    values = joined_df[column].dropna()\n",
    "    dc_median = values.median()\n",
    "    panel_ax.hist(values, bins=40, color='steelblue', edgecolor='white', alpha=0.85)\n",
    "    panel_ax.axvline(dc_median, color='darkorange', linestyle='-', lw=2, label=f'DC median = {dc_median:.1f}')\n",
    "    panel_ax.axvline(us_avg_value, color='firebrick', linestyle='--', lw=2, label=f'US avg = {us_avg_value}')\n",
    "    panel_ax.set_title(title)\n",
    "    panel_ax.legend(fontsize=8)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13",
   "metadata": {},
   "source": [
    "## RUCA (urban / rural) analysis\n",
    "\n",
    "RUCA primary code key:\n",
    "- **1**: Metropolitan area core\n",
    "- **2**: Metropolitan area high commuting\n",
    "- **3**: Metropolitan area low commuting\n",
    "- **4–6**: Micropolitan area (small city + commuting tracts)\n",
    "- **7–9**: Small town (core + commuting tracts)\n",
    "- **10**: Rural area\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DC distribution across RUCA codes vs. 
the national baseline of all US tracts.\n",
    "ruca_buckets = {\n",
    "    1: 'Metro core', 2: 'Metro high-commute', 3: 'Metro low-commute',\n",
    "    4: 'Micro core', 5: 'Micro high-commute', 6: 'Micro low-commute',\n",
    "    7: 'Small town core', 8: 'Small town high-commute', 9: 'Small town low-commute',\n",
    "    10: 'Rural',\n",
    "}\n",
    "\n",
    "\n",
    "def ruca_band(r):\n",
    "    \"\"\"Collapse a primary RUCA code into its coarse urban/rural band.\n",
    "\n",
    "    1-3 -> 'Metropolitan', 4-6 -> 'Micropolitan', 7-9 -> 'Small town',\n",
    "    10 -> 'Rural'. Missing values and any code outside 1-10 (for example a\n",
    "    99-style placeholder) return 'Unknown'. This mirrors the CASE expression\n",
    "    in NATIONAL_SQL below, so the DC shares and the national tract shares are\n",
    "    bucketed by the same rule.\n",
    "    \"\"\"\n",
    "    if pd.isna(r):\n",
    "        return 'Unknown'\n",
    "    r = int(r)\n",
    "    if 1 <= r <= 3:\n",
    "        return 'Metropolitan'\n",
    "    if 4 <= r <= 6:\n",
    "        return 'Micropolitan'\n",
    "    if 7 <= r <= 9:\n",
    "        return 'Small town'\n",
    "    if r == 10:\n",
    "        return 'Rural'\n",
    "    # Out-of-range codes used to fall through to 'Rural' (or 'Metropolitan' for <= 0).\n",
    "    return 'Unknown'\n",
    "\n",
    "\n",
    "# Work on a copy so the band/label columns do not leak into joined_df.\n",
    "dc_ruca = joined_df.copy()\n",
    "dc_ruca['ruca_label'] = dc_ruca['primary_ruca'].map(ruca_buckets)\n",
    "dc_ruca['ruca_band'] = dc_ruca['primary_ruca'].apply(ruca_band)\n",
    "\n",
    "# National baseline (share of US tracts in each band).\n",
    "NATIONAL_SQL = \"\"\"\n",
    "select\n",
    " case\n",
    " when primary_ruca between 1 and 3 then 'Metropolitan'\n",
    " when primary_ruca between 4 and 6 then 'Micropolitan'\n",
    " when primary_ruca between 7 and 9 then 'Small town'\n",
    " when primary_ruca = 10 then 'Rural'\n",
    " else 'Unknown'\n",
    " end as ruca_band,\n",
    " count(*) as tracts\n",
    "from public.ruca_codes_2020_tract\n",
    "group by 1\n",
    "\"\"\"\n",
    "with get_conn() as conn:\n",
    "    national_df = pd.read_sql(NATIONAL_SQL, conn)\n",
    "national_df['tracts_pct'] = (100 * national_df['tracts'] / national_df['tracts'].sum()).round(1)\n",
    "\n",
    "# Share of DCs per band, then side by side with the national tract shares.\n",
    "dc_by_band = (\n",
    "    dc_ruca.groupby('ruca_band').size().rename('dcs').to_frame()\n",
    "    .assign(dcs_pct=lambda d: (100 * d['dcs'] / d['dcs'].sum()).round(1))\n",
    ")\n",
    "band_compare = dc_by_band.join(national_df.set_index('ruca_band')[['tracts', 'tracts_pct']])\n",
    "# over_index > 1 means the band holds a larger share of DCs than of US tracts.\n",
    "band_compare['over_index'] = (band_compare['dcs_pct'] / band_compare['tracts_pct']).round(2)\n",
    "print('Data centers vs. all US tracts, by RUCA band:')\n",
    "print(band_compare.reindex(['Metropolitan', 'Micropolitan', 'Small town', 'Rural', 'Unknown']).fillna(0))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fine-grained RUCA breakdown: DC count, median power, demographics, energy infra\n",
    "# at each of the 10 RUCA codes.\n",
    "ruca_profile = (\n",
    "    dc_ruca.groupby('primary_ruca', dropna=False)\n",
    "    .agg(\n",
    "        dcs=('master_id', 'count'),\n",
    "        median_power_mw=('power_mw', 'median'),\n",
    "        total_power_mw=('power_mw', 'sum'),\n",
    "        med_hh_income=('median_household_income', 'median'),\n",
    "        med_poverty=('poverty_rate', 'median'),\n",
    "        med_bachelor_pct=('bachelor_or_higher_pct', 'median'),\n",
    "        med_pct_white=('non_hispanic_white_pct', 'median'),\n",
    "        med_pct_black=('non_hispanic_black_pct', 'median'),\n",
    "        med_pct_hispanic=('hispanic_latino_pct', 'median'),\n",
    "        med_pop_density=('tract_pop_density', 'median'),\n",
    "        med_eia_gens_50km=('eia_gen_count', 'median'),\n",
    "    )\n",
    "    .round(1)\n",
    ")\n",
    "# Codes without an entry in ruca_buckets (including the null group) get a NaN description.\n",
    "ruca_profile.insert(0, 'description', ruca_profile.index.map(ruca_buckets))\n",
    "print('Per-RUCA-code profile of data centers:')\n",
    "ruca_profile\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot: DC count by RUCA band vs. 
national tract share.\n",
    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
    "\n",
    "order = ['Metropolitan', 'Micropolitan', 'Small town', 'Rural']\n",
    "plot_df = band_compare.reindex(order).fillna(0)\n",
    "\n",
    "# Left panel: paired bars, DC share next to national tract share.\n",
    "ax = axes[0]\n",
    "x = range(len(plot_df))\n",
    "width = 0.38\n",
    "ax.bar([i - width/2 for i in x], plot_df['dcs_pct'], width, label='Data centers', color='steelblue')\n",
    "ax.bar([i + width/2 for i in x], plot_df['tracts_pct'], width, label='All US tracts', color='lightgray', edgecolor='gray')\n",
    "ax.set_xticks(list(x))\n",
    "ax.set_xticklabels(plot_df.index, rotation=15)\n",
    "ax.set_ylabel('% of total')\n",
    "ax.set_title('DC share vs. national tract share, by RUCA band')\n",
    "ax.legend()\n",
    "\n",
    "# Right panel: over-index ratio, red where DCs are over-represented (> 1).\n",
    "ax = axes[1]\n",
    "colors = ['firebrick' if v > 1 else 'steelblue' for v in plot_df['over_index']]\n",
    "ax.barh(plot_df.index, plot_df['over_index'], color=colors)\n",
    "ax.axvline(1.0, color='black', linestyle='--', lw=1)\n",
    "ax.set_xlabel('Over-index (1.0 = at parity with national)')\n",
    "ax.set_title('How much DCs over- or under-represent each RUCA band')\n",
    "for i, v in enumerate(plot_df['over_index']):\n",
    "    ax.text(v, i, f' {v:.2f}x', va='center')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The non-metro tail: who's building in rural / small-town / micropolitan tracts?\n",
    "# These are often the most interesting builds (hyperscale greenfield, low-cost power).\n",
    "nonmetro = dc_ruca[dc_ruca['ruca_band'].isin(['Rural', 'Small town', 'Micropolitan'])].copy()\n",
    "\n",
    "print(f'Non-metro DCs: {len(nonmetro):,}\\n')\n",
    "\n",
    "# Top operators in non-metro tracts.\n",
    "print('Top operators in non-metro tracts:')\n",
    "top_ops = (\n",
    "    nonmetro.groupby('operator', dropna=False)\n",
    "    .agg(dcs=('master_id', 'count'),\n",
    "         total_power_mw=('power_mw', 'sum'),\n",
    "         median_power_mw=('power_mw', 'median'))\n",
    "    .sort_values('dcs', ascending=False)\n",
    "    .head(15)\n",
    "    .round(1)\n",
    ")\n",
    "print(top_ops, '\\n')\n",
    "\n",
    "# Top states in non-metro tracts.\n",
    "print('Top states for non-metro DCs:')\n",
    "top_states = (\n",
    "    nonmetro.groupby('state', dropna=False)\n",
    "    .agg(dcs=('master_id', 'count'),\n",
    "         total_power_mw=('power_mw', 'sum'),\n",
    "         med_pop_density=('tract_pop_density', 'median'))\n",
    "    .sort_values('dcs', ascending=False)\n",
    "    .head(10)\n",
    "    .round(1)\n",
    ")\n",
    "print(top_states, '\\n')\n",
    "\n",
    "# The biggest non-metro builds by power.\n",
    "print('Largest non-metro DCs by stated power_mw:')\n",
    "big_nonmetro = (\n",
    "    nonmetro.dropna(subset=['power_mw'])\n",
    "    .nlargest(15, 'power_mw')\n",
    "    [['name', 'operator', 'city', 'state', 'power_mw',\n",
    "      'primary_ruca', 'primary_ruca_description', 'tract_pop_density']]\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "big_nonmetro\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18",
   "metadata": {},
   "outputs": [],
   "source": [
    "# power_mw coverage across the DC universe — most rows are null, which is why\n",
    "# \"biggest non-metro by power\" surfaced only a handful.\n",
    "coverage = (\n",
    "    dc_ruca.assign(has_power=dc_ruca['power_mw'].notna())\n",
    "    .groupby('ruca_band', dropna=False)\n",
    "    .agg(dcs=('master_id', 'count'),\n",
    "         with_power_mw=('has_power', 'sum'))\n",
    ")\n",
    "coverage['pct_with_power'] = (100 * coverage['with_power_mw'] / coverage['dcs']).round(1)\n",
    "print('power_mw coverage by RUCA band:')\n",
    "print(coverage)\n",
    "print(f\"\\nOverall: {dc_ruca['power_mw'].notna().sum():,} / {len(dc_ruca):,} DCs have power_mw \"\n",
    "      f\"({100*dc_ruca['power_mw'].notna().mean():.1f}%)\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now that EIA nameplate_capacity_mw is loaded, size non-metro DCs by the\n",
    "# generation capacity within 50 km of each site (instead of the sparse power_mw).\n",
    "# Re-derive non-metro slice from the updated joined_df.\n",
    "#\n",
    "# Non-metro = micropolitan, small-town and rural tracts, i.e. primary RUCA 4-10.\n",
    "# Codes 2-3 are metropolitan commuting tracts; including them here (as the\n",
    "# earlier 2-10 filter did) mixed metro DCs into every 'non-metro' table below\n",
    "# and disagreed with the band-based non-metro slice used above.\n",
    "NONMETRO_RUCA_CODES = [4, 5, 6, 7, 8, 9, 10]\n",
    "nonmetro = joined_df[joined_df['primary_ruca'].isin(NONMETRO_RUCA_CODES)].copy()\n",
    "nonmetro['ruca_band'] = nonmetro['primary_ruca'].apply(ruca_band)\n",
    "\n",
    "print(f'Non-metro DCs: {len(nonmetro):,}\\n')\n",
    "\n",
    "# Largest non-metro DCs ranked by nearby grid capacity.\n",
    "big_by_grid = (\n",
    "    nonmetro.sort_values('eia_capacity_mw', ascending=False)\n",
    "    .head(20)\n",
    "    [['name', 'operator', 'city', 'state',\n",
    "      'primary_ruca', 'primary_ruca_description',\n",
    "      'eia_capacity_mw', 'eia_capacity_nuclear', 'eia_capacity_hydro',\n",
    "      'eia_capacity_ng', 'eia_capacity_coal',\n",
    "      'eia_capacity_solar', 'eia_capacity_wind']]\n",
    "    .round(0)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "print('Largest non-metro DCs by nearby grid capacity (50 km):')\n",
    "big_by_grid\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hyperscalers' non-metro footprint, sized by surrounding grid capacity + fuel mix.\n",
    "# Operator spellings are folded into one group per company; unmapped operators\n",
    "# keep their own name and missing operators become 'Unknown'.\n",
    "hyperscaler_map = {\n",
    "    'Amazon Web Services': 'AWS', 'Amazon AWS': 'AWS', 'Amazon': 'AWS',\n",
    "    'Microsoft': 'Microsoft',\n",
    "    'Meta': 'Meta', 'Meta, Inc.': 'Meta', 'Facebook': 'Meta',\n",
    "    'Google': 'Google', 'Alphabet': 'Google',\n",
    "    'Apple': 'Apple',\n",
    "    'Oracle': 'Oracle',\n",
    "    'Yahoo': 'Yahoo',\n",
    "}\n",
    "nonmetro['op_group'] = nonmetro['operator'].map(hyperscaler_map).fillna(\n",
    "    nonmetro['operator'].where(nonmetro['operator'].notna(), 'Unknown')\n",
    ")\n",
    "\n",
    "hyperscaler_view = (\n",
    "    nonmetro[nonmetro['op_group'].isin(['AWS','Microsoft','Meta','Google','Apple','Oracle','Yahoo','Unknown'])]\n",
    "    .groupby('op_group')\n",
    "    .agg(\n",
    "        dcs=('master_id', 'count'),\n",
    "        states=('state', 'nunique'),\n",
    "        sum_nearby_capacity_mw=('eia_capacity_mw', 'sum'),\n",
    "        median_nearby_capacity_mw=('eia_capacity_mw', 'median'),\n",
    "        sum_nearby_hydro_mw=('eia_capacity_hydro', 'sum'),\n",
    "        sum_nearby_nuclear_mw=('eia_capacity_nuclear', 'sum'),\n",
    "        sum_nearby_ng_mw=('eia_capacity_ng', 'sum'),\n",
    "        sum_nearby_solar_mw=('eia_capacity_solar', 'sum'),\n",
    "        sum_nearby_wind_mw=('eia_capacity_wind', 'sum'),\n",
    "    )\n",
    "    .sort_values('dcs', ascending=False)\n",
    "    .round(0)\n",
    ")\n",
    "print(\"Non-metro DCs by operator group, sized by aggregate nearby grid capacity:\")\n",
    "hyperscaler_view\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.14.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}