diff --git a/.dropboxignore b/.dropboxignore index 84e5620..5f968ba 100644 --- a/.dropboxignore +++ b/.dropboxignore @@ -5,4 +5,5 @@ __pycache__/ _pycache__/ new/ internet_cables/ -.claude/ \ No newline at end of file +.claude/ +.zip diff --git a/.gitignore b/.gitignore index b809d00..0885082 100644 --- a/.gitignore +++ b/.gitignore @@ -77,4 +77,14 @@ Thumbs.db # Data csv, json, etc. new/ -internet_cables/ \ No newline at end of file +internet_cables/ +data/ + +# Compressed files +*.zip +*.tar.gz +*.tar.bz2 +*.tar.xz +*.gz +*.bz2 +*.xz \ No newline at end of file diff --git a/historical_climate_data_centers.ipynb b/historical_climate_data_centers.ipynb index 310915b..8cf8852 100644 --- a/historical_climate_data_centers.ipynb +++ b/historical_climate_data_centers.ipynb @@ -5,31 +5,29 @@ "id": "0", "metadata": {}, "source": [ - "# Open-Meteo Historical Climate for Master Data Centers\n", + "# Historical Climate for Master Data Centers (Daymet + gridMET)\n", "\n", - "This notebook builds or refreshes `public.data_center_open_meteo_historical_climate`, a one-row-per-data-center climate summary table keyed by `master_id`.\n", + "This notebook builds or refreshes `public.data_center_historical_climate`, a one-row-per-data-center climate summary keyed by `master_id`.\n", "\n", - "Design goals:\n", - "- Joinable to `public.master_data_centers` through `master_id`.\n", - "- Rerunnable as new data centers are added to the master table.\n", - "- Uses the same `.env` and `PGWEB_*` connection pattern as `spatial_clustering_master_data_centers.ipynb`.\n", - "- Fetches metrics available from the Open-Meteo Historical Weather API now, and leaves explicit placeholders for NOAA/FEMA/smoke additions later.\n", + "**Source switch on 2026-05-20.** Previously this notebook used Open-Meteo's archive API, but a 30-year × 9-variable request consumes ~7,000 of the free 10,000 daily call budget, limiting the run to ~1 site/day. Two free, generous federal sources replace it:\n", "\n", - "Open-Meteo coverage in this notebook:\n", + "- **Daymet v4 R1** (ORNL DAAC): daily 1 km gridded `tmin`, `tmax`, `prcp`, `vp` (vapor pressure) for North America, 1980–present. Pulled via the single-pixel REST API.\n", + "- **gridMET / METDATA** (U. Idaho): daily 4 km gridded daily-mean wind speed (`vs`) for CONUS, 1979–present. Pulled via THREDDS NetCDF Subset Service as CSV (no `xarray` dependency).\n", "\n", - "| Requested variable | Notebook field(s) | Source status |\n", + "Climate normal period: **1991–2020** (standard WMO baseline).\n", + "\n", + "| Metric | Field(s) | Source / derivation |\n", "| --- | --- | --- |\n", - "| Mean annual temperature | `mean_annual_temperature_c` | Open-Meteo |\n", - "| Mean summer temperature | `mean_summer_temperature_c` | Open-Meteo |\n", - "| Wet bulb temperature | `mean_wet_bulb_temperature_c`, `max_wet_bulb_temperature_c`, `extreme_wet_bulb_days` | Open-Meteo |\n", - "| Relative humidity | `mean_relative_humidity_pct` | Open-Meteo |\n", - "| Cooling degree days | `cooling_degree_days_c`, `annual_cooling_degree_days_c_mean` | Derived from Open-Meteo |\n", - "| Extreme heat days | `extreme_heat_days`, `annual_extreme_heat_days_mean` | Derived from Open-Meteo |\n", - "| Diurnal temperature range | `mean_diurnal_temperature_range_c` | Derived from Open-Meteo |\n", - "| Precipitation variability | `precipitation_total_mm`, `annual_precipitation_cv`, `wet_day_precipitation_p95_mm` | Derived from Open-Meteo |\n", - "| Drought severity index | `drought_severity_index_source_status` | External source needed |\n", - "| Windstorm frequency | `windstorm_days`, `annual_windstorm_days_mean` | Open-Meteo proxy using wind gust threshold |\n", - "| Wildfire smoke exposure | `wildfire_smoke_exposure_source_status` | External source needed |\n" + "| Mean annual / summer temperature | `mean_annual_temperature_c`, `mean_summer_temperature_c` | Daymet `(tmin + tmax) / 2` |\n", + "| Max / min daily temperature | `max_daily_temperature_c`, `min_daily_temperature_c` | Daymet `tmax`, `tmin` |\n", + "| Diurnal temperature range | `mean_diurnal_temperature_range_c` | Daymet `tmax − tmin` |\n", + "| Relative humidity | `mean_relative_humidity_pct` | Derived `100 × vp / e_sat(T)` via Magnus formula |\n", + "| Wet bulb temperature | `mean_wet_bulb_temperature_c`, `max_wet_bulb_temperature_c`, `extreme_wet_bulb_days` | Stull (2011) approximation from T + derived RH |\n", + "| Cooling degree days | `cooling_degree_days_c`, `annual_cooling_degree_days_c_mean` | Σ max(0, T_mean − 18.3 °C) |\n", + "| Extreme heat days | `extreme_heat_days`, `annual_extreme_heat_days_mean` | Count days with `tmax ≥ 35 °C` |\n", + "| Precipitation | `precipitation_total_mm`, `annual_precipitation_mm_mean`, `annual_precipitation_cv`, `wet_day_precipitation_p95_mm` | Daymet `prcp` aggregated |\n", + "| Wind | `mean_wind_speed_ms`, `max_daily_mean_wind_speed_ms`, `sustained_wind_days`, `annual_sustained_wind_days_mean` | gridMET `vs` (daily-mean wind). **Sustained-wind day = daily mean ≥ 8 m/s** — this is *not* the same as the old Open-Meteo gust threshold; see notes at end. |\n", + "| Drought severity, wildfire smoke | `*_source_status` columns | External sources needed (USDM, gridMET PDSI, NOAA HMS) |\n" ] }, { @@ -40,11 +38,13 @@ "outputs": [], "source": [ "import json\n", + "import math\n", "import os\n", "import time\n", "import urllib.error\n", "import urllib.parse\n", "import urllib.request\n", + "from io import StringIO\n", "from pathlib import Path\n", "\n", "import numpy as np\n", @@ -72,7 +72,6 @@ " if not p.exists():\n", " print(f'No {env_path} file found in {Path.cwd()}')\n", " return\n", - "\n", " loaded = 0\n", " for raw_line in p.read_text(encoding='utf-8').splitlines():\n", " line = raw_line.strip()\n", @@ -101,8 +100,8 @@ "require_env(required_keys)\n", "\n", "DB_NAME = os.getenv('PGDATABASE', 'data_centers')\n", - "MASTER_TABLE = 'public.master_data_centers'\n", - "CLIMATE_TABLE = 'public.data_center_open_meteo_historical_climate'\n", + "MASTER_TABLE = 'public.master_data_centers'\n", + "CLIMATE_TABLE = 'public.data_center_historical_climate'\n", "\n", "\n", "def get_conn():\n", @@ -111,7 +110,7 @@ " port=os.environ['PGWEB_PORT'],\n", " user=os.environ['PGWEB_USER'],\n", " password=os.environ['PGWEB_PASSWORD'],\n", - " dbname=\"data_centers\",\n", + " dbname='data_centers',\n", " )\n", "\n", "\n", @@ -136,9 +135,11 @@ "source": [ "## Parameters\n", "\n", - "The default climate normal period is 1991-2020. Open-Meteo can go farther back, but this period is a standard baseline and keeps each API response moderate.\n", + "Climate normal period defaults to **1991–2020**, matching the current WMO baseline. Both Daymet and gridMET cover this range fully.\n", "\n", - "Set `REFRESH_EXISTING = True` when you want to recompute existing rows after changing thresholds, variables, dates, or source model. Leave it `False` to only pick up new `master_id` values added since the last run.\n" + "- `REFRESH_EXISTING=True` recomputes existing rows (e.g. after a threshold change).\n", + "- `INCLUDE_WIND=False` skips the per-site gridMET pass; wind columns stay null and `wind_status='pending_gridmet'`. Useful for a fast first pass since Daymet alone is one HTTP call per site, while gridMET is ~30 (one per year).\n", + "- `MAX_POINTS=3` for testing, `None` for all eligible sites.\n" ] }, { @@ -148,43 +149,45 @@ "metadata": {}, "outputs": [], "source": [ - "OPEN_METEO_ARCHIVE_URL = 'https://archive-api.open-meteo.com/v1/archive'\n", - "OPEN_METEO_MODEL = 'era5'\n", + "# ---- Daymet (T, precip, vapor pressure) ----\n", + "DAYMET_API_URL = 'https://daymet.ornl.gov/single-pixel/api/data'\n", + "DAYMET_VARIABLES = ['tmin', 'tmax', 'prcp', 'vp']\n", + "DAYMET_DATASET_VERSION = 'daymet_v4r1'\n", "\n", + "# ---- gridMET (daily-mean wind) ----\n", + "GRIDMET_NCSS_URL_TEMPLATE = 'https://thredds.northwestknowledge.net/thredds/ncss/MET/vs/vs_{year}.nc'\n", + "GRIDMET_VARIABLE = 'wind_speed'\n", + "GRIDMET_SCALE_FACTOR = 0.1 # NCSS CSV returns packed short ints; multiply to get m/s\n", + "GRIDMET_DATASET_VERSION = 'gridmet_vs'\n", + "\n", + "# ---- Period ----\n", "START_DATE = '1991-01-01'\n", - "END_DATE = '2020-12-31'\n", - "TIMEZONE = 'UTC'\n", + "END_DATE = '2020-12-31'\n", + "START_YEAR = int(START_DATE[:4])\n", + "END_YEAR = int(END_DATE[:4])\n", "\n", + "# ---- Run control ----\n", "REFRESH_EXISTING = False\n", - "DRY_RUN = False\n", - "MAX_POINTS = None # use a small integer for testing, or None for all eligible data centers\n", - "BATCH_SIZE = 1 # Public Open-Meteo archive limits are tight for 30-year daily histories\n", - "REQUEST_SLEEP_SECONDS = 70.0\n", + "INCLUDE_WIND = True\n", + "DRY_RUN = False\n", + "MAX_POINTS = None # small int for testing; None for all eligible\n", + "REQUEST_SLEEP_SECONDS = 0.5 # politeness pause between sites\n", "REQUEST_TIMEOUT_SECONDS = 90\n", - "MAX_RETRIES = 8\n", - "RATE_LIMIT_SLEEP_SECONDS = 120\n", + "MAX_RETRIES = 5\n", "\n", - "CDD_BASE_C = 18.3\n", - "EXTREME_HEAT_THRESHOLD_C = 35.0\n", + "# ---- Metric thresholds ----\n", + "CDD_BASE_C = 18.3\n", + "EXTREME_HEAT_THRESHOLD_C = 35.0\n", "EXTREME_WET_BULB_THRESHOLD_C = 28.0\n", - "WINDSTORM_GUST_THRESHOLD_MS = 17.2 # roughly 38.5 mph; adjust for your risk definition\n", - "WET_DAY_THRESHOLD_MM = 1.0\n", + "SUSTAINED_WIND_THRESHOLD_MS = 8.0 # ~18 mph daily-mean wind. NOT a gust threshold.\n", + "WET_DAY_THRESHOLD_MM = 1.0\n", "\n", - "DAILY_VARIABLES = [\n", - " 'temperature_2m_mean',\n", - " 'temperature_2m_max',\n", - " 'temperature_2m_min',\n", - " 'relative_humidity_2m_mean',\n", - " 'wet_bulb_temperature_2m_mean',\n", - " 'wet_bulb_temperature_2m_max',\n", - " 'precipitation_sum',\n", - " 'wind_gusts_10m_max',\n", - " 'soil_moisture_0_to_100cm_mean',\n", - "]\n", - "\n", - "print(f'Period: {START_DATE} to {END_DATE}')\n", - "print(f'Target table: {CLIMATE_TABLE}')\n", - "print('Refresh existing rows:', REFRESH_EXISTING)\n" + "print(f'Period: {START_DATE} to {END_DATE}')\n", + "print(f'Target table: {CLIMATE_TABLE}')\n", + "print(f'Daymet vars: {DAYMET_VARIABLES}')\n", + "print(f'Include wind: {INCLUDE_WIND} (gridMET sustained-wind threshold = {SUSTAINED_WIND_THRESHOLD_MS} m/s daily mean)')\n", + "print(f'Refresh: {REFRESH_EXISTING}')\n", + "print(f'Max sites: {MAX_POINTS}')\n" ] }, { @@ -215,52 +218,67 @@ " latitude double precision not null,\n", " geom geometry(Point, 4326),\n", "\n", - " open_meteo_model text not null,\n", + " daymet_dataset_version text not null,\n", + " gridmet_dataset_version text,\n", " climate_period_start date not null,\n", - " climate_period_end date not null,\n", - " api_timezone text not null,\n", - " api_grid_latitude double precision,\n", - " api_grid_longitude double precision,\n", - " api_elevation_m double precision,\n", - " api_utc_offset_seconds integer,\n", - " observation_days integer not null,\n", + " climate_period_end date not null,\n", + " daymet_tile integer,\n", + " daymet_elevation_m double precision,\n", + " daymet_grid_x_m double precision,\n", + " daymet_grid_y_m double precision,\n", + " gridmet_grid_latitude double precision,\n", + " gridmet_grid_longitude double precision,\n", + " observation_days integer not null,\n", " observation_years integer not null,\n", - " api_generationtime_ms double precision,\n", "\n", - " mean_annual_temperature_c double precision,\n", - " mean_summer_temperature_c double precision,\n", - " max_daily_temperature_c double precision,\n", - " mean_wet_bulb_temperature_c double precision,\n", - " max_wet_bulb_temperature_c double precision,\n", - " mean_relative_humidity_pct double precision,\n", - " cooling_degree_days_c double precision,\n", - " annual_cooling_degree_days_c_mean double precision,\n", - " extreme_heat_days integer,\n", - " annual_extreme_heat_days_mean double precision,\n", - " extreme_wet_bulb_days integer,\n", + " -- temperature\n", + " mean_annual_temperature_c double precision,\n", + " mean_summer_temperature_c double precision,\n", + " max_daily_temperature_c double precision,\n", + " min_daily_temperature_c double precision,\n", " mean_diurnal_temperature_range_c double precision,\n", - " precipitation_total_mm double precision,\n", - " annual_precipitation_mm_mean double precision,\n", - " annual_precipitation_cv double precision,\n", - " wet_day_precipitation_p95_mm double precision,\n", - " windstorm_days integer,\n", - " annual_windstorm_days_mean double precision,\n", - " max_wind_gust_ms double precision,\n", - " mean_soil_moisture_0_to_100cm double precision,\n", "\n", - " drought_severity_index_source_status text not null default 'needs_external_source',\n", + " -- humidity / wet-bulb (derived from Daymet T + vp)\n", + " mean_relative_humidity_pct double precision,\n", + " mean_wet_bulb_temperature_c double precision,\n", + " max_wet_bulb_temperature_c double precision,\n", + " extreme_wet_bulb_days integer,\n", + "\n", + " -- cooling load / heat extremes\n", + " cooling_degree_days_c double precision,\n", + " annual_cooling_degree_days_c_mean double precision,\n", + " extreme_heat_days integer,\n", + " annual_extreme_heat_days_mean double precision,\n", + "\n", + " -- precipitation\n", + " precipitation_total_mm double precision,\n", + " annual_precipitation_mm_mean double precision,\n", + " annual_precipitation_cv double precision,\n", + " wet_day_precipitation_p95_mm double precision,\n", + "\n", + " -- wind (gridMET daily-mean, NOT gust)\n", + " mean_wind_speed_ms double precision,\n", + " max_daily_mean_wind_speed_ms double precision,\n", + " sustained_wind_days integer,\n", + " annual_sustained_wind_days_mean double precision,\n", + " wind_status text not null default 'pending_gridmet',\n", + "\n", + " -- external-source placeholders\n", + " drought_severity_index_source_status text not null default 'needs_external_source',\n", " wildfire_smoke_exposure_source_status text not null default 'needs_external_source',\n", - " open_meteo_daily_variables text[] not null,\n", - " open_meteo_url text,\n", + "\n", + " -- provenance\n", + " daymet_variables text[] not null,\n", + " daymet_url text,\n", " fetched_at timestamptz not null default now(),\n", " updated_at timestamptz not null default now()\n", ");\n", "\n", - "create index if not exists data_center_open_meteo_hist_geom_gix\n", + "create index if not exists data_center_hist_climate_geom_gix\n", " on {CLIMATE_TABLE} using gist (geom);\n", - "create index if not exists data_center_open_meteo_hist_state_idx\n", + "create index if not exists data_center_hist_climate_state_idx\n", " on {CLIMATE_TABLE} (state);\n", - "create index if not exists data_center_open_meteo_hist_period_idx\n", + "create index if not exists data_center_hist_climate_period_idx\n", " on {CLIMATE_TABLE} (climate_period_start, climate_period_end);\n", "'''\n", "\n", @@ -289,20 +307,19 @@ "source": [ "def load_target_points(refresh_existing=False, max_points=None):\n", " exists_filter = ''\n", + " params = []\n", " if not refresh_existing:\n", " exists_filter = f'''\n", " and not exists (\n", " select 1\n", " from {CLIMATE_TABLE} c\n", " where c.master_id = m.master_id\n", - " and c.open_meteo_model = %s\n", + " and c.daymet_dataset_version = %s\n", " and c.climate_period_start = %s::date\n", - " and c.climate_period_end = %s::date\n", + " and c.climate_period_end = %s::date\n", " )\n", " '''\n", - " params = [OPEN_METEO_MODEL, START_DATE, END_DATE]\n", - " else:\n", - " params = []\n", + " params = [DAYMET_DATASET_VERSION, START_DATE, END_DATE]\n", "\n", " limit_sql = ''\n", " if max_points is not None:\n", @@ -311,20 +328,16 @@ "\n", " query = f'''\n", " select\n", - " m.master_id,\n", - " m.source,\n", - " m.name,\n", - " m.operator,\n", - " m.city,\n", - " m.state,\n", - " m.country,\n", - " m.longitude,\n", - " m.latitude,\n", - " ST_AsText(m.geom) as geom_wkt\n", + " m.master_id, m.source, m.name, m.operator,\n", + " m.city, m.state, m.country,\n", + " m.longitude, m.latitude\n", " from {MASTER_TABLE} m\n", " where m.longitude is not null\n", - " and m.latitude is not null\n", + " and m.latitude is not null\n", " and m.geom is not null\n", + " -- Daymet covers North America bounding box\n", + " and m.longitude between -178 and -52\n", + " and m.latitude between 14 and 83\n", " {exists_filter}\n", " order by m.master_id\n", " {limit_sql}\n", @@ -343,7 +356,13 @@ "id": "9", "metadata": {}, "source": [ - "## Fetch and Summarize Open-Meteo Daily Data" + "## Fetchers and derived-metric helpers\n", + "\n", + "`fetch_daymet` issues one single-pixel REST call for the full 30-year period. `fetch_gridmet_wind` issues one NCSS CSV call per year (Daymet has no equivalent multi-year aggregated endpoint we can rely on; per-year fetches keep each response small).\n", + "\n", + "Wet bulb is derived in two steps:\n", + "1. **RH from Daymet vp** using the Magnus saturation-vapor-pressure formula.\n", + "2. **Wet bulb from T + RH** using the Stull (2011) approximation (valid 5 % ≤ RH ≤ 99 %, −20 °C ≤ T ≤ +50 °C).\n" ] }, { @@ -353,225 +372,275 @@ "metadata": {}, "outputs": [], "source": [ - "def format_coordinate_param(values):\n", - " if np.isscalar(values):\n", - " return f'{float(values):.6f}'\n", - " return ','.join(f'{float(value):.6f}' for value in values)\n", - "\n", - "\n", - "def build_open_meteo_url(latitude, longitude):\n", - " params = {\n", - " 'latitude': format_coordinate_param(latitude),\n", - " 'longitude': format_coordinate_param(longitude),\n", - " 'start_date': START_DATE,\n", - " 'end_date': END_DATE,\n", - " 'daily': ','.join(DAILY_VARIABLES),\n", - " 'temperature_unit': 'celsius',\n", - " 'wind_speed_unit': 'ms',\n", - " 'precipitation_unit': 'mm',\n", - " 'timezone': TIMEZONE,\n", - " 'models': OPEN_METEO_MODEL,\n", - " }\n", - " return OPEN_METEO_ARCHIVE_URL + '?' + urllib.parse.urlencode(params, safe=',')\n", - "\n", - "\n", - "def retry_after_seconds(exc):\n", - " value = exc.headers.get('Retry-After') if getattr(exc, 'headers', None) else None\n", - " if value:\n", - " try:\n", - " return max(1, int(value))\n", - " except ValueError:\n", - " pass\n", - " return None\n", - "\n", - "\n", - "def fetch_json(url):\n", + "def http_get(url, max_retries=MAX_RETRIES, timeout=REQUEST_TIMEOUT_SECONDS):\n", " last_error = None\n", - " for attempt in range(1, MAX_RETRIES + 1):\n", + " for attempt in range(1, max_retries + 1):\n", " try:\n", - " request = urllib.request.Request(url, headers={'User-Agent': 'data-center-climate-notebook/1.0'})\n", - " with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT_SECONDS) as response:\n", - " return json.loads(response.read().decode('utf-8'))\n", + " req = urllib.request.Request(url, headers={'User-Agent': 'data-center-climate-notebook/1.1'})\n", + " with urllib.request.urlopen(req, timeout=timeout) as resp:\n", + " return resp.read().decode('utf-8')\n", " except urllib.error.HTTPError as exc:\n", " last_error = exc\n", " try:\n", - " error_body = exc.read().decode('utf-8')\n", + " body = exc.read().decode('utf-8', errors='replace')[:200]\n", " except Exception:\n", - " error_body = ''\n", - " if exc.code == 400:\n", - " raise RuntimeError(f'HTTP 400 Bad Request: {error_body or exc.reason}')\n", - " if exc.code == 429:\n", - " if 'hourly api request limit exceeded' in error_body.lower():\n", - " raise RuntimeError(f'HTTP 429 Hourly API limit: {error_body}')\n", - " sleep_for = retry_after_seconds(exc) or min(RATE_LIMIT_SLEEP_SECONDS * attempt, 900)\n", - " detail = f': {error_body}' if error_body else ''\n", - " print(f'Open-Meteo rate limit on attempt {attempt}/{MAX_RETRIES}{detail}; sleeping {sleep_for}s')\n", - " else:\n", - " sleep_for = min(2 ** attempt, 60)\n", - " print(f'HTTP error on attempt {attempt}/{MAX_RETRIES}: {exc}; sleeping {sleep_for}s')\n", + " body = ''\n", + " if exc.code in (400, 404):\n", + " raise RuntimeError(f'HTTP {exc.code} on {url}: {body or exc.reason}')\n", + " sleep_for = min(2 ** attempt, 60)\n", + " print(f' HTTP {exc.code} attempt {attempt}/{max_retries}: {body}; sleeping {sleep_for}s')\n", " time.sleep(sleep_for)\n", - " except (urllib.error.URLError, TimeoutError, json.JSONDecodeError) as exc:\n", + " except (urllib.error.URLError, TimeoutError) as exc:\n", " last_error = exc\n", " sleep_for = min(2 ** attempt, 60)\n", - " print(f'Fetch failed on attempt {attempt}/{MAX_RETRIES}: {exc}; sleeping {sleep_for}s')\n", + " print(f' Network error attempt {attempt}/{max_retries}: {exc}; sleeping {sleep_for}s')\n", " time.sleep(sleep_for)\n", - " raise RuntimeError(f'Open-Meteo request failed after {MAX_RETRIES} attempt(s): {last_error}')\n", + " raise RuntimeError(f'Request failed after {max_retries} attempt(s) for {url}: {last_error}')\n", "\n", "\n", - "def summarize_daily_weather(payload, dc_row, request_url):\n", - " daily = payload.get('daily', {})\n", - " if 'time' not in daily:\n", - " reason = payload.get('reason') or 'Open-Meteo response did not include daily.time'\n", - " raise RuntimeError(reason)\n", + "def fetch_daymet(lat, lon):\n", + " \"\"\"Return (header_dict, dataframe) for one point covering START_DATE..END_DATE.\"\"\"\n", + " params = {\n", + " 'lat': f'{float(lat):.6f}',\n", + " 'lon': f'{float(lon):.6f}',\n", + " 'vars': ','.join(DAYMET_VARIABLES),\n", + " 'start': START_DATE,\n", + " 'end': END_DATE,\n", + " 'format': 'csv',\n", + " }\n", + " url = DAYMET_API_URL + '?' + urllib.parse.urlencode(params)\n", + " text = http_get(url)\n", + " # Daymet CSV preamble: Latitude / Longitude / X & Y / Tile / Elevation / \"All years; ...\" / cite line\n", + " lines = text.splitlines()\n", + " header = {'url': url}\n", + " data_start = None\n", + " for i, line in enumerate(lines):\n", + " if line.strip().startswith('year,'):\n", + " data_start = i\n", + " break\n", + " if ':' in line:\n", + " k, v = line.split(':', 1)\n", + " header[k.strip()] = v.strip()\n", + " if data_start is None:\n", + " raise RuntimeError(f'Daymet CSV missing column header. First 500 chars: {text[:500]}')\n", + " df = pd.read_csv(StringIO('\\n'.join(lines[data_start:])))\n", + " # Normalize column names: 'prcp (mm/day)' -> 'prcp', etc.\n", + " df.columns = [c.split(' ')[0] for c in df.columns]\n", + " # Daymet uses 365-day calendar (Dec 31 dropped on leap years), yday is 1..365\n", + " df['date'] = pd.to_datetime(df['year'].astype(str), format='%Y') + pd.to_timedelta(df['yday'] - 1, unit='D')\n", + " return header, df\n", "\n", - " weather = pd.DataFrame(daily)\n", - " weather['time'] = pd.to_datetime(weather['time'])\n", - " weather['year'] = weather['time'].dt.year\n", - " weather['month'] = weather['time'].dt.month\n", "\n", - " numeric_cols = [c for c in weather.columns if c not in {'time', 'year', 'month'}]\n", - " for col in numeric_cols:\n", - " weather[col] = pd.to_numeric(weather[col], errors='coerce')\n", + "# gridMET 4 km grid (from THREDDS dataset.xml): lat origin 49.4°N decreasing,\n", + "# lon origin -124.7667°E increasing, step 0.04166667°. Two sites in the same cell\n", + "# yield the same (row, col) and share one set of 30 yearly NCSS fetches.\n", + "GRIDMET_LAT_ORIGIN = 49.400000000000006\n", + "GRIDMET_LAT_STEP = -0.041666666666671404\n", + "GRIDMET_LON_ORIGIN = -124.76666663333334\n", + "GRIDMET_LON_STEP = 0.041666666666671404\n", "\n", - " weather['cooling_degree_day_c'] = (weather['temperature_2m_mean'] - CDD_BASE_C).clip(lower=0)\n", - " weather['extreme_heat_day'] = weather['temperature_2m_max'] >= EXTREME_HEAT_THRESHOLD_C\n", - " weather['extreme_wet_bulb_day'] = weather['wet_bulb_temperature_2m_max'] >= EXTREME_WET_BULB_THRESHOLD_C\n", - " weather['diurnal_temperature_range_c'] = weather['temperature_2m_max'] - weather['temperature_2m_min']\n", - " weather['windstorm_day'] = weather['wind_gusts_10m_max'] >= WINDSTORM_GUST_THRESHOLD_MS\n", - " wet_days = weather.loc[weather['precipitation_sum'] >= WET_DAY_THRESHOLD_MM, 'precipitation_sum']\n", + "GRIDMET_WIND_CACHE = {}\n", + "_GRIDMET_CACHE_STATS = {'hits': 0, 'misses': 0}\n", "\n", - " annual = (\n", - " weather.groupby('year', as_index=False)\n", - " .agg(\n", - " annual_mean_temperature_c=('temperature_2m_mean', 'mean'),\n", - " annual_cooling_degree_days_c=('cooling_degree_day_c', 'sum'),\n", - " annual_extreme_heat_days=('extreme_heat_day', 'sum'),\n", - " annual_precipitation_mm=('precipitation_sum', 'sum'),\n", - " annual_windstorm_days=('windstorm_day', 'sum'),\n", - " )\n", + "\n", + "def gridmet_cell_key(lat, lon):\n", + " row = int(round((float(lat) - GRIDMET_LAT_ORIGIN) / GRIDMET_LAT_STEP))\n", + " col = int(round((float(lon) - GRIDMET_LON_ORIGIN) / GRIDMET_LON_STEP))\n", + " return (row, col)\n", + "\n", + "\n", + "# gridMET CONUS coverage bounds (from THREDDS dataset.xml projectionBox).\n", + "# Sites outside this box (AK, HI, PR, territories) cannot get gridMET wind.\n", + "GRIDMET_CONUS_LAT_MIN = 25.04583333333333\n", + "GRIDMET_CONUS_LAT_MAX = 49.420833333333334\n", + "GRIDMET_CONUS_LON_MIN = -124.78749996666667\n", + "GRIDMET_CONUS_LON_MAX = -67.03749996666669\n", + "\n", + "\n", + "def is_in_gridmet_conus(lat, lon):\n", + " return (GRIDMET_CONUS_LAT_MIN <= float(lat) <= GRIDMET_CONUS_LAT_MAX\n", + " and GRIDMET_CONUS_LON_MIN <= float(lon) <= GRIDMET_CONUS_LON_MAX)\n", + "\n", + "\n", + "def fetch_gridmet_wind(lat, lon):\n", + " \"\"\"Return (meta_dict, dataframe[date, wind_speed_ms]) covering START_YEAR..END_YEAR.\n", + "\n", + " Cached by gridMET 4 km cell — sites in the same cell share one set of fetches.\n", + " Cache miss issues one NCSS CSV request per year (~30 requests, ~3 KB each).\"\"\"\n", + " cache_key = gridmet_cell_key(lat, lon)\n", + " cached = GRIDMET_WIND_CACHE.get(cache_key)\n", + " if cached is not None:\n", + " _GRIDMET_CACHE_STATS['hits'] += 1\n", + " return cached\n", + " _GRIDMET_CACHE_STATS['misses'] += 1\n", + "\n", + " frames = []\n", + " grid_lat = grid_lon = None\n", + " for year in range(START_YEAR, END_YEAR + 1):\n", + " url = (GRIDMET_NCSS_URL_TEMPLATE.format(year=year) +\n", + " f'?var={GRIDMET_VARIABLE}'\n", + " f'&latitude={float(lat):.6f}&longitude={float(lon):.6f}'\n", + " f'&time_start={year}-01-01T00:00:00Z&time_end={year}-12-31T23:59:59Z'\n", + " f'&accept=csv')\n", + " text = http_get(url)\n", + " df = pd.read_csv(StringIO(text))\n", + " # NCSS column names look like: time, latitude[unit=\"...\"], longitude[..], wind_speed[unit=\"m/s\"]\n", + " df = df.rename(columns={c: c.split('[')[0].strip() for c in df.columns})\n", + " df['date'] = pd.to_datetime(df['time']).dt.tz_localize(None).dt.normalize()\n", + " # NCSS CSV does NOT apply scale_factor for packed shorts; apply here.\n", + " df['wind_speed_ms'] = pd.to_numeric(df['wind_speed'], errors='coerce') * GRIDMET_SCALE_FACTOR\n", + " if grid_lat is None and 'latitude' in df.columns:\n", + " grid_lat = float(df['latitude'].iloc[0])\n", + " grid_lon = float(df['longitude'].iloc[0])\n", + " frames.append(df[['date', 'wind_speed_ms']])\n", + " result = ({'grid_lat': grid_lat, 'grid_lon': grid_lon}, pd.concat(frames, ignore_index=True))\n", + " GRIDMET_WIND_CACHE[cache_key] = result\n", + " return result\n", + "\n", + "\n", + "def saturation_vapor_pressure_pa(t_c):\n", + " \"\"\"Magnus / Tetens. T in °C, returns saturation vapor pressure in Pa.\"\"\"\n", + " return 611.2 * np.exp(17.67 * t_c / (t_c + 243.5))\n", + "\n", + "\n", + "def relative_humidity_pct(vp_pa, t_c):\n", + " e_sat = saturation_vapor_pressure_pa(t_c)\n", + " rh = 100.0 * vp_pa / e_sat\n", + " return np.clip(rh, 0.0, 100.0)\n", + "\n", + "\n", + "def wet_bulb_temperature_c(t_c, rh_pct):\n", + " \"\"\"Stull (2011) approximation. Valid for 5 ≤ RH ≤ 99 %, −20 ≤ T ≤ +50 °C.\"\"\"\n", + " rh = np.clip(rh_pct, 5.0, 99.0)\n", + " return (\n", + " t_c * np.arctan(0.151977 * np.sqrt(rh + 8.313659))\n", + " + np.arctan(t_c + rh)\n", + " - np.arctan(rh - 1.676331)\n", + " + 0.00391838 * (rh ** 1.5) * np.arctan(0.023101 * rh)\n", + " - 4.686035\n", " )\n", - " summer = weather.loc[weather['month'].isin([6, 7, 8])]\n", "\n", - " annual_precip = annual['annual_precipitation_mm'].dropna()\n", - " precip_cv = np.nan\n", - " if len(annual_precip) > 1 and annual_precip.mean() != 0:\n", - " precip_cv = float(annual_precip.std(ddof=1) / annual_precip.mean())\n", "\n", - " return {\n", + "def _parse_daymet_header(header):\n", + " def f(s):\n", + " try: return float(str(s).split()[0])\n", + " except Exception: return None\n", + " elevation_m = f(header.get('Elevation', '').replace('meters', '').strip())\n", + " tile = None\n", + " try: tile = int(header.get('Tile', '').strip())\n", + " except Exception: pass\n", + " xy = header.get('X & Y on Lambert Conformal Conic', '')\n", + " daymet_x = daymet_y = None\n", + " if xy:\n", + " parts = xy.split()\n", + " try:\n", + " daymet_x = float(parts[0]); daymet_y = float(parts[1])\n", + " except Exception:\n", + " pass\n", + " return tile, elevation_m, daymet_x, daymet_y\n", + "\n", + "\n", + "def summarize_site(dc_row, daymet_header, daymet_df, wind_info):\n", + " df = daymet_df.copy()\n", + " df['tmean'] = (df['tmin'] + df['tmax']) / 2.0\n", + " df['year'] = df['date'].dt.year\n", + " df['month'] = df['date'].dt.month\n", + "\n", + " df['rh_mean_pct'] = relative_humidity_pct(df['vp'], df['tmean'])\n", + " df['rh_at_tmax_pct'] = relative_humidity_pct(df['vp'], df['tmax'])\n", + " df['wet_bulb_mean_c'] = wet_bulb_temperature_c(df['tmean'], df['rh_mean_pct'])\n", + " df['wet_bulb_max_c'] = wet_bulb_temperature_c(df['tmax'], df['rh_at_tmax_pct'])\n", + "\n", + " df['cdd_c'] = (df['tmean'] - CDD_BASE_C).clip(lower=0)\n", + " df['extreme_heat'] = df['tmax'] >= EXTREME_HEAT_THRESHOLD_C\n", + " df['extreme_wet_bulb'] = df['wet_bulb_max_c'] >= EXTREME_WET_BULB_THRESHOLD_C\n", + " df['dtr_c'] = df['tmax'] - df['tmin']\n", + "\n", + " annual = (df.groupby('year', as_index=False)\n", + " .agg(annual_mean_t_c=('tmean','mean'),\n", + " annual_cdd=('cdd_c','sum'),\n", + " annual_extreme_heat=('extreme_heat','sum'),\n", + " annual_prcp_mm=('prcp','sum')))\n", + " summer = df.loc[df['month'].isin([6, 7, 8])]\n", + " wet_days = df.loc[df['prcp'] >= WET_DAY_THRESHOLD_MM, 'prcp']\n", + "\n", + " annual_prcp = annual['annual_prcp_mm'].dropna()\n", + " precip_cv = float(annual_prcp.std(ddof=1) / annual_prcp.mean()) if len(annual_prcp) > 1 and annual_prcp.mean() != 0 else np.nan\n", + "\n", + " tile, elevation_m, daymet_x, daymet_y = _parse_daymet_header(daymet_header)\n", + "\n", + " row = {\n", " 'master_id': dc_row.master_id,\n", - " 'source': dc_row.source,\n", - " 'name': dc_row.name,\n", - " 'operator': dc_row.operator,\n", - " 'city': dc_row.city,\n", - " 'state': dc_row.state,\n", - " 'country': dc_row.country,\n", + " 'source': dc_row.source,\n", + " 'name': dc_row.name,\n", + " 'operator': dc_row.operator,\n", + " 'city': dc_row.city,\n", + " 'state': dc_row.state,\n", + " 'country': dc_row.country,\n", " 'longitude': float(dc_row.longitude),\n", - " 'latitude': float(dc_row.latitude),\n", - " 'open_meteo_model': OPEN_METEO_MODEL,\n", - " 'climate_period_start': START_DATE,\n", - " 'climate_period_end': END_DATE,\n", - " 'api_timezone': payload.get('timezone', TIMEZONE),\n", - " 'api_grid_latitude': payload.get('latitude'),\n", - " 'api_grid_longitude': payload.get('longitude'),\n", - " 'api_elevation_m': payload.get('elevation'),\n", - " 'api_utc_offset_seconds': payload.get('utc_offset_seconds'),\n", - " 'observation_days': int(len(weather)),\n", - " 'observation_years': int(weather['year'].nunique()),\n", - " 'api_generationtime_ms': payload.get('generationtime_ms'),\n", - " 'mean_annual_temperature_c': float(annual['annual_mean_temperature_c'].mean()),\n", - " 'mean_summer_temperature_c': float(summer['temperature_2m_mean'].mean()) if len(summer) else np.nan,\n", - " 'max_daily_temperature_c': float(weather['temperature_2m_max'].max()),\n", - " 'mean_wet_bulb_temperature_c': float(weather['wet_bulb_temperature_2m_mean'].mean()),\n", - " 'max_wet_bulb_temperature_c': float(weather['wet_bulb_temperature_2m_max'].max()),\n", - " 'mean_relative_humidity_pct': float(weather['relative_humidity_2m_mean'].mean()),\n", - " 'cooling_degree_days_c': float(weather['cooling_degree_day_c'].sum()),\n", - " 'annual_cooling_degree_days_c_mean': float(annual['annual_cooling_degree_days_c'].mean()),\n", - " 'extreme_heat_days': int(weather['extreme_heat_day'].sum()),\n", - " 'annual_extreme_heat_days_mean': float(annual['annual_extreme_heat_days'].mean()),\n", - " 'extreme_wet_bulb_days': int(weather['extreme_wet_bulb_day'].sum()),\n", - " 'mean_diurnal_temperature_range_c': float(weather['diurnal_temperature_range_c'].mean()),\n", - " 'precipitation_total_mm': float(weather['precipitation_sum'].sum()),\n", - " 'annual_precipitation_mm_mean': float(annual['annual_precipitation_mm'].mean()),\n", - " 'annual_precipitation_cv': precip_cv,\n", + " 'latitude': float(dc_row.latitude),\n", + "\n", + " 'daymet_dataset_version': DAYMET_DATASET_VERSION,\n", + " 'gridmet_dataset_version': GRIDMET_DATASET_VERSION if wind_info is not None else None,\n", + " 'climate_period_start': START_DATE,\n", + " 'climate_period_end': END_DATE,\n", + " 'daymet_tile': tile,\n", + " 'daymet_elevation_m': elevation_m,\n", + " 'daymet_grid_x_m': daymet_x,\n", + " 'daymet_grid_y_m': daymet_y,\n", + " 'gridmet_grid_latitude': wind_info['grid_lat'] if wind_info is not None else None,\n", + " 'gridmet_grid_longitude': wind_info['grid_lon'] if wind_info is not None else None,\n", + " 'observation_days': int(len(df)),\n", + " 'observation_years': int(df['year'].nunique()),\n", + "\n", + " 'mean_annual_temperature_c': float(annual['annual_mean_t_c'].mean()),\n", + " 'mean_summer_temperature_c': float(summer['tmean'].mean()) if len(summer) else np.nan,\n", + " 'max_daily_temperature_c': float(df['tmax'].max()),\n", + " 'min_daily_temperature_c': float(df['tmin'].min()),\n", + " 'mean_diurnal_temperature_range_c': float(df['dtr_c'].mean()),\n", + "\n", + " 'mean_relative_humidity_pct': float(df['rh_mean_pct'].mean()),\n", + " 'mean_wet_bulb_temperature_c': float(df['wet_bulb_mean_c'].mean()),\n", + " 'max_wet_bulb_temperature_c': float(df['wet_bulb_max_c'].max()),\n", + " 'extreme_wet_bulb_days': int(df['extreme_wet_bulb'].sum()),\n", + "\n", + " 'cooling_degree_days_c': float(df['cdd_c'].sum()),\n", + " 'annual_cooling_degree_days_c_mean': float(annual['annual_cdd'].mean()),\n", + " 'extreme_heat_days': int(df['extreme_heat'].sum()),\n", + " 'annual_extreme_heat_days_mean': float(annual['annual_extreme_heat'].mean()),\n", + "\n", + " 'precipitation_total_mm': float(df['prcp'].sum()),\n", + " 'annual_precipitation_mm_mean': float(annual['annual_prcp_mm'].mean()),\n", + " 'annual_precipitation_cv': precip_cv,\n", " 'wet_day_precipitation_p95_mm': float(wet_days.quantile(0.95)) if len(wet_days) else 0.0,\n", - " 'windstorm_days': int(weather['windstorm_day'].sum()),\n", - " 'annual_windstorm_days_mean': float(annual['annual_windstorm_days'].mean()),\n", - " 'max_wind_gust_ms': float(weather['wind_gusts_10m_max'].max()),\n", - " 'mean_soil_moisture_0_to_100cm': float(weather['soil_moisture_0_to_100cm_mean'].mean()),\n", - " 'drought_severity_index_source_status': 'needs_external_source',\n", + "\n", + " 'mean_wind_speed_ms': None,\n", + " 'max_daily_mean_wind_speed_ms': None,\n", + " 'sustained_wind_days': None,\n", + " 'annual_sustained_wind_days_mean': None,\n", + " 'wind_status': 'pending_gridmet',\n", + "\n", + " 'drought_severity_index_source_status': 'needs_external_source',\n", " 'wildfire_smoke_exposure_source_status': 'needs_external_source',\n", - " 'open_meteo_daily_variables': DAILY_VARIABLES,\n", - " 'open_meteo_url': request_url,\n", + "\n", + " 'daymet_variables': DAYMET_VARIABLES,\n", + " 'daymet_url': daymet_header.get('url'),\n", " }\n", "\n", + " if wind_info is not None:\n", + " wdf = wind_info['data'].copy()\n", + " wdf['year'] = wdf['date'].dt.year\n", + " wdf['sustained'] = wdf['wind_speed_ms'] >= SUSTAINED_WIND_THRESHOLD_MS\n", + " annual_wind = wdf.groupby('year', as_index=False).agg(d=('sustained', 'sum'))\n", + " row.update({\n", + " 'mean_wind_speed_ms': float(wdf['wind_speed_ms'].mean()),\n", + " 'max_daily_mean_wind_speed_ms': float(wdf['wind_speed_ms'].max()),\n", + " 'sustained_wind_days': int(wdf['sustained'].sum()),\n", + " 'annual_sustained_wind_days_mean': float(annual_wind['d'].mean()),\n", + " 'wind_status': 'gridmet',\n", + " })\n", "\n", - "def is_request_too_large_error(exc):\n", - " text = str(exc).lower()\n", - " return 'http 400' in text and 'requests too much data' in text\n", - "\n", - "\n", - "def fetch_batch_summaries(batch):\n", - " url = build_open_meteo_url(batch['latitude'].to_list(), batch['longitude'].to_list())\n", - " try:\n", - " payload = fetch_json(url)\n", - " payloads = payload if isinstance(payload, list) else [payload]\n", - " if len(payloads) != len(batch):\n", - " raise RuntimeError(f'Open-Meteo returned {len(payloads)} payload(s) for {len(batch)} requested location(s)')\n", - " rows = [\n", - " summarize_daily_weather(location_payload, dc_row, url)\n", - " for dc_row, location_payload in zip(batch.itertuples(index=False), payloads)\n", - " ]\n", - " return rows, []\n", - " except Exception as exc:\n", - " if is_request_too_large_error(exc) and len(batch) > 1:\n", - " midpoint = len(batch) // 2\n", - " print(f'Batch of {len(batch)} locations is too large; splitting into {midpoint} + {len(batch) - midpoint}')\n", - " left_rows, left_errors = fetch_batch_summaries(batch.iloc[:midpoint].copy())\n", - " time.sleep(REQUEST_SLEEP_SECONDS)\n", - " right_rows, right_errors = fetch_batch_summaries(batch.iloc[midpoint:].copy())\n", - " return left_rows + right_rows, left_errors + right_errors\n", - " batch_errors = [\n", - " {'master_id': dc_row.master_id, 'error': str(exc), 'url': url}\n", - " for dc_row in batch.itertuples(index=False)\n", - " ]\n", - " return [], batch_errors\n", - "\n", - "\n", - "if targets.empty:\n", - " climate_rows = []\n", - " climate = pd.DataFrame()\n", - " print('No new target rows. Set REFRESH_EXISTING=True to recompute existing rows.')\n", - "else:\n", - " climate_rows = []\n", - " errors = []\n", - " total = len(targets)\n", - " checkpoint_path = Path('output/open_meteo_historical_climate_checkpoint.csv')\n", - " checkpoint_path.parent.mkdir(exist_ok=True)\n", - "\n", - " for batch_start in range(0, total, BATCH_SIZE):\n", - " batch = targets.iloc[batch_start:batch_start + BATCH_SIZE].copy()\n", - " batch_end = batch_start + len(batch)\n", - " batch_rows, batch_errors = fetch_batch_summaries(batch)\n", - " climate_rows.extend(batch_rows)\n", - " errors.extend(batch_errors)\n", - " if batch_errors:\n", - " print(f'ERROR batch {batch_start + 1:,}-{batch_end:,}: {batch_errors[0][\"error\"]}')\n", - " if '429' in batch_errors[0]['error'] or 'Too Many Requests' in batch_errors[0]['error'] or 'Hourly API limit' in batch_errors[0]['error']:\n", - " print('Open-Meteo rate limit reached; stopping this run so successes can be saved before retrying later.')\n", - " break\n", - "\n", - " if climate_rows:\n", - " pd.DataFrame(climate_rows).to_csv(checkpoint_path, index=False)\n", - " print(f'Checkpointed {len(climate_rows):,} row(s) to {checkpoint_path}')\n", - " print(f'Processed {batch_end:,}/{total:,}; successes={len(climate_rows):,}; errors={len(errors):,}')\n", - " time.sleep(REQUEST_SLEEP_SECONDS)\n", - "\n", - " climate = pd.DataFrame(climate_rows)\n", - " error_log = pd.DataFrame(errors)\n", - " print(f'Summarized {len(climate):,} climate row(s); errors={len(error_log):,}')\n", - " display(climate.head())\n", - " if not error_log.empty:\n", - " display(error_log)\n" + " return row\n" ] }, { @@ -579,7 +648,7 @@ "id": "11", "metadata": {}, "source": [ - "## Upsert to PostGIS" + "## Upsert Helper" ] }, { @@ -590,35 +659,49 @@ "outputs": [], "source": [ "UPSERT_COLUMNS = [\n", - " 'master_id', 'source', 'name', 'operator', 'city', 'state', 'country', 'longitude', 'latitude',\n", - " 'open_meteo_model', 'climate_period_start', 'climate_period_end', 'api_timezone',\n", - " 'api_grid_latitude', 'api_grid_longitude', 'api_elevation_m', 'api_utc_offset_seconds',\n", - " 'observation_days', 'observation_years', 'api_generationtime_ms',\n", - " 'mean_annual_temperature_c', 'mean_summer_temperature_c', 'max_daily_temperature_c',\n", - " 'mean_wet_bulb_temperature_c', 'max_wet_bulb_temperature_c', 'mean_relative_humidity_pct',\n", + " 'master_id', 'source', 'name', 'operator', 'city', 'state', 'country',\n", + " 'longitude', 'latitude',\n", + " 'daymet_dataset_version', 'gridmet_dataset_version',\n", + " 'climate_period_start', 'climate_period_end',\n", + " 'daymet_tile', 'daymet_elevation_m', 'daymet_grid_x_m', 'daymet_grid_y_m',\n", + " 'gridmet_grid_latitude', 'gridmet_grid_longitude',\n", + " 'observation_days', 'observation_years',\n", + " 'mean_annual_temperature_c', 'mean_summer_temperature_c',\n", + " 'max_daily_temperature_c', 'min_daily_temperature_c',\n", + " 'mean_diurnal_temperature_range_c',\n", + " 'mean_relative_humidity_pct',\n", + " 'mean_wet_bulb_temperature_c', 'max_wet_bulb_temperature_c',\n", + " 'extreme_wet_bulb_days',\n", " 'cooling_degree_days_c', 'annual_cooling_degree_days_c_mean',\n", - " 'extreme_heat_days', 'annual_extreme_heat_days_mean', 'extreme_wet_bulb_days',\n", - " 'mean_diurnal_temperature_range_c', 'precipitation_total_mm',\n", - " 'annual_precipitation_mm_mean', 'annual_precipitation_cv', 'wet_day_precipitation_p95_mm',\n", - " 'windstorm_days', 'annual_windstorm_days_mean', 'max_wind_gust_ms',\n", - " 'mean_soil_moisture_0_to_100cm', 'drought_severity_index_source_status',\n", - " 'wildfire_smoke_exposure_source_status', 'open_meteo_daily_variables', 'open_meteo_url',\n", + " 'extreme_heat_days', 'annual_extreme_heat_days_mean',\n", + " 'precipitation_total_mm', 'annual_precipitation_mm_mean',\n", + " 'annual_precipitation_cv', 'wet_day_precipitation_p95_mm',\n", + " 'mean_wind_speed_ms', 'max_daily_mean_wind_speed_ms',\n", + " 'sustained_wind_days', 'annual_sustained_wind_days_mean',\n", + " 'wind_status',\n", + " 'drought_severity_index_source_status',\n", + " 'wildfire_smoke_exposure_source_status',\n", + " 'daymet_variables', 'daymet_url',\n", "]\n", "\n", "\n", "def clean_db_value(value):\n", " if isinstance(value, (list, tuple)):\n", " return list(value)\n", + " if value is None:\n", + " return None\n", " if isinstance(value, float) and np.isnan(value):\n", " return None\n", - " if pd.isna(value):\n", - " return None\n", + " try:\n", + " if pd.isna(value):\n", + " return None\n", + " except (TypeError, ValueError):\n", + " pass\n", " return value\n", "\n", "\n", "def upsert_climate_rows(rows):\n", " if not rows:\n", - " print('No rows to upsert.')\n", " return\n", "\n", " values = [\n", @@ -628,8 +711,7 @@ " col_sql = sql.SQL(', ').join(map(sql.Identifier, UPSERT_COLUMNS))\n", " update_assignments = sql.SQL(', ').join(\n", " sql.SQL('{} = excluded.{}').format(sql.Identifier(col), sql.Identifier(col))\n", - " for col in UPSERT_COLUMNS\n", - " if col != 'master_id'\n", + " for col in UPSERT_COLUMNS if col != 'master_id'\n", " )\n", "\n", " insert_sql = sql.SQL('''\n", @@ -637,7 +719,7 @@ " values %s\n", " on conflict (master_id) do update set\n", " {updates},\n", - " geom = excluded.geom,\n", + " geom = excluded.geom,\n", " fetched_at = now(),\n", " updated_at = now()\n", " ''').format(\n", @@ -647,19 +729,13 @@ " )\n", "\n", " template = '(' + ', '.join(['%s'] * len(UPSERT_COLUMNS)) + ', ST_SetSRID(ST_MakePoint(%s, %s), 4326), now())'\n", - " values_with_geom = [row + (row[UPSERT_COLUMNS.index('longitude')], row[UPSERT_COLUMNS.index('latitude')]) for row in values]\n", + " lon_i = UPSERT_COLUMNS.index('longitude')\n", + " lat_i = UPSERT_COLUMNS.index('latitude')\n", + " values_with_geom = [r + (r[lon_i], r[lat_i]) for r in values]\n", "\n", " with get_conn() as conn:\n", " with conn.cursor() as cur:\n", - " execute_values(cur, insert_sql.as_string(conn), values_with_geom, template=template, page_size=500)\n", - " cur.execute(sql.SQL('analyze {}').format(sql.SQL(CLIMATE_TABLE)))\n", - " print(f'Upserted {len(rows):,} row(s) into {CLIMATE_TABLE}')\n", - "\n", - "\n", - "if DRY_RUN:\n", - " print('DRY_RUN=True; database was not modified.')\n", - "else:\n", - " upsert_climate_rows(climate_rows)\n" + " execute_values(cur, insert_sql.as_string(conn), values_with_geom, template=template, page_size=500)\n" ] }, { @@ -667,7 +743,13 @@ "id": "13", "metadata": {}, "source": [ - "## Inspect Results" + "## Fetch + Upsert (per site, with checkpointing)\n", + "\n", + "Each site is fetched, summarized, **and upserted to Postgres immediately**. That way:\n", + "\n", + "- A mid-run crash never loses more than the current site.\n", + "- Re-running with `REFRESH_EXISTING=False` skips everything already in `data_center_historical_climate` (filtered in `load_target_points`).\n", + "- The CSV checkpoint at `output/historical_climate_checkpoint.csv` is purely a debug artifact; the DB is the source of truth.\n" ] }, { @@ -677,55 +759,133 @@ "metadata": {}, "outputs": [], "source": [ - "summary_query = f'''\n", - "select\n", - " count(*) as rows,\n", - " min(climate_period_start) as min_period_start,\n", - " max(climate_period_end) as max_period_end,\n", - " count(*) filter (where drought_severity_index_source_status = 'needs_external_source') as drought_external_needed,\n", - " count(*) filter (where wildfire_smoke_exposure_source_status = 'needs_external_source') as smoke_external_needed\n", - "from {CLIMATE_TABLE}\n", - "where open_meteo_model = %s\n", - " and climate_period_start = %s::date\n", - " and climate_period_end = %s::date\n", - "'''\n", + "if targets.empty:\n", + " climate_rows = []\n", + " print('No new target rows. Set REFRESH_EXISTING=True to recompute existing rows.')\n", + "else:\n", + " climate_rows = []\n", + " errors = []\n", + " total = len(targets)\n", + " checkpoint_path = Path('output/historical_climate_checkpoint.csv')\n", + " checkpoint_path.parent.mkdir(exist_ok=True)\n", "\n", - "sample_query = f'''\n", - "select\n", - " c.master_id,\n", - " c.name,\n", - " c.state,\n", - " c.mean_annual_temperature_c,\n", - " c.mean_summer_temperature_c,\n", - " c.cooling_degree_days_c,\n", - " c.extreme_heat_days,\n", - " c.annual_precipitation_cv,\n", - " c.windstorm_days\n", - "from {CLIMATE_TABLE} c\n", - "order by c.cooling_degree_days_c desc nulls last\n", - "limit 20\n", - "'''\n", + " for i, dc_row in enumerate(targets.itertuples(index=False), start=1):\n", + " try:\n", + " daymet_header, daymet_df = fetch_daymet(dc_row.latitude, dc_row.longitude)\n", + " wind_info = None\n", + " wind_skip_reason = None\n", + " if INCLUDE_WIND:\n", + " if is_in_gridmet_conus(dc_row.latitude, dc_row.longitude):\n", + " grid_meta, wind_df = fetch_gridmet_wind(dc_row.latitude, dc_row.longitude)\n", + " wind_info = {\n", + " 'grid_lat': grid_meta['grid_lat'],\n", + " 'grid_lon': grid_meta['grid_lon'],\n", + " 'data': wind_df,\n", + " }\n", + " else:\n", + " wind_skip_reason = 'unavailable_outside_conus'\n", + " row = summarize_site(dc_row, daymet_header, daymet_df, wind_info)\n", + " if wind_skip_reason is not None:\n", + " row['wind_status'] = wind_skip_reason\n", + " climate_rows.append(row)\n", "\n", - "with get_conn() as conn:\n", - " summary = pd.read_sql_query(summary_query, conn, params=(OPEN_METEO_MODEL, START_DATE, END_DATE))\n", - " sample = pd.read_sql_query(sample_query, conn)\n", + " if not DRY_RUN:\n", + " upsert_climate_rows([row])\n", + " pd.DataFrame(climate_rows).to_csv(checkpoint_path, index=False)\n", "\n", - "display(summary)\n", - "display(sample)\n" + " print(f'{i:>5,}/{total:,} {dc_row.master_id} '\n", + " f'T_mean={row[\"mean_annual_temperature_c\"]:.2f}°C '\n", + " f'CDD={row[\"cooling_degree_days_c\"]:,.0f} '\n", + " f'wind={row[\"wind_status\"]} '\n", + " f'successes={len(climate_rows):,} errors={len(errors):,}')\n", + " except Exception as exc:\n", + " errors.append({'master_id': dc_row.master_id, 'error': str(exc)})\n", + " print(f'ERROR {i:>5,}/{total:,} ({dc_row.master_id}): {exc}')\n", + "\n", + " time.sleep(REQUEST_SLEEP_SECONDS)\n", + "\n", + " climate = pd.DataFrame(climate_rows)\n", + " error_log = pd.DataFrame(errors)\n", + " print(f'\\nDone: {len(climate):,} successes / {len(error_log):,} errors')\n", + " if INCLUDE_WIND:\n", + " h = _GRIDMET_CACHE_STATS['hits']; m = _GRIDMET_CACHE_STATS['misses']\n", + " tot = h + m\n", + " print(f'gridMET cache: {h:,} hits / {m:,} misses ({100*h/tot:.1f}% hit rate, {len(GRIDMET_WIND_CACHE):,} unique cells)' if tot else 'gridMET cache: no calls')\n", + " if not climate.empty:\n", + " display(climate.head())\n", + " if not error_log.empty:\n", + " display(error_log)\n" ] }, { "cell_type": "markdown", "id": "15", "metadata": {}, + "source": [ + "## Inspect Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": {}, + "outputs": [], + "source": [ + "summary_query = f'''\n", + "select\n", + " count(*) as rows,\n", + " min(climate_period_start) as min_period_start,\n", + " max(climate_period_end) as max_period_end,\n", + " count(*) filter (where wind_status = 'gridmet') as with_wind,\n", + " count(*) filter (where wind_status = 'pending_gridmet') as wind_pending,\n", + " count(*) filter (where drought_severity_index_source_status = 'needs_external_source') as drought_external_needed,\n", + " count(*) filter (where wildfire_smoke_exposure_source_status = 'needs_external_source') as smoke_external_needed\n", + "from {CLIMATE_TABLE}\n", + "where daymet_dataset_version = %s\n", + " and climate_period_start = %s::date\n", + " and climate_period_end = %s::date\n", + "'''\n", + "\n", + "sample_query = f'''\n", + "select\n", + " c.master_id, c.name, c.state,\n", + " round(c.mean_annual_temperature_c::numeric, 2) as mean_t_c,\n", + " round(c.mean_summer_temperature_c::numeric, 2) as summer_t_c,\n", + " round(c.mean_relative_humidity_pct::numeric, 1) as rh_pct,\n", + " round(c.mean_wet_bulb_temperature_c::numeric, 2) as wb_mean_c,\n", + " round(c.cooling_degree_days_c::numeric, 0) as cdd_total,\n", + " c.extreme_heat_days,\n", + " round(c.annual_precipitation_cv::numeric, 3) as precip_cv,\n", + " round(c.mean_wind_speed_ms::numeric, 2) as wind_mean_ms,\n", + " c.sustained_wind_days,\n", + " c.wind_status\n", + "from {CLIMATE_TABLE} c\n", + "order by c.cooling_degree_days_c desc nulls last\n", + "limit 20\n", + "'''\n", + "\n", + "with get_conn() as conn:\n", + " summary = pd.read_sql_query(summary_query, conn, params=(DAYMET_DATASET_VERSION, START_DATE, END_DATE))\n", + " sample = pd.read_sql_query(sample_query, conn)\n", + "\n", + "display(summary)\n", + "display(sample)\n" + ] + }, + { + "cell_type": "markdown", + "id": "17", + "metadata": {}, "source": [ "## Notes for the Next Data Sources\n", "\n", - "Open-Meteo gets us the weather/climate burden metrics and a wind-gust proxy for windstorm frequency. These still need better external sources:\n", + "Daymet (T/precip/VP at 1 km) + gridMET (wind at 4 km) cover the core weather/climate burden metrics for CONUS data centers. Two metrics still need better external sources:\n", "\n", - "- Drought severity index: NOAA/NCEI climate division or gridMET/USDM-derived drought measures are better candidates.\n", - "- Wildfire smoke exposure: NOAA HMS smoke, EPA AirNow/monitor-derived PM2.5, or satellite smoke products are better candidates.\n", - "- Windstorm frequency: the current field is a reanalysis gust threshold proxy. FEMA/NCEI storm events can be added later as observed hazard-event counts.\n" + "- **Drought severity index** — gridMET also publishes `pdsi` (Palmer Drought Severity Index) and `etr` (reference ET) at the same THREDDS server. A follow-up cell can pull PDSI percentiles using the same NCSS pattern as `vs` here. Alternatively, the US Drought Monitor (USDM) shapefiles give weekly drought categories that can be spatially joined to the DC point.\n", + "- **Wildfire smoke exposure** — NOAA HMS smoke shapefiles (daily plume polygons, 2005–present) or EPA AirNow PM2.5 monitor data. Smoke-day counts would be directly comparable to the heat-day metric here.\n", + "\n", + "**Metric change vs. the prior Open-Meteo implementation.** The old `windstorm_days` field counted days where the daily *max wind gust* exceeded 17.2 m/s (~38 mph, ERA5 reanalysis gust). gridMET reports only daily *mean* wind, not gusts, so the new `sustained_wind_days` field counts days with daily-mean wind ≥ 8 m/s (~18 mph). These are fundamentally different signals — do not compare values across the two columns. If true gust/storm event counts are needed, NOAA's Storm Events Database is the appropriate next source.\n" ] } ], diff --git a/open_meteo_historical_data_centers.ipynb b/open_meteo_historical_data_centers.ipynb new file mode 100644 index 0000000..a60ceb2 --- /dev/null +++ b/open_meteo_historical_data_centers.ipynb @@ -0,0 +1,854 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": {}, + "source": [ + "# Historical Climate for Master Data Centers (Daymet + gridMET)\n", + "\n", + "This notebook builds or refreshes `public.data_center_historical_climate`, a one-row-per-data-center climate summary keyed by `master_id`.\n", + "\n", + "**Source switch on 2026-05-20.** Previously this notebook used Open-Meteo's archive API, but a 30-year × 9-variable request consumes ~7,000 of the free 10,000 daily call budget, limiting the run to ~1 site/day. Two free, generous federal sources replace it:\n", + "\n", + "- **Daymet v4 R1** (ORNL DAAC): daily 1 km gridded `tmin`, `tmax`, `prcp`, `vp` (vapor pressure) for North America, 1980–present. Pulled via the single-pixel REST API.\n", + "- **gridMET / METDATA** (U. Idaho): daily 4 km gridded daily-mean wind speed (`vs`) for CONUS, 1979–present. Pulled via THREDDS NetCDF Subset Service as CSV (no `xarray` dependency).\n", + "\n", + "Climate normal period: **1991–2020** (standard WMO baseline).\n", + "\n", + "| Metric | Field(s) | Source / derivation |\n", + "| --- | --- | --- |\n", + "| Mean annual / summer temperature | `mean_annual_temperature_c`, `mean_summer_temperature_c` | Daymet `(tmin + tmax) / 2` |\n", + "| Max / min daily temperature | `max_daily_temperature_c`, `min_daily_temperature_c` | Daymet `tmax`, `tmin` |\n", + "| Diurnal temperature range | `mean_diurnal_temperature_range_c` | Daymet `tmax − tmin` |\n", + "| Relative humidity | `mean_relative_humidity_pct` | Derived `100 × vp / e_sat(T)` via Magnus formula |\n", + "| Wet bulb temperature | `mean_wet_bulb_temperature_c`, `max_wet_bulb_temperature_c`, `extreme_wet_bulb_days` | Stull (2011) approximation from T + derived RH |\n", + "| Cooling degree days | `cooling_degree_days_c`, `annual_cooling_degree_days_c_mean` | Σ max(0, T_mean − 18.3 °C) |\n", + "| Extreme heat days | `extreme_heat_days`, `annual_extreme_heat_days_mean` | Count days with `tmax ≥ 35 °C` |\n", + "| Precipitation | `precipitation_total_mm`, `annual_precipitation_mm_mean`, `annual_precipitation_cv`, `wet_day_precipitation_p95_mm` | Daymet `prcp` aggregated |\n", + "| Wind | `mean_wind_speed_ms`, `max_daily_mean_wind_speed_ms`, `sustained_wind_days`, `annual_sustained_wind_days_mean` | gridMET `vs` (daily-mean wind). **Sustained-wind day = daily mean ≥ 8 m/s** — this is *not* the same as the old Open-Meteo gust threshold; see notes at end. |\n", + "| Drought severity, wildfire smoke | `*_source_status` columns | External sources needed (USDM, gridMET PDSI, NOAA HMS) |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import math\n", + "import os\n", + "import time\n", + "import urllib.error\n", + "import urllib.parse\n", + "import urllib.request\n", + "from io import StringIO\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import psycopg2\n", + "from psycopg2 import sql\n", + "from psycopg2.extras import execute_values\n", + "\n", + "pd.set_option('display.max_columns', 100)\n", + "pd.set_option('display.max_rows', 120)\n", + "\n", + "print('pandas:', pd.__version__)\n", + "print('numpy:', np.__version__)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "def load_env_file(env_path: str = '.env') -> None:\n", + " p = Path(env_path)\n", + " if not p.exists():\n", + " print(f'No {env_path} file found in {Path.cwd()}')\n", + " return\n", + " loaded = 0\n", + " for raw_line in p.read_text(encoding='utf-8').splitlines():\n", + " line = raw_line.strip()\n", + " if not line or line.startswith('#') or '=' not in line:\n", + " continue\n", + " key, value = line.split('=', 1)\n", + " key = key.strip()\n", + " value = value.strip().strip('\"').strip(\"'\")\n", + " if key and key not in os.environ:\n", + " os.environ[key] = value\n", + " loaded += 1\n", + " print(f'Loaded {loaded} env var(s) from {env_path}')\n", + "\n", + "\n", + "def require_env(keys):\n", + " missing = [k for k in keys if not os.getenv(k)]\n", + " if missing:\n", + " raise EnvironmentError(\n", + " 'Missing required env vars in notebook kernel: ' + ', '.join(missing) +\n", + " '.\\nSet them in this notebook, or add them to a .env file in this folder.'\n", + " )\n", + "\n", + "\n", + "load_env_file('.env')\n", + "required_keys = ['PGWEB_HOST', 'PGWEB_PORT', 'PGWEB_USER', 'PGWEB_PASSWORD']\n", + "require_env(required_keys)\n", + "\n", + "DB_NAME = os.getenv('PGDATABASE', 'data_centers')\n", + "MASTER_TABLE = 'public.master_data_centers'\n", + "CLIMATE_TABLE = 'public.data_center_historical_climate'\n", + "\n", + "\n", + "def get_conn():\n", + " return psycopg2.connect(\n", + " host=os.environ['PGWEB_HOST'],\n", + " port=os.environ['PGWEB_PORT'],\n", + " user=os.environ['PGWEB_USER'],\n", + " password=os.environ['PGWEB_PASSWORD'],\n", + " dbname='data_centers',\n", + " )\n", + "\n", + "\n", + "with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute('select current_database(), current_user')\n", + " db, usr = cur.fetchone()\n", + " print('Connected to DB:', db)\n", + " print('As user:', usr)\n", + " cur.execute('create extension if not exists postgis')\n", + " cur.execute('select to_regclass(%s)', (MASTER_TABLE,))\n", + " if cur.fetchone()[0] is None:\n", + " raise RuntimeError(f'{MASTER_TABLE} does not exist. Run build_master_data_centers.py first.')\n", + " cur.execute(sql.SQL('select count(*) from {}').format(sql.SQL(MASTER_TABLE)))\n", + " print(f'{MASTER_TABLE} rows:', f'{cur.fetchone()[0]:,}')\n" + ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": {}, + "source": [ + "## Parameters\n", + "\n", + "Climate normal period defaults to **1991–2020**, matching the current WMO baseline. Both Daymet and gridMET cover this range fully.\n", + "\n", + "- `REFRESH_EXISTING=True` recomputes existing rows (e.g. after a threshold change).\n", + "- `INCLUDE_WIND=False` skips the per-site gridMET pass; wind columns stay null and `wind_status='pending_gridmet'`. Useful for a fast first pass since Daymet alone is one HTTP call per site, while gridMET is ~30 (one per year).\n", + "- `MAX_POINTS=3` for testing, `None` for all eligible sites.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": {}, + "outputs": [], + "source": [ + "# ---- Daymet (T, precip, vapor pressure) ----\n", + "DAYMET_API_URL = 'https://daymet.ornl.gov/single-pixel/api/data'\n", + "DAYMET_VARIABLES = ['tmin', 'tmax', 'prcp', 'vp']\n", + "DAYMET_DATASET_VERSION = 'daymet_v4r1'\n", + "\n", + "# ---- gridMET (daily-mean wind) ----\n", + "GRIDMET_NCSS_URL_TEMPLATE = 'https://thredds.northwestknowledge.net/thredds/ncss/MET/vs/vs_{year}.nc'\n", + "GRIDMET_VARIABLE = 'wind_speed'\n", + "GRIDMET_SCALE_FACTOR = 0.1 # NCSS CSV returns packed short ints; multiply to get m/s\n", + "GRIDMET_DATASET_VERSION = 'gridmet_vs'\n", + "\n", + "# ---- Period ----\n", + "START_DATE = '1991-01-01'\n", + "END_DATE = '2020-12-31'\n", + "START_YEAR = int(START_DATE[:4])\n", + "END_YEAR = int(END_DATE[:4])\n", + "\n", + "# ---- Run control ----\n", + "REFRESH_EXISTING = False\n", + "INCLUDE_WIND = True\n", + "DRY_RUN = False\n", + "MAX_POINTS = 3 # small int for testing; None for all eligible\n", + "REQUEST_SLEEP_SECONDS = 0.5 # politeness pause between sites\n", + "REQUEST_TIMEOUT_SECONDS = 90\n", + "MAX_RETRIES = 5\n", + "\n", + "# ---- Metric thresholds ----\n", + "CDD_BASE_C = 18.3\n", + "EXTREME_HEAT_THRESHOLD_C = 35.0\n", + "EXTREME_WET_BULB_THRESHOLD_C = 28.0\n", + "SUSTAINED_WIND_THRESHOLD_MS = 8.0 # ~18 mph daily-mean wind. NOT a gust threshold.\n", + "WET_DAY_THRESHOLD_MM = 1.0\n", + "\n", + "print(f'Period: {START_DATE} to {END_DATE}')\n", + "print(f'Target table: {CLIMATE_TABLE}')\n", + "print(f'Daymet vars: {DAYMET_VARIABLES}')\n", + "print(f'Include wind: {INCLUDE_WIND} (gridMET sustained-wind threshold = {SUSTAINED_WIND_THRESHOLD_MS} m/s daily mean)')\n", + "print(f'Refresh: {REFRESH_EXISTING}')\n", + "print(f'Max sites: {MAX_POINTS}')\n" + ] + }, + { + "cell_type": "markdown", + "id": "5", + "metadata": {}, + "source": [ + "## Create Target Table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": {}, + "outputs": [], + "source": [ + "CREATE_CLIMATE_TABLE_SQL = f'''\n", + "create table if not exists {CLIMATE_TABLE} (\n", + " master_id text primary key references public.master_data_centers(master_id) on delete cascade,\n", + " source text,\n", + " name text,\n", + " operator text,\n", + " city text,\n", + " state text,\n", + " country text,\n", + " longitude double precision not null,\n", + " latitude double precision not null,\n", + " geom geometry(Point, 4326),\n", + "\n", + " daymet_dataset_version text not null,\n", + " gridmet_dataset_version text,\n", + " climate_period_start date not null,\n", + " climate_period_end date not null,\n", + " daymet_tile integer,\n", + " daymet_elevation_m double precision,\n", + " daymet_grid_x_m double precision,\n", + " daymet_grid_y_m double precision,\n", + " gridmet_grid_latitude double precision,\n", + " gridmet_grid_longitude double precision,\n", + " observation_days integer not null,\n", + " observation_years integer not null,\n", + "\n", + " -- temperature\n", + " mean_annual_temperature_c double precision,\n", + " mean_summer_temperature_c double precision,\n", + " max_daily_temperature_c double precision,\n", + " min_daily_temperature_c double precision,\n", + " mean_diurnal_temperature_range_c double precision,\n", + "\n", + " -- humidity / wet-bulb (derived from Daymet T + vp)\n", + " mean_relative_humidity_pct double precision,\n", + " mean_wet_bulb_temperature_c double precision,\n", + " max_wet_bulb_temperature_c double precision,\n", + " extreme_wet_bulb_days integer,\n", + "\n", + " -- cooling load / heat extremes\n", + " cooling_degree_days_c double precision,\n", + " annual_cooling_degree_days_c_mean double precision,\n", + " extreme_heat_days integer,\n", + " annual_extreme_heat_days_mean double precision,\n", + "\n", + " -- precipitation\n", + " precipitation_total_mm double precision,\n", + " annual_precipitation_mm_mean double precision,\n", + " annual_precipitation_cv double precision,\n", + " wet_day_precipitation_p95_mm double precision,\n", + "\n", + " -- wind (gridMET daily-mean, NOT gust)\n", + " mean_wind_speed_ms double precision,\n", + " max_daily_mean_wind_speed_ms double precision,\n", + " sustained_wind_days integer,\n", + " annual_sustained_wind_days_mean double precision,\n", + " wind_status text not null default 'pending_gridmet',\n", + "\n", + " -- external-source placeholders\n", + " drought_severity_index_source_status text not null default 'needs_external_source',\n", + " wildfire_smoke_exposure_source_status text not null default 'needs_external_source',\n", + "\n", + " -- provenance\n", + " daymet_variables text[] not null,\n", + " daymet_url text,\n", + " fetched_at timestamptz not null default now(),\n", + " updated_at timestamptz not null default now()\n", + ");\n", + "\n", + "create index if not exists data_center_hist_climate_geom_gix\n", + " on {CLIMATE_TABLE} using gist (geom);\n", + "create index if not exists data_center_hist_climate_state_idx\n", + " on {CLIMATE_TABLE} (state);\n", + "create index if not exists data_center_hist_climate_period_idx\n", + " on {CLIMATE_TABLE} (climate_period_start, climate_period_end);\n", + "'''\n", + "\n", + "with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute(CREATE_CLIMATE_TABLE_SQL)\n", + " cur.execute(sql.SQL('select count(*) from {}').format(sql.SQL(CLIMATE_TABLE)))\n", + " existing = cur.fetchone()[0]\n", + " print(f'{CLIMATE_TABLE} currently has {existing:,} row(s)')\n" + ] + }, + { + "cell_type": "markdown", + "id": "7", + "metadata": {}, + "source": [ + "## Load Data Centers Needing Climate Metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": {}, + "outputs": [], + "source": [ + "def load_target_points(refresh_existing=False, max_points=None):\n", + " exists_filter = ''\n", + " params = []\n", + " if not refresh_existing:\n", + " exists_filter = f'''\n", + " and not exists (\n", + " select 1\n", + " from {CLIMATE_TABLE} c\n", + " where c.master_id = m.master_id\n", + " and c.daymet_dataset_version = %s\n", + " and c.climate_period_start = %s::date\n", + " and c.climate_period_end = %s::date\n", + " )\n", + " '''\n", + " params = [DAYMET_DATASET_VERSION, START_DATE, END_DATE]\n", + "\n", + " limit_sql = ''\n", + " if max_points is not None:\n", + " limit_sql = 'limit %s'\n", + " params.append(int(max_points))\n", + "\n", + " query = f'''\n", + " select\n", + " m.master_id, m.source, m.name, m.operator,\n", + " m.city, m.state, m.country,\n", + " m.longitude, m.latitude\n", + " from {MASTER_TABLE} m\n", + " where m.longitude is not null\n", + " and m.latitude is not null\n", + " and m.geom is not null\n", + " -- Daymet covers North America bounding box\n", + " and m.longitude between -178 and -52\n", + " and m.latitude between 14 and 83\n", + " {exists_filter}\n", + " order by m.master_id\n", + " {limit_sql}\n", + " '''\n", + " with get_conn() as conn:\n", + " return pd.read_sql_query(query, conn, params=params)\n", + "\n", + "\n", + "targets = load_target_points(refresh_existing=REFRESH_EXISTING, max_points=MAX_POINTS)\n", + "print(f'Target data centers to fetch: {len(targets):,}')\n", + "display(targets.head())\n" + ] + }, + { + "cell_type": "markdown", + "id": "9", + "metadata": {}, + "source": [ + "## Fetchers and derived-metric helpers\n", + "\n", + "`fetch_daymet` issues one single-pixel REST call for the full 30-year period. `fetch_gridmet_wind` issues one NCSS CSV call per year (Daymet has no equivalent multi-year aggregated endpoint we can rely on; per-year fetches keep each response small).\n", + "\n", + "Wet bulb is derived in two steps:\n", + "1. **RH from Daymet vp** using the Magnus saturation-vapor-pressure formula.\n", + "2. **Wet bulb from T + RH** using the Stull (2011) approximation (valid 5 % ≤ RH ≤ 99 %, −20 °C ≤ T ≤ +50 °C).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10", + "metadata": {}, + "outputs": [], + "source": [ + "def http_get(url, max_retries=MAX_RETRIES, timeout=REQUEST_TIMEOUT_SECONDS):\n", + " last_error = None\n", + " for attempt in range(1, max_retries + 1):\n", + " try:\n", + " req = urllib.request.Request(url, headers={'User-Agent': 'data-center-climate-notebook/1.1'})\n", + " with urllib.request.urlopen(req, timeout=timeout) as resp:\n", + " return resp.read().decode('utf-8')\n", + " except urllib.error.HTTPError as exc:\n", + " last_error = exc\n", + " try:\n", + " body = exc.read().decode('utf-8', errors='replace')[:200]\n", + " except Exception:\n", + " body = ''\n", + " if exc.code in (400, 404):\n", + " raise RuntimeError(f'HTTP {exc.code} on {url}: {body or exc.reason}')\n", + " sleep_for = min(2 ** attempt, 60)\n", + " print(f' HTTP {exc.code} attempt {attempt}/{max_retries}: {body}; sleeping {sleep_for}s')\n", + " time.sleep(sleep_for)\n", + " except (urllib.error.URLError, TimeoutError) as exc:\n", + " last_error = exc\n", + " sleep_for = min(2 ** attempt, 60)\n", + " print(f' Network error attempt {attempt}/{max_retries}: {exc}; sleeping {sleep_for}s')\n", + " time.sleep(sleep_for)\n", + " raise RuntimeError(f'Request failed after {max_retries} attempt(s) for {url}: {last_error}')\n", + "\n", + "\n", + "def fetch_daymet(lat, lon):\n", + " \"\"\"Return (header_dict, dataframe) for one point covering START_DATE..END_DATE.\"\"\"\n", + " params = {\n", + " 'lat': f'{float(lat):.6f}',\n", + " 'lon': f'{float(lon):.6f}',\n", + " 'vars': ','.join(DAYMET_VARIABLES),\n", + " 'start': START_DATE,\n", + " 'end': END_DATE,\n", + " 'format': 'csv',\n", + " }\n", + " url = DAYMET_API_URL + '?' + urllib.parse.urlencode(params)\n", + " text = http_get(url)\n", + " # Daymet CSV preamble: Latitude / Longitude / X & Y / Tile / Elevation / \"All years; ...\" / cite line\n", + " lines = text.splitlines()\n", + " header = {'url': url}\n", + " data_start = None\n", + " for i, line in enumerate(lines):\n", + " if line.strip().startswith('year,'):\n", + " data_start = i\n", + " break\n", + " if ':' in line:\n", + " k, v = line.split(':', 1)\n", + " header[k.strip()] = v.strip()\n", + " if data_start is None:\n", + " raise RuntimeError(f'Daymet CSV missing column header. First 500 chars: {text[:500]}')\n", + " df = pd.read_csv(StringIO('\\n'.join(lines[data_start:])))\n", + " # Normalize column names: 'prcp (mm/day)' -> 'prcp', etc.\n", + " df.columns = [c.split(' ')[0] for c in df.columns]\n", + " # Daymet uses 365-day calendar (Dec 31 dropped on leap years), yday is 1..365\n", + " df['date'] = pd.to_datetime(df['year'].astype(str), format='%Y') + pd.to_timedelta(df['yday'] - 1, unit='D')\n", + " return header, df\n", + "\n", + "\n", + "def fetch_gridmet_wind(lat, lon):\n", + " \"\"\"Return (meta_dict, dataframe[date, wind_speed_ms]) covering START_YEAR..END_YEAR.\n", + "\n", + " Issues one NCSS CSV request per year (~30 requests, ~3 KB each).\"\"\"\n", + " frames = []\n", + " grid_lat = grid_lon = None\n", + " for year in range(START_YEAR, END_YEAR + 1):\n", + " url = (GRIDMET_NCSS_URL_TEMPLATE.format(year=year) +\n", + " f'?var={GRIDMET_VARIABLE}'\n", + " f'&latitude={float(lat):.6f}&longitude={float(lon):.6f}'\n", + " f'&time_start={year}-01-01T00:00:00Z&time_end={year}-12-31T23:59:59Z'\n", + " f'&accept=csv')\n", + " text = http_get(url)\n", + " df = pd.read_csv(StringIO(text))\n", + " # NCSS column names look like: time, latitude[unit=\"...\"], longitude[..], wind_speed[unit=\"m/s\"]\n", + " df = df.rename(columns={c: c.split('[')[0].strip() for c in df.columns})\n", + " df['date'] = pd.to_datetime(df['time']).dt.tz_localize(None).dt.normalize()\n", + " # NCSS CSV does NOT apply scale_factor for packed shorts; apply here.\n", + " df['wind_speed_ms'] = pd.to_numeric(df['wind_speed'], errors='coerce') * GRIDMET_SCALE_FACTOR\n", + " if grid_lat is None and 'latitude' in df.columns:\n", + " grid_lat = float(df['latitude'].iloc[0])\n", + " grid_lon = float(df['longitude'].iloc[0])\n", + " frames.append(df[['date', 'wind_speed_ms']])\n", + " return {'grid_lat': grid_lat, 'grid_lon': grid_lon}, pd.concat(frames, ignore_index=True)\n", + "\n", + "\n", + "def saturation_vapor_pressure_pa(t_c):\n", + " \"\"\"Magnus / Tetens. T in °C, returns saturation vapor pressure in Pa.\"\"\"\n", + " return 611.2 * np.exp(17.67 * t_c / (t_c + 243.5))\n", + "\n", + "\n", + "def relative_humidity_pct(vp_pa, t_c):\n", + " e_sat = saturation_vapor_pressure_pa(t_c)\n", + " rh = 100.0 * vp_pa / e_sat\n", + " return np.clip(rh, 0.0, 100.0)\n", + "\n", + "\n", + "def wet_bulb_temperature_c(t_c, rh_pct):\n", + " \"\"\"Stull (2011) approximation. Valid for 5 ≤ RH ≤ 99 %, −20 ≤ T ≤ +50 °C.\"\"\"\n", + " rh = np.clip(rh_pct, 5.0, 99.0)\n", + " return (\n", + " t_c * np.arctan(0.151977 * np.sqrt(rh + 8.313659))\n", + " + np.arctan(t_c + rh)\n", + " - np.arctan(rh - 1.676331)\n", + " + 0.00391838 * (rh ** 1.5) * np.arctan(0.023101 * rh)\n", + " - 4.686035\n", + " )\n", + "\n", + "\n", + "def _parse_daymet_header(header):\n", + " def f(s):\n", + " try: return float(str(s).split()[0])\n", + " except Exception: return None\n", + " elevation_m = f(header.get('Elevation', '').replace('meters', '').strip())\n", + " tile = None\n", + " try: tile = int(header.get('Tile', '').strip())\n", + " except Exception: pass\n", + " xy = header.get('X & Y on Lambert Conformal Conic', '')\n", + " daymet_x = daymet_y = None\n", + " if xy:\n", + " parts = xy.split()\n", + " try:\n", + " daymet_x = float(parts[0]); daymet_y = float(parts[1])\n", + " except Exception:\n", + " pass\n", + " return tile, elevation_m, daymet_x, daymet_y\n", + "\n", + "\n", + "def summarize_site(dc_row, daymet_header, daymet_df, wind_info):\n", + " df = daymet_df.copy()\n", + " df['tmean'] = (df['tmin'] + df['tmax']) / 2.0\n", + " df['year'] = df['date'].dt.year\n", + " df['month'] = df['date'].dt.month\n", + "\n", + " df['rh_mean_pct'] = relative_humidity_pct(df['vp'], df['tmean'])\n", + " df['rh_at_tmax_pct'] = relative_humidity_pct(df['vp'], df['tmax'])\n", + " df['wet_bulb_mean_c'] = wet_bulb_temperature_c(df['tmean'], df['rh_mean_pct'])\n", + " df['wet_bulb_max_c'] = wet_bulb_temperature_c(df['tmax'], df['rh_at_tmax_pct'])\n", + "\n", + " df['cdd_c'] = (df['tmean'] - CDD_BASE_C).clip(lower=0)\n", + " df['extreme_heat'] = df['tmax'] >= EXTREME_HEAT_THRESHOLD_C\n", + " df['extreme_wet_bulb'] = df['wet_bulb_max_c'] >= EXTREME_WET_BULB_THRESHOLD_C\n", + " df['dtr_c'] = df['tmax'] - df['tmin']\n", + "\n", + " annual = (df.groupby('year', as_index=False)\n", + " .agg(annual_mean_t_c=('tmean','mean'),\n", + " annual_cdd=('cdd_c','sum'),\n", + " annual_extreme_heat=('extreme_heat','sum'),\n", + " annual_prcp_mm=('prcp','sum')))\n", + " summer = df.loc[df['month'].isin([6, 7, 8])]\n", + " wet_days = df.loc[df['prcp'] >= WET_DAY_THRESHOLD_MM, 'prcp']\n", + "\n", + " annual_prcp = annual['annual_prcp_mm'].dropna()\n", + " precip_cv = float(annual_prcp.std(ddof=1) / annual_prcp.mean()) if len(annual_prcp) > 1 and annual_prcp.mean() != 0 else np.nan\n", + "\n", + " tile, elevation_m, daymet_x, daymet_y = _parse_daymet_header(daymet_header)\n", + "\n", + " row = {\n", + " 'master_id': dc_row.master_id,\n", + " 'source': dc_row.source,\n", + " 'name': dc_row.name,\n", + " 'operator': dc_row.operator,\n", + " 'city': dc_row.city,\n", + " 'state': dc_row.state,\n", + " 'country': dc_row.country,\n", + " 'longitude': float(dc_row.longitude),\n", + " 'latitude': float(dc_row.latitude),\n", + "\n", + " 'daymet_dataset_version': DAYMET_DATASET_VERSION,\n", + " 'gridmet_dataset_version': GRIDMET_DATASET_VERSION if wind_info is not None else None,\n", + " 'climate_period_start': START_DATE,\n", + " 'climate_period_end': END_DATE,\n", + " 'daymet_tile': tile,\n", + " 'daymet_elevation_m': elevation_m,\n", + " 'daymet_grid_x_m': daymet_x,\n", + " 'daymet_grid_y_m': daymet_y,\n", + " 'gridmet_grid_latitude': wind_info['grid_lat'] if wind_info is not None else None,\n", + " 'gridmet_grid_longitude': wind_info['grid_lon'] if wind_info is not None else None,\n", + " 'observation_days': int(len(df)),\n", + " 'observation_years': int(df['year'].nunique()),\n", + "\n", + " 'mean_annual_temperature_c': float(annual['annual_mean_t_c'].mean()),\n", + " 'mean_summer_temperature_c': float(summer['tmean'].mean()) if len(summer) else np.nan,\n", + " 'max_daily_temperature_c': float(df['tmax'].max()),\n", + " 'min_daily_temperature_c': float(df['tmin'].min()),\n", + " 'mean_diurnal_temperature_range_c': float(df['dtr_c'].mean()),\n", + "\n", + " 'mean_relative_humidity_pct': float(df['rh_mean_pct'].mean()),\n", + " 'mean_wet_bulb_temperature_c': float(df['wet_bulb_mean_c'].mean()),\n", + " 'max_wet_bulb_temperature_c': float(df['wet_bulb_max_c'].max()),\n", + " 'extreme_wet_bulb_days': int(df['extreme_wet_bulb'].sum()),\n", + "\n", + " 'cooling_degree_days_c': float(df['cdd_c'].sum()),\n", + " 'annual_cooling_degree_days_c_mean': float(annual['annual_cdd'].mean()),\n", + " 'extreme_heat_days': int(df['extreme_heat'].sum()),\n", + " 'annual_extreme_heat_days_mean': float(annual['annual_extreme_heat'].mean()),\n", + "\n", + " 'precipitation_total_mm': float(df['prcp'].sum()),\n", + " 'annual_precipitation_mm_mean': float(annual['annual_prcp_mm'].mean()),\n", + " 'annual_precipitation_cv': precip_cv,\n", + " 'wet_day_precipitation_p95_mm': float(wet_days.quantile(0.95)) if len(wet_days) else 0.0,\n", + "\n", + " 'mean_wind_speed_ms': None,\n", + " 'max_daily_mean_wind_speed_ms': None,\n", + " 'sustained_wind_days': None,\n", + " 'annual_sustained_wind_days_mean': None,\n", + " 'wind_status': 'pending_gridmet',\n", + "\n", + " 'drought_severity_index_source_status': 'needs_external_source',\n", + " 'wildfire_smoke_exposure_source_status': 'needs_external_source',\n", + "\n", + " 'daymet_variables': DAYMET_VARIABLES,\n", + " 'daymet_url': daymet_header.get('url'),\n", + " }\n", + "\n", + " if wind_info is not None:\n", + " wdf = wind_info['data'].copy()\n", + " wdf['year'] = wdf['date'].dt.year\n", + " wdf['sustained'] = wdf['wind_speed_ms'] >= SUSTAINED_WIND_THRESHOLD_MS\n", + " annual_wind = wdf.groupby('year', as_index=False).agg(d=('sustained', 'sum'))\n", + " row.update({\n", + " 'mean_wind_speed_ms': float(wdf['wind_speed_ms'].mean()),\n", + " 'max_daily_mean_wind_speed_ms': float(wdf['wind_speed_ms'].max()),\n", + " 'sustained_wind_days': int(wdf['sustained'].sum()),\n", + " 'annual_sustained_wind_days_mean': float(annual_wind['d'].mean()),\n", + " 'wind_status': 'gridmet',\n", + " })\n", + "\n", + " return row\n" + ] + }, + { + "cell_type": "markdown", + "id": "11", + "metadata": {}, + "source": [ + "## Upsert Helper" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": {}, + "outputs": [], + "source": [ + "UPSERT_COLUMNS = [\n", + " 'master_id', 'source', 'name', 'operator', 'city', 'state', 'country',\n", + " 'longitude', 'latitude',\n", + " 'daymet_dataset_version', 'gridmet_dataset_version',\n", + " 'climate_period_start', 'climate_period_end',\n", + " 'daymet_tile', 'daymet_elevation_m', 'daymet_grid_x_m', 'daymet_grid_y_m',\n", + " 'gridmet_grid_latitude', 'gridmet_grid_longitude',\n", + " 'observation_days', 'observation_years',\n", + " 'mean_annual_temperature_c', 'mean_summer_temperature_c',\n", + " 'max_daily_temperature_c', 'min_daily_temperature_c',\n", + " 'mean_diurnal_temperature_range_c',\n", + " 'mean_relative_humidity_pct',\n", + " 'mean_wet_bulb_temperature_c', 'max_wet_bulb_temperature_c',\n", + " 'extreme_wet_bulb_days',\n", + " 'cooling_degree_days_c', 'annual_cooling_degree_days_c_mean',\n", + " 'extreme_heat_days', 'annual_extreme_heat_days_mean',\n", + " 'precipitation_total_mm', 'annual_precipitation_mm_mean',\n", + " 'annual_precipitation_cv', 'wet_day_precipitation_p95_mm',\n", + " 'mean_wind_speed_ms', 'max_daily_mean_wind_speed_ms',\n", + " 'sustained_wind_days', 'annual_sustained_wind_days_mean',\n", + " 'wind_status',\n", + " 'drought_severity_index_source_status',\n", + " 'wildfire_smoke_exposure_source_status',\n", + " 'daymet_variables', 'daymet_url',\n", + "]\n", + "\n", + "\n", + "def clean_db_value(value):\n", + " if isinstance(value, (list, tuple)):\n", + " return list(value)\n", + " if value is None:\n", + " return None\n", + " if isinstance(value, float) and np.isnan(value):\n", + " return None\n", + " try:\n", + " if pd.isna(value):\n", + " return None\n", + " except (TypeError, ValueError):\n", + " pass\n", + " return value\n", + "\n", + "\n", + "def upsert_climate_rows(rows):\n", + " if not rows:\n", + " return\n", + "\n", + " values = [\n", + " tuple(clean_db_value(row.get(col)) for col in UPSERT_COLUMNS)\n", + " for row in rows\n", + " ]\n", + " col_sql = sql.SQL(', ').join(map(sql.Identifier, UPSERT_COLUMNS))\n", + " update_assignments = sql.SQL(', ').join(\n", + " sql.SQL('{} = excluded.{}').format(sql.Identifier(col), sql.Identifier(col))\n", + " for col in UPSERT_COLUMNS if col != 'master_id'\n", + " )\n", + "\n", + " insert_sql = sql.SQL('''\n", + " insert into {table} ({cols}, geom, updated_at)\n", + " values %s\n", + " on conflict (master_id) do update set\n", + " {updates},\n", + " geom = excluded.geom,\n", + " fetched_at = now(),\n", + " updated_at = now()\n", + " ''').format(\n", + " table=sql.SQL(CLIMATE_TABLE),\n", + " cols=col_sql,\n", + " updates=update_assignments,\n", + " )\n", + "\n", + " template = '(' + ', '.join(['%s'] * len(UPSERT_COLUMNS)) + ', ST_SetSRID(ST_MakePoint(%s, %s), 4326), now())'\n", + " lon_i = UPSERT_COLUMNS.index('longitude')\n", + " lat_i = UPSERT_COLUMNS.index('latitude')\n", + " values_with_geom = [r + (r[lon_i], r[lat_i]) for r in values]\n", + "\n", + " with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " execute_values(cur, insert_sql.as_string(conn), values_with_geom, template=template, page_size=500)\n" + ] + }, + { + "cell_type": "markdown", + "id": "13", + "metadata": {}, + "source": [ + "## Fetch + Upsert (per site, with checkpointing)\n", + "\n", + "Each site is fetched, summarized, **and upserted to Postgres immediately**. That way:\n", + "\n", + "- A mid-run crash never loses more than the current site.\n", + "- Re-running with `REFRESH_EXISTING=False` skips everything already in `data_center_historical_climate` (filtered in `load_target_points`).\n", + "- The CSV checkpoint at `output/historical_climate_checkpoint.csv` is purely a debug artifact; the DB is the source of truth.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": {}, + "outputs": [], + "source": [ + "if targets.empty:\n", + " climate_rows = []\n", + " print('No new target rows. Set REFRESH_EXISTING=True to recompute existing rows.')\n", + "else:\n", + " climate_rows = []\n", + " errors = []\n", + " total = len(targets)\n", + " checkpoint_path = Path('output/historical_climate_checkpoint.csv')\n", + " checkpoint_path.parent.mkdir(exist_ok=True)\n", + "\n", + " for i, dc_row in enumerate(targets.itertuples(index=False), start=1):\n", + " try:\n", + " daymet_header, daymet_df = fetch_daymet(dc_row.latitude, dc_row.longitude)\n", + " wind_info = None\n", + " if INCLUDE_WIND:\n", + " grid_meta, wind_df = fetch_gridmet_wind(dc_row.latitude, dc_row.longitude)\n", + " wind_info = {\n", + " 'grid_lat': grid_meta['grid_lat'],\n", + " 'grid_lon': grid_meta['grid_lon'],\n", + " 'data': wind_df,\n", + " }\n", + " row = summarize_site(dc_row, daymet_header, daymet_df, wind_info)\n", + " climate_rows.append(row)\n", + "\n", + " if not DRY_RUN:\n", + " upsert_climate_rows([row])\n", + " pd.DataFrame(climate_rows).to_csv(checkpoint_path, index=False)\n", + "\n", + " print(f'{i:>5,}/{total:,} {dc_row.master_id} '\n", + " f'T_mean={row[\"mean_annual_temperature_c\"]:.2f}°C '\n", + " f'CDD={row[\"cooling_degree_days_c\"]:,.0f} '\n", + " f'wind={row[\"wind_status\"]} '\n", + " f'successes={len(climate_rows):,} errors={len(errors):,}')\n", + " except Exception as exc:\n", + " errors.append({'master_id': dc_row.master_id, 'error': str(exc)})\n", + " print(f'ERROR {i:>5,}/{total:,} ({dc_row.master_id}): {exc}')\n", + "\n", + " time.sleep(REQUEST_SLEEP_SECONDS)\n", + "\n", + " climate = pd.DataFrame(climate_rows)\n", + " error_log = pd.DataFrame(errors)\n", + " print(f'\\nDone: {len(climate):,} successes / {len(error_log):,} errors')\n", + " if not climate.empty:\n", + " display(climate.head())\n", + " if not error_log.empty:\n", + " display(error_log)\n" + ] + }, + { + "cell_type": "markdown", + "id": "15", + "metadata": {}, + "source": [ + "## Inspect Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": {}, + "outputs": [], + "source": [ + "summary_query = f'''\n", + "select\n", + " count(*) as rows,\n", + " min(climate_period_start) as min_period_start,\n", + " max(climate_period_end) as max_period_end,\n", + " count(*) filter (where wind_status = 'gridmet') as with_wind,\n", + " count(*) filter (where wind_status = 'pending_gridmet') as wind_pending,\n", + " count(*) filter (where drought_severity_index_source_status = 'needs_external_source') as drought_external_needed,\n", + " count(*) filter (where wildfire_smoke_exposure_source_status = 'needs_external_source') as smoke_external_needed\n", + "from {CLIMATE_TABLE}\n", + "where daymet_dataset_version = %s\n", + " and climate_period_start = %s::date\n", + " and climate_period_end = %s::date\n", + "'''\n", + "\n", + "sample_query = f'''\n", + "select\n", + " c.master_id, c.name, c.state,\n", + " round(c.mean_annual_temperature_c::numeric, 2) as mean_t_c,\n", + " round(c.mean_summer_temperature_c::numeric, 2) as summer_t_c,\n", + " round(c.mean_relative_humidity_pct::numeric, 1) as rh_pct,\n", + " round(c.mean_wet_bulb_temperature_c::numeric, 2) as wb_mean_c,\n", + " round(c.cooling_degree_days_c::numeric, 0) as cdd_total,\n", + " c.extreme_heat_days,\n", + " round(c.annual_precipitation_cv::numeric, 3) as precip_cv,\n", + " round(c.mean_wind_speed_ms::numeric, 2) as wind_mean_ms,\n", + " c.sustained_wind_days,\n", + " c.wind_status\n", + "from {CLIMATE_TABLE} c\n", + "order by c.cooling_degree_days_c desc nulls last\n", + "limit 20\n", + "'''\n", + "\n", + "with get_conn() as conn:\n", + " summary = pd.read_sql_query(summary_query, conn, params=(DAYMET_DATASET_VERSION, START_DATE, END_DATE))\n", + " sample = pd.read_sql_query(sample_query, conn)\n", + "\n", + "display(summary)\n", + "display(sample)\n" + ] + }, + { + "cell_type": "markdown", + "id": "17", + "metadata": {}, + "source": [ + "## Notes for the Next Data Sources\n", + "\n", + "Daymet (T/precip/VP at 1 km) + gridMET (wind at 4 km) cover the core weather/climate burden metrics for CONUS data centers. Two metrics still need better external sources:\n", + "\n", + "- **Drought severity index** — gridMET also publishes `pdsi` (Palmer Drought Severity Index) and `etr` (reference ET) at the same THREDDS server. A follow-up cell can pull PDSI percentiles using the same NCSS pattern as `vs` here. Alternatively, the US Drought Monitor (USDM) shapefiles give weekly drought categories that can be spatially joined to the DC point.\n", + "- **Wildfire smoke exposure** — NOAA HMS smoke shapefiles (daily plume polygons, 2005–present) or EPA AirNow PM2.5 monitor data. Smoke-day counts would be directly comparable to the heat-day metric here.\n", + "\n", + "**Metric change vs. the prior Open-Meteo implementation.** The old `windstorm_days` field counted days where the daily *max wind gust* exceeded 17.2 m/s (~38 mph, ERA5 reanalysis gust). gridMET reports only daily *mean* wind, not gusts, so the new `sustained_wind_days` field counts days with daily-mean wind ≥ 8 m/s (~18 mph). These are fundamentally different signals — do not compare values across the two columns. If true gust/storm event counts are needed, NOAA's Storm Events Database is the appropriate next source.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.14.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/output/historical_climate_checkpoint.csv b/output/historical_climate_checkpoint.csv new file mode 100644 index 0000000..8ff66aa --- /dev/null +++ b/output/historical_climate_checkpoint.csv @@ -0,0 +1,14 @@ +master_id,source,name,operator,city,state,country,longitude,latitude,daymet_dataset_version,gridmet_dataset_version,climate_period_start,climate_period_end,daymet_tile,daymet_elevation_m,daymet_grid_x_m,daymet_grid_y_m,gridmet_grid_latitude,gridmet_grid_longitude,observation_days,observation_years,mean_annual_temperature_c,mean_summer_temperature_c,max_daily_temperature_c,min_daily_temperature_c,mean_diurnal_temperature_range_c,mean_relative_humidity_pct,mean_wet_bulb_temperature_c,max_wet_bulb_temperature_c,extreme_wet_bulb_days,cooling_degree_days_c,annual_cooling_degree_days_c_mean,extreme_heat_days,annual_extreme_heat_days_mean,precipitation_total_mm,annual_precipitation_mm_mean,annual_precipitation_cv,wet_day_precipitation_p95_mm,mean_wind_speed_ms,max_daily_mean_wind_speed_ms,sustained_wind_days,annual_sustained_wind_days_mean,wind_status,drought_severity_index_source_status,wildfire_smoke_exposure_source_status,daymet_variables,daymet_url +curated/0334264478,merged,Terminal Commerce Building,,Philadelphia,PA,United States,-75.16083249176855,39.959809526359834,daymet_v4r1,gridmet_vs,1991-01-01,2020-12-31,11573,24.0,1997266.57,30406.34,39.95981,-75.160832,10950,30,13.0764200913242,24.108543478260867,39.26,-21.31,10.672155251141554,70.36852671711473,9.856640970016384,29.9621687216794,29,20429.369999999995,680.9789999999999,136,4.533333333333333,38118.32,1270.6106666666667,0.18090604364227492,31.89199999999997,4.084504471618909,12.5,319.0,10.633333333333333,gridmet,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=39.959810&lon=-75.160832&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv +curated/0335468616,merged,TierPoint,,,WA,United States,-117.09593477622663,47.67191686714767,daymet_v4r1,gridmet_vs,1991-01-01,2020-12-31,12272,630.0,-1219024.56,673810.47,47.671917,-117.095935,10950,30,9.016546575342465,19.093291666666666,40.09,-24.9,11.943931506849315,60.23195446572226,4.710395504299232,23.37884791052112,0,6053.609999999999,201.78699999999992,138,4.6,17468.8,582.2933333333333,0.1747350658066581,14.115999999999985,3.2801697390034685,11.5,70.0,2.3333333333333335,gridmet,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=47.671917&lon=-117.095935&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv +curated/0336387583,merged,Baltimore Technology Park,,Baltimore,MD,United States,-76.62552663391301,39.27538391165092,daymet_v4r1,gridmet_vs,1991-01-01,2020-12-31,11572,9.0,1902773.82,-73433.06,39.275384,-76.625527,10950,30,14.10395799086758,24.99128079710145,39.98,-20.47,10.228728767123286,71.35921072562688,10.911169237092297,30.943426341757874,42,24231.6,807.72,223,7.433333333333334,34409.44,1146.9813333333332,0.20849552531726018,34.736,3.715641540427085,13.5,198.0,6.6,gridmet,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=39.275384&lon=-76.625527&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv +curated/0337126739,merged,Intergate Seattle West Building C,Sabey,Tukwila,WA,United States,-122.29440690516351,47.494131187713045,daymet_v4r1,gridmet_vs,1991-01-01,2020-12-31,12269,76.0,-1586884.55,742889.74,47.494131,-122.294407,10950,30,11.485044292237442,18.09163224637681,39.32,-9.97,8.688176255707763,71.0624487473783,8.27376698621232,23.00645940700761,0,3180.924999999999,106.03083333333329,10,0.3333333333333333,32506.78,1083.5593333333334,0.176673934909212,21.625499999999985,4.2742288738820955,15.5,893.0,29.766666666666666,gridmet,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=47.494131&lon=-122.294407&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv +curated/0337126741,merged,Intergate Seattle West Building B,Sabey,Tukwila,WA,United States,-122.29368392327885,47.49307629033908,daymet_v4r1,gridmet_vs,1991-01-01,2020-12-31,12269,76.0,-1586863.93,742767.84,47.494131,-122.294407,10950,30,11.485044292237442,18.09163224637681,39.32,-9.97,8.688176255707763,71.0624487473783,8.27376698621232,23.00645940700761,0,3180.924999999999,106.03083333333329,10,0.3333333333333333,32506.78,1083.5593333333334,0.176673934909212,21.625499999999985,4.2742288738820955,15.5,893.0,29.766666666666666,gridmet,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=47.493076&lon=-122.293684&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv +curated/10014176940,merged,Microsoft,Microsoft,Guaynabo,PR,United States,-66.10898950000002,18.417435900000033,daymet_v4r1,,1991-01-01,2020-12-31,9777,16.0,3621410.85,-1879111.77,,,10950,30,25.81547488584475,27.144874999999992,34.91,15.07,9.11396894977169,75.78506213352443,22.51809641313038,27.63820127157118,0,82294.45,2743.148333333333,0,0.0,54395.869999999995,1813.195666666667,0.18200132344404926,28.888999999999996,,,,,unavailable_outside_conus,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=18.417436&lon=-66.108990&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv +curated/1219083552,merged,,,,IL,United States,-87.97218622171745,42.02314332706368,daymet_v4r1,gridmet_vs,1991-01-01,2020-12-31,11927,213.0,946754.31,17864.2,42.023143,-87.972186,10950,30,9.97390776255708,22.144297101449272,39.74,-30.46,10.23993789954338,70.50654029097984,6.957823466604632,29.99370463545738,7,13932.794999999998,464.4264999999999,49,1.6333333333333333,30357.16,1011.9053333333334,0.16092047671431148,29.023999999999976,4.649790107683884,13.5,541.0,18.033333333333335,gridmet,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=42.023143&lon=-87.972186&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv +curated/1219083554,merged,T5 Chicago II,T5,Elk Grove Village,IL,United States,-87.97555687597257,42.0230423332657,daymet_v4r1,gridmet_vs,1991-01-01,2020-12-31,11927,213.0,946492.34,17815.36,42.023143,-87.972186,10950,30,9.97390776255708,22.144297101449272,39.74,-30.46,10.23993789954338,70.50654029097984,6.957823466604632,29.99370463545738,7,13932.794999999998,464.4264999999999,49,1.6333333333333333,30357.16,1011.9053333333334,0.16092047671431148,29.023999999999976,4.649790107683884,13.5,541.0,18.033333333333335,gridmet,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=42.023042&lon=-87.975557&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv +curated/1221862807,merged,AWS IAD106,Amazon Web Services,,VA,United States,-77.51852812758844,38.74697497598997,daymet_v4r1,gridmet_vs,1991-01-01,2020-12-31,11572,63.0,1846819.9,-147489.22,38.746975,-77.518528,10950,30,13.476520547945201,24.138014492753623,39.12,-21.22,11.741119634703196,67.90200557268788,9.986608175884161,28.819002377816997,11,20837.155,694.5718333333332,158,5.266666666666667,35247.11,1174.9036666666666,0.18100162932466146,31.305999999999983,3.9398339112976823,15.8,251.0,8.366666666666667,gridmet,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=38.746975&lon=-77.518528&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv +curated/1222007663,merged,Google,Google,,OH,United States,-82.76452657909626,40.05963281203005,daymet_v4r1,gridmet_vs,1991-01-01,2020-12-31,11749,331.0,1394405.54,-114024.63,40.059633,-82.764527,10950,30,10.846063470319633,21.955298913043478,36.59,-32.84,11.388786301369864,68.46078792715129,7.576812559382761,26.776242217926022,0,13165.859999999997,438.86199999999985,10,0.3333333333333333,35712.899999999994,1190.4299999999998,0.13646519991876516,25.45099999999998,4.499023544442417,13.4,511.0,17.033333333333335,gridmet,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=40.059633&lon=-82.764527&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv +curated/9721703993,merged,TierPoint Milwaukee,,,WI,United States,-87.9610618,43.00937789999995,daymet_v4r1,gridmet_vs,1991-01-01,2020-12-31,11927,200.0,932598.66,121371.64,43.009378,-87.961062,10950,30,8.919163013698629,21.042980072463767,40.5,-32.12,9.755436529680365,71.38282568630439,6.0722708536298375,28.858298868205022,3,10552.029999999999,351.7343333333334,24,0.8,27385.6,912.8533333333336,0.15443990630439305,26.619999999999944,5.007145464500821,14.200000000000001,902.0,30.066666666666666,gridmet,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=43.009378&lon=-87.961062&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv +curated/9736262934,merged,Frontier Science,,Amherst,NY,United States,-78.8123836,42.990201099999986,daymet_v4r1,gridmet_vs,1991-01-01,2020-12-31,11931,180.0,1629871.78,260041.46,42.990201,-78.812384,10950,30,9.30234885844749,20.74546739130435,35.38,-24.55,9.541263926940639,72.05315950094813,6.4648431777879365,26.91148035798725,0,9479.474999999999,315.98249999999996,1,0.03333333333333333,30978.7,1032.6233333333334,0.13719318866566355,21.611499999999992,4.415814929731703,14.700000000000001,654.0,21.8,gridmet,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=42.990201&lon=-78.812384&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv +osm/way/365941471,osm,,Claro,,PR,US,-66.0516873,18.446827,daymet_v4r1,,1991-01-01,2020-12-31,9777,11.0,3625853.61,-1873516.13,,,10950,30,25.889257534246575,27.205469202898552,35.01,15.38,8.787603652968036,76.53890427533007,22.69437372495814,27.817986972115694,0,83102.37,2770.0789999999993,1,0.03333333333333333,53693.689999999995,1789.7896666666666,0.17483046745643102,28.772000000000006,,,,,unavailable_outside_conus,needs_external_source,needs_external_source,"['tmin', 'tmax', 'prcp', 'vp']",https://daymet.ornl.gov/single-pixel/api/data?lat=18.446827&lon=-66.051687&vars=tmin%2Ctmax%2Cprcp%2Cvp&start=1991-01-01&end=2020-12-31&format=csv diff --git a/rdh_precinct_vote_data_centers.ipynb b/rdh_precinct_vote_data_centers.ipynb new file mode 100644 index 0000000..bd60495 --- /dev/null +++ b/rdh_precinct_vote_data_centers.ipynb @@ -0,0 +1,1210 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": {}, + "source": [ + "# RDH Precinct Vote Data for Master Data Centers\n", + "\n", + "This notebook discovers, downloads, stages, and spatially joins Redistricting Data Hub precinct-level election data to `public.master_data_centers`.\n", + "\n", + "It is designed to be rerunnable as new data centers are added:\n", + "- Data-center locations come from `public.master_data_centers`.\n", + "- RDH credentials are read from `RDH_USERNAME` / `RDH_PASSWORD` or prompted securely with `getpass`.\n", + "- RDH files are downloaded into `data/rdh_precinct_vote/`.\n", + "- Original precinct attributes are preserved in `properties jsonb` because RDH vote-column names vary by state/year/election.\n", + "- Matches are written to `public.data_center_rdh_precinct_vote_matches`, joinable by `master_id`.\n", + "\n", + "Primary join method is point-in-precinct using longitude/latitude. Census tract context is included as a fallback/diagnostic path when the existing census tract table is available.\n", + "\n", + "Local RDH API reference used for this notebook:\n", + "- `/home/dadams/Repos/api-redistricting_datahub/RDH_API.ipynb`\n", + "- `/home/dadams/Repos/api-redistricting_datahub/RDH_API_SET_PARAMS.ipynb`\n", + "- `/home/dadams/Repos/api-redistricting_datahub/Download_API.txt`\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": {}, + "outputs": [], + "source": [ + "import hashlib\n", + "import io\n", + "import json\n", + "import os\n", + "import re\n", + "import shutil\n", + "import subprocess\n", + "import time\n", + "import zipfile\n", + "from getpass import getpass\n", + "from pathlib import Path\n", + "from urllib.parse import parse_qs, urlparse, unquote\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import psycopg2\n", + "import requests\n", + "from psycopg2 import sql\n", + "from psycopg2.extras import Json, execute_values\n", + "\n", + "try:\n", + " import geopandas as gpd\n", + " HAS_GEOPANDAS = True\n", + "except ImportError:\n", + " gpd = None\n", + " HAS_GEOPANDAS = False\n", + "\n", + "pd.set_option('display.max_columns', 120)\n", + "pd.set_option('display.max_rows', 120)\n", + "\n", + "print('pandas:', pd.__version__)\n", + "print('requests:', requests.__version__)\n", + "print('geopandas:', 'ok' if HAS_GEOPANDAS else 'not installed; spatial file loading cells will be skipped')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "def load_env_file(env_path: str = '.env') -> None:\n", + " p = Path(env_path)\n", + " if not p.exists():\n", + " print(f'No {env_path} file found in {Path.cwd()}')\n", + " return\n", + "\n", + " loaded = 0\n", + " for raw_line in p.read_text(encoding='utf-8').splitlines():\n", + " line = raw_line.strip()\n", + " if not line or line.startswith('#') or '=' not in line:\n", + " continue\n", + " key, value = line.split('=', 1)\n", + " key = key.strip()\n", + " value = value.strip().strip('\"').strip(\"'\")\n", + " if key and key not in os.environ:\n", + " os.environ[key] = value\n", + " loaded += 1\n", + " print(f'Loaded {loaded} env var(s) from {env_path}')\n", + "\n", + "\n", + "def require_env(keys):\n", + " missing = [k for k in keys if not os.getenv(k)]\n", + " if missing:\n", + " raise EnvironmentError(\n", + " 'Missing required env vars in notebook kernel: ' + ', '.join(missing) +\n", + " '.\\nSet them in this notebook, or add them to a .env file in this folder.'\n", + " )\n", + "\n", + "\n", + "load_env_file('.env')\n", + "require_env(['PGWEB_HOST', 'PGWEB_PORT', 'PGWEB_USER', 'PGWEB_PASSWORD'])\n", + "\n", + "DB_NAME = 'data_centers'\n", + "MASTER_TABLE = 'public.master_data_centers'\n", + "CENSUS_TRACT_TABLE = 'public.data_center_census_tracts_2024'\n", + "\n", + "LAYER_TABLE = 'public.rdh_precinct_vote_layers'\n", + "FEATURE_TABLE = 'public.rdh_precinct_vote_features'\n", + "MATCH_TABLE = 'public.data_center_rdh_precinct_vote_matches'\n", + "\n", + "\n", + "def get_conn():\n", + " return psycopg2.connect(\n", + " host=os.environ['PGWEB_HOST'],\n", + " port=os.environ['PGWEB_PORT'],\n", + " user=os.environ['PGWEB_USER'],\n", + " password=os.environ['PGWEB_PASSWORD'],\n", + " dbname=DB_NAME,\n", + " )\n", + "\n", + "\n", + "with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute('select current_database(), current_user')\n", + " db, usr = cur.fetchone()\n", + " print('Connected to DB:', db)\n", + " print('As user:', usr)\n", + " cur.execute('create extension if not exists postgis')\n", + " cur.execute('select to_regclass(%s)', (MASTER_TABLE,))\n", + " if cur.fetchone()[0] is None:\n", + " raise RuntimeError(f'{MASTER_TABLE} does not exist. Run build_master_data_centers.py first.')\n", + " cur.execute(sql.SQL('select count(*) from {}').format(sql.SQL(MASTER_TABLE)))\n", + " print(f'{MASTER_TABLE} rows:', f'{cur.fetchone()[0]:,}')" + ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": {}, + "source": [ + "## Parameters\n", + "\n", + "The defaults run a small real pilot for Virginia 2020, because Virginia has many data centers in the master table and a statewide precinct layer should produce visible matches. After the pilot works, broaden `TARGET_STATES` and `FILTER_YEARS_ANY`. Use `TARGET_STATES = None` to infer all states from `public.master_data_centers`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": {}, + "outputs": [], + "source": [ + "RDH_LIST_URL = 'https://redistrictingdatahub.org/wp-json/download/list'\n", + "\n", + "DATA_DIR = Path('data/rdh_precinct_vote')\n", + "RAW_DIR = DATA_DIR / 'raw'\n", + "EXTRACT_DIR = DATA_DIR / 'extracted'\n", + "MANIFEST_PATH = DATA_DIR / 'rdh_precinct_vote_download_manifest.csv'\n", + "LISTING_CACHE_PATH = DATA_DIR / 'rdh_precinct_vote_listing_cache.csv'\n", + "\n", + "TARGET_STATES = None # None = infer all states from master_data_centers; or list e.g. ['VA','TX']\n", + "FILTER_TERMS_ALL = ['election results', 'precinct']\n", + "FILTER_TERMS_ANY = [] # e.g. ['general', 'president']\n", + "FILTER_YEARS_ANY = ['2020'] # pilot first; empty keeps all years returned by RDH\n", + "PREFERRED_FORMATS = ['SHP'] # point-in-precinct joins need spatial files\n", + "\n", + "DOWNLOAD_FILES = True\n", + "OVERWRITE_DOWNLOADS = False\n", + "LOAD_TO_POSTGIS = True\n", + "RUN_SPATIAL_MATCH = True\n", + "RUN_NEAREST_PRECINCT_FALLBACK = True\n", + "NEAREST_PRECINCT_MAX_DISTANCE_M = 500\n", + "\n", + "REQUEST_SLEEP_SECONDS = 1.0\n", + "\n", + "for p in [DATA_DIR, RAW_DIR, EXTRACT_DIR]:\n", + " p.mkdir(parents=True, exist_ok=True)\n", + "\n", + "print('Data directory:', DATA_DIR.resolve())\n", + "print('Download files:', DOWNLOAD_FILES)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": {}, + "outputs": [], + "source": [ + "STATE_NAME_TO_CODE = {\n", + " 'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',\n", + " 'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',\n", + " 'district of columbia': 'DC', 'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI',\n", + " 'idaho': 'ID', 'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA',\n", + " 'kansas': 'KS', 'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME',\n", + " 'maryland': 'MD', 'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN',\n", + " 'mississippi': 'MS', 'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE',\n", + " 'nevada': 'NV', 'new hampshire': 'NH', 'new jersey': 'NJ', 'new mexico': 'NM',\n", + " 'new york': 'NY', 'north carolina': 'NC', 'north dakota': 'ND', 'ohio': 'OH',\n", + " 'oklahoma': 'OK', 'oregon': 'OR', 'pennsylvania': 'PA', 'rhode island': 'RI',\n", + " 'south carolina': 'SC', 'south dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX',\n", + " 'utah': 'UT', 'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA',\n", + " 'west virginia': 'WV', 'wisconsin': 'WI', 'wyoming': 'WY',\n", + "}\n", + "STATE_CODE_TO_NAME = {v: k for k, v in STATE_NAME_TO_CODE.items()}\n", + "\n", + "\n", + "def normalize_state_code(value):\n", + " if pd.isna(value) or str(value).strip() == '':\n", + " return None\n", + " raw = str(value).strip()\n", + " upper = raw.upper()\n", + " if upper in STATE_CODE_TO_NAME:\n", + " return upper\n", + " return STATE_NAME_TO_CODE.get(raw.lower())\n", + "\n", + "\n", + "def infer_target_states():\n", + " if TARGET_STATES:\n", + " return sorted({normalize_state_code(s) for s in TARGET_STATES if normalize_state_code(s)})\n", + " query = f'''\n", + " select state, count(*) as rows\n", + " from {MASTER_TABLE}\n", + " where geom is not null\n", + " group by state\n", + " order by state\n", + " '''\n", + " with get_conn() as conn:\n", + " states = pd.read_sql_query(query, conn)\n", + " states['state_code'] = states['state'].map(normalize_state_code)\n", + " display(states)\n", + " missing = states.loc[states['state_code'].isna() & states['state'].notna()]\n", + " if not missing.empty:\n", + " print('Warning: could not normalize these state values:')\n", + " display(missing)\n", + " return sorted(states['state_code'].dropna().unique())\n", + "\n", + "\n", + "target_states = infer_target_states()\n", + "print('Target state codes:', ', '.join(target_states))\n" + ] + }, + { + "cell_type": "markdown", + "id": "6", + "metadata": {}, + "source": [ + "## RDH Credentials" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": {}, + "outputs": [], + "source": [ + "RDH_USERNAME = os.getenv('RDH_USERNAME') or os.getenv('RDH_EMAIL')\n", + "RDH_PASSWORD = os.getenv('RDH_PASSWORD')\n", + "\n", + "if not RDH_USERNAME:\n", + " RDH_USERNAME = input('RDH username or email: ').strip()\n", + "if not RDH_PASSWORD:\n", + " RDH_PASSWORD = getpass('RDH password: ')\n", + "\n", + "if not RDH_USERNAME or not RDH_PASSWORD:\n", + " raise RuntimeError('RDH credentials are required. Use prompts or set RDH_USERNAME/RDH_PASSWORD.')\n", + "\n", + "print('RDH username loaded:', RDH_USERNAME)\n", + "print('RDH password loaded:', bool(RDH_PASSWORD))\n" + ] + }, + { + "cell_type": "markdown", + "id": "8", + "metadata": {}, + "source": [ + "## Discover RDH Datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": {}, + "outputs": [], + "source": [ + "def get_rdh_list_for_state(state_code):\n", + " params = {\n", + " 'username': RDH_USERNAME,\n", + " 'password': RDH_PASSWORD,\n", + " 'format': 'csv',\n", + " 'states': STATE_CODE_TO_NAME.get(state_code, state_code).lower(),\n", + " }\n", + " response = requests.get(RDH_LIST_URL, params=params, timeout=120)\n", + " response.raise_for_status()\n", + " text = response.content.decode('utf-8', errors='replace')\n", + " df = pd.read_csv(io.StringIO(text))\n", + " df['query_state_code'] = state_code\n", + " df['query_state_name'] = STATE_CODE_TO_NAME.get(state_code, state_code)\n", + " time.sleep(REQUEST_SLEEP_SECONDS)\n", + " return df\n", + "\n", + "\n", + "def load_or_fetch_rdh_listing(refresh=False):\n", + " cached = None\n", + " if LISTING_CACHE_PATH.exists() and not refresh:\n", + " cached = pd.read_csv(LISTING_CACHE_PATH)\n", + " cached_states = set(cached.get('query_state_code', pd.Series(dtype=str)).dropna().unique())\n", + " missing = [s for s in target_states if s not in cached_states]\n", + " print(f'Cache has {len(cached):,} rows across {len(cached_states)} state(s); missing: {missing or \"none\"}')\n", + " else:\n", + " cached_states = set()\n", + " missing = list(target_states)\n", + "\n", + " if missing:\n", + " frames = [] if cached is None else [cached]\n", + " for state_code in missing:\n", + " print('Retrieving RDH listing for', state_code)\n", + " frames.append(get_rdh_list_for_state(state_code))\n", + " cached = pd.concat(frames, ignore_index=True)\n", + " LISTING_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)\n", + " cached.to_csv(LISTING_CACHE_PATH, index=False)\n", + " print('Updated listing cache:', LISTING_CACHE_PATH)\n", + "\n", + " if cached is not None and 'query_state_code' in cached.columns:\n", + " cached = cached[cached['query_state_code'].isin(target_states)].copy()\n", + " return cached if cached is not None else pd.DataFrame()\n", + "\n", + "\n", + "listing = load_or_fetch_rdh_listing(refresh=False)\n", + "print(f'RDH listing rows for target states: {len(listing):,}')\n", + "display(listing.head())\n", + "display(pd.DataFrame({'column': listing.columns}))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10", + "metadata": {}, + "outputs": [], + "source": [ + "def text_contains_all(text, terms):\n", + " text = str(text).lower()\n", + " return all(term.lower() in text for term in terms if term)\n", + "\n", + "\n", + "def text_contains_any(text, terms):\n", + " terms = [term for term in terms if term]\n", + " if not terms:\n", + " return True\n", + " text = str(text).lower()\n", + " return any(term.lower() in text for term in terms)\n", + "\n", + "\n", + "def row_year_match(text):\n", + " if not FILTER_YEARS_ANY:\n", + " return True\n", + " text = str(text)\n", + " return any(str(year) in text for year in FILTER_YEARS_ANY)\n", + "\n", + "\n", + "def parse_datasetid(url):\n", + " if pd.isna(url):\n", + " return None\n", + " parsed = urlparse(str(url))\n", + " qs = parse_qs(parsed.query)\n", + " values = qs.get('datasetid')\n", + " return values[0] if values else None\n", + "\n", + "\n", + "def clean_filename_from_url(url, fmt):\n", + " base = unquote(str(url).split('?')[0]).rstrip('/')\n", + " name = base.split('/')[-1]\n", + " if not name:\n", + " name = 'rdh_download.zip'\n", + " suffix = Path(name).suffix\n", + " if not suffix:\n", + " suffix = '.zip' if str(fmt).upper() == 'SHP' else ''\n", + " name = name + suffix\n", + " return re.sub(r'[^A-Za-z0-9._-]+', '_', name)\n", + "\n", + "\n", + "work = listing.copy()\n", + "for required in ['Title', 'Format', 'URL']:\n", + " if required not in work.columns:\n", + " raise RuntimeError(f'RDH listing is missing expected column: {required}')\n", + "\n", + "work['title_format'] = work['Title'].fillna('').astype(str) + ' ' + work['Format'].fillna('').astype(str)\n", + "work['datasetid'] = work['URL'].map(parse_datasetid)\n", + "work['format_norm'] = work['Format'].fillna('').astype(str).str.upper()\n", + "work['filename'] = work.apply(lambda r: clean_filename_from_url(r['URL'], r['Format']), axis=1)\n", + "\n", + "filtered = work[\n", + " work['title_format'].map(lambda x: text_contains_all(x, FILTER_TERMS_ALL))\n", + " & work['title_format'].map(lambda x: text_contains_any(x, FILTER_TERMS_ANY))\n", + " & work['title_format'].map(row_year_match)\n", + " & work['format_norm'].isin(PREFERRED_FORMATS)\n", + "].copy()\n", + "\n", + "filtered = filtered.sort_values(['query_state_code', 'Title', 'Format', 'filename']).reset_index(drop=True)\n", + "print(f'Filtered candidate files: {len(filtered):,}')\n", + "display(filtered[['query_state_code', 'Title', 'Format', 'datasetid', 'filename', 'URL']].head(100))\n" + ] + }, + { + "cell_type": "markdown", + "id": "11", + "metadata": {}, + "source": [ + "## Download Candidate Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": {}, + "outputs": [], + "source": [ + "def layer_id_for_row(row):\n", + " key = '|'.join([\n", + " str(row.get('query_state_code', '')),\n", + " str(row.get('Title', '')),\n", + " str(row.get('Format', '')),\n", + " str(row.get('datasetid', '')),\n", + " str(row.get('URL', '')),\n", + " ])\n", + " return hashlib.sha1(key.encode('utf-8')).hexdigest()[:16]\n", + "\n", + "\n", + "def download_rdh_file(row):\n", + " base_url = str(row['URL']).split('?')[0]\n", + " params = {\n", + " 'username': RDH_USERNAME,\n", + " 'password': RDH_PASSWORD,\n", + " }\n", + " if row.get('datasetid'):\n", + " params['datasetid'] = row['datasetid']\n", + "\n", + " state_dir = RAW_DIR / row['query_state_code']\n", + " state_dir.mkdir(parents=True, exist_ok=True)\n", + " out_path = state_dir / row['filename']\n", + " if out_path.exists() and not OVERWRITE_DOWNLOADS:\n", + " return out_path, 'exists'\n", + "\n", + " response = requests.get(base_url, params=params, timeout=300)\n", + " response.raise_for_status()\n", + " out_path.write_bytes(response.content)\n", + " time.sleep(REQUEST_SLEEP_SECONDS)\n", + " return out_path, 'downloaded'\n", + "\n", + "\n", + "download_rows = []\n", + "for row in filtered.to_dict('records'):\n", + " layer_id = layer_id_for_row(row)\n", + " out_path = RAW_DIR / row['query_state_code'] / row['filename']\n", + " status = 'planned'\n", + " if DOWNLOAD_FILES:\n", + " print('Retrieving', row['query_state_code'], row['Title'], row['filename'])\n", + " out_path, status = download_rdh_file(row)\n", + " elif out_path.exists():\n", + " status = 'exists'\n", + " download_rows.append({\n", + " 'layer_id': layer_id,\n", + " 'state_code': row['query_state_code'],\n", + " 'title': row['Title'],\n", + " 'format': row['Format'],\n", + " 'datasetid': row.get('datasetid'),\n", + " 'source_url': row['URL'],\n", + " 'filename': row['filename'],\n", + " 'local_path': str(out_path),\n", + " 'download_status': status,\n", + " })\n", + "\n", + "manifest = pd.DataFrame(download_rows)\n", + "manifest.to_csv(MANIFEST_PATH, index=False)\n", + "print('Wrote manifest:', MANIFEST_PATH)\n", + "display(manifest.head(100))\n", + "display(manifest['download_status'].value_counts(dropna=False).rename_axis('status').reset_index(name='rows'))\n", + "\n", + "if not DOWNLOAD_FILES and not manifest['local_path'].map(lambda p: Path(str(p)).exists()).any():\n", + " print('No RDH shapefiles have been downloaded yet. Review the candidates above, then set DOWNLOAD_FILES=True and rerun this cell and the cells below.')\n" + ] + }, + { + "cell_type": "markdown", + "id": "13", + "metadata": {}, + "source": [ + "## Unpack and Find Spatial Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_archive(path, layer_id):\n", + " path = Path(path)\n", + " extract_to = EXTRACT_DIR / layer_id\n", + " extract_to.mkdir(parents=True, exist_ok=True)\n", + " if path.suffix.lower() == '.zip':\n", + " with zipfile.ZipFile(path) as zf:\n", + " zf.extractall(extract_to)\n", + " else:\n", + " shutil.copy2(path, extract_to / path.name)\n", + " return extract_to\n", + "\n", + "\n", + "def find_spatial_file(folder):\n", + " folder = Path(folder)\n", + " candidates = (\n", + " list(folder.rglob('*.shp')) +\n", + " list(folder.rglob('*.geojson')) +\n", + " list(folder.rglob('*.gpkg'))\n", + " )\n", + " if not candidates:\n", + " return None\n", + " candidates = sorted(candidates, key=lambda p: (p.suffix.lower() != '.shp', len(str(p))))\n", + " return candidates[0]\n", + "\n", + "\n", + "if MANIFEST_PATH.exists():\n", + " manifest = pd.read_csv(MANIFEST_PATH)\n", + "\n", + "spatial_rows = []\n", + "for row in manifest.to_dict('records'):\n", + " local_path = Path(str(row['local_path']))\n", + " if not local_path.exists():\n", + " spatial_rows.append({**row, 'extract_dir': None, 'spatial_path': None, 'spatial_status': 'missing_download'})\n", + " continue\n", + " extract_dir = extract_archive(local_path, row['layer_id'])\n", + " spatial_path = find_spatial_file(extract_dir)\n", + " spatial_rows.append({\n", + " **row,\n", + " 'extract_dir': str(extract_dir),\n", + " 'spatial_path': str(spatial_path) if spatial_path else None,\n", + " 'spatial_status': 'found' if spatial_path else 'no_spatial_file',\n", + " })\n", + "\n", + "spatial_manifest = pd.DataFrame(spatial_rows)\n", + "spatial_manifest.to_csv(DATA_DIR / 'rdh_precinct_vote_spatial_manifest.csv', index=False)\n", + "display(spatial_manifest[['state_code', 'title', 'filename', 'spatial_status', 'spatial_path']].head(100))\n", + "display(spatial_manifest['spatial_status'].value_counts(dropna=False).rename_axis('status').reset_index(name='rows'))\n", + "\n", + "if not spatial_manifest['spatial_status'].eq('found').any():\n", + " print('No spatial files were found. If spatial_status is missing_download, set DOWNLOAD_FILES=True and rerun the download/unpack cells.')\n" + ] + }, + { + "cell_type": "markdown", + "id": "15", + "metadata": {}, + "source": [ + "## Create PostGIS Staging Tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": {}, + "outputs": [], + "source": [ + "CREATE_TABLES_SQL = f'''\n", + "create table if not exists {LAYER_TABLE} (\n", + " layer_id text primary key,\n", + " state_code text,\n", + " title text,\n", + " format text,\n", + " datasetid text,\n", + " source_url text,\n", + " filename text,\n", + " local_path text,\n", + " spatial_path text,\n", + " metadata jsonb,\n", + " loaded_at timestamptz not null default now()\n", + ");\n", + "\n", + "create table if not exists {FEATURE_TABLE} (\n", + " feature_id text primary key,\n", + " layer_id text not null references public.rdh_precinct_vote_layers(layer_id) on delete cascade,\n", + " state_code text,\n", + " source_row integer,\n", + " properties jsonb,\n", + " geom geometry(MultiPolygon, 4326)\n", + ");\n", + "\n", + "create index if not exists rdh_precinct_vote_features_geom_gix\n", + " on {FEATURE_TABLE} using gist (geom);\n", + "create index if not exists rdh_precinct_vote_features_layer_idx\n", + " on {FEATURE_TABLE} (layer_id);\n", + "create index if not exists rdh_precinct_vote_features_state_idx\n", + " on {FEATURE_TABLE} (state_code);\n", + "\n", + "create table if not exists {MATCH_TABLE} (\n", + " master_id text not null references public.master_data_centers(master_id) on delete cascade,\n", + " feature_id text not null references public.rdh_precinct_vote_features(feature_id) on delete cascade,\n", + " layer_id text not null references public.rdh_precinct_vote_layers(layer_id) on delete cascade,\n", + " state_code text,\n", + " join_method text not null,\n", + " match_distance_m double precision,\n", + " matched_at timestamptz not null default now(),\n", + " primary key (master_id, feature_id)\n", + ");\n", + "create index if not exists data_center_rdh_precinct_vote_matches_master_idx\n", + " on {MATCH_TABLE} (master_id);\n", + "create index if not exists data_center_rdh_precinct_vote_matches_layer_idx\n", + " on {MATCH_TABLE} (layer_id);\n", + "'''\n", + "\n", + "with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute(CREATE_TABLES_SQL)\n", + " print('PostGIS staging tables are ready.')\n" + ] + }, + { + "cell_type": "markdown", + "id": "17", + "metadata": {}, + "source": [ + "## Load Precinct Geometries to PostGIS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18", + "metadata": {}, + "outputs": [], + "source": [ + "def json_safe(value):\n", + " if isinstance(value, (pd.Timestamp, np.datetime64)):\n", + " return str(value)\n", + " if isinstance(value, np.generic):\n", + " return value.item()\n", + " if pd.isna(value):\n", + " return None\n", + " return value\n", + "\n", + "\n", + "def stable_feature_id(layer_id, source_row, geom):\n", + " geom_hash = hashlib.sha1(geom.wkb).hexdigest()[:16] if geom is not None else 'nogeom'\n", + " return hashlib.sha1(f'{layer_id}|{source_row}|{geom_hash}'.encode('utf-8')).hexdigest()[:24]\n", + "\n", + "\n", + "def load_layer_to_postgis_with_ogr(row):\n", + " spatial_path = row.get('spatial_path')\n", + " if not spatial_path or not Path(spatial_path).exists():\n", + " return {'layer_id': row['layer_id'], 'status': 'missing_spatial_file', 'features': 0}\n", + "\n", + " ogr2ogr_path = shutil.which('ogr2ogr')\n", + " if not ogr2ogr_path:\n", + " return {'layer_id': row['layer_id'], 'status': 'missing_geopandas_and_ogr2ogr', 'features': 0}\n", + "\n", + " staging_table = '_rdh_import_' + re.sub(r'[^a-zA-Z0-9_]', '_', row['layer_id']).lower()\n", + " dsn = 'PG:host={host} port={port} user={user} dbname={dbname}'.format(\n", + " host=os.environ['PGWEB_HOST'],\n", + " port=os.environ['PGWEB_PORT'],\n", + " user=os.environ['PGWEB_USER'],\n", + " dbname=DB_NAME,\n", + " )\n", + " env = os.environ.copy()\n", + " env['PGPASSWORD'] = os.environ['PGWEB_PASSWORD']\n", + " cmd = [\n", + " ogr2ogr_path,\n", + " '-f', 'PostgreSQL',\n", + " dsn,\n", + " str(spatial_path),\n", + " '-nln', f'public.{staging_table}',\n", + " '-overwrite',\n", + " '-t_srs', 'EPSG:4326',\n", + " '-nlt', 'PROMOTE_TO_MULTI',\n", + " '-dim', 'XY',\n", + " '-lco', 'GEOMETRY_NAME=geom',\n", + " ]\n", + " result = subprocess.run(cmd, env=env, capture_output=True, text=True)\n", + " if result.returncode != 0:\n", + " return {\n", + " 'layer_id': row['layer_id'],\n", + " 'status': 'ogr2ogr_failed',\n", + " 'features': 0,\n", + " 'error': (result.stderr or result.stdout)[-1000:],\n", + " }\n", + "\n", + " with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute('''\n", + " select column_name\n", + " from information_schema.columns\n", + " where table_schema = 'public' and table_name = %s\n", + " order by ordinal_position\n", + " ''', (staging_table,))\n", + " columns = [r[0] for r in cur.fetchall()]\n", + " geom_col = 'geom' if 'geom' in columns else 'wkb_geometry'\n", + " order_col = 'ogc_fid' if 'ogc_fid' in columns else 'ctid'\n", + "\n", + " metadata = {\n", + " 'columns': columns,\n", + " 'source_spatial_path': str(spatial_path),\n", + " 'loader': 'ogr2ogr',\n", + " }\n", + " cur.execute(sql.SQL('''\n", + " insert into {layers} (\n", + " layer_id, state_code, title, format, datasetid, source_url,\n", + " filename, local_path, spatial_path, metadata, loaded_at\n", + " ) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, now())\n", + " on conflict (layer_id) do update set\n", + " state_code = excluded.state_code,\n", + " title = excluded.title,\n", + " format = excluded.format,\n", + " datasetid = excluded.datasetid,\n", + " source_url = excluded.source_url,\n", + " filename = excluded.filename,\n", + " local_path = excluded.local_path,\n", + " spatial_path = excluded.spatial_path,\n", + " metadata = excluded.metadata,\n", + " loaded_at = now()\n", + " ''').format(layers=sql.SQL(LAYER_TABLE)), (\n", + " row['layer_id'], row['state_code'], row['title'], row['format'],\n", + " row.get('datasetid'), row['source_url'], row['filename'],\n", + " row['local_path'], spatial_path, Json(metadata)\n", + " ))\n", + " cur.execute(sql.SQL('delete from {} where layer_id = %s').format(sql.SQL(FEATURE_TABLE)), (row['layer_id'],))\n", + " insert_sql = sql.SQL('''\n", + " insert into {features} (feature_id, layer_id, state_code, source_row, properties, geom)\n", + " with src as (\n", + " select\n", + " row_number() over (order by {order_col})::integer as source_row,\n", + " to_jsonb(t) - %s - 'ogc_fid' - 'fid' as properties,\n", + " ST_Force2D({geom_col}) as geom\n", + " from {staging} t\n", + " where {geom_col} is not null\n", + " ), fixed as (\n", + " select\n", + " source_row,\n", + " properties,\n", + " (ST_Dump(ST_Multi(ST_CollectionExtract(ST_MakeValid(geom), 3)))).geom as geom\n", + " from src\n", + " )\n", + " select\n", + " %s || '/' || source_row::text,\n", + " %s,\n", + " %s,\n", + " source_row,\n", + " properties,\n", + " ST_Multi(ST_Force2D(ST_SetSRID(geom, 4326)))::geometry(MultiPolygon, 4326)\n", + " from fixed\n", + " where geom is not null and not ST_IsEmpty(geom)\n", + " ''').format(\n", + " features=sql.SQL(FEATURE_TABLE),\n", + " staging=sql.Identifier('public', staging_table),\n", + " geom_col=sql.Identifier(geom_col),\n", + " order_col=sql.SQL(order_col) if order_col == 'ctid' else sql.Identifier(order_col),\n", + " )\n", + " cur.execute(insert_sql, (geom_col, row['layer_id'], row['layer_id'], row['state_code']))\n", + " inserted = cur.rowcount\n", + " cur.execute(sql.SQL('drop table if exists {}').format(sql.Identifier('public', staging_table)))\n", + " cur.execute(sql.SQL('analyze {}').format(sql.SQL(FEATURE_TABLE)))\n", + " return {'layer_id': row['layer_id'], 'status': 'loaded_ogr2ogr', 'features': inserted}\n", + "\n", + "\n", + "def load_layer_to_postgis(row):\n", + " if not HAS_GEOPANDAS:\n", + " return load_layer_to_postgis_with_ogr(row)\n", + " spatial_path = row.get('spatial_path')\n", + " if not spatial_path or not Path(spatial_path).exists():\n", + " return {'layer_id': row['layer_id'], 'status': 'missing_spatial_file', 'features': 0}\n", + "\n", + " gdf = gpd.read_file(spatial_path)\n", + " if gdf.empty:\n", + " return {'layer_id': row['layer_id'], 'status': 'empty_file', 'features': 0}\n", + " if gdf.crs is None:\n", + " print(f'Warning: {spatial_path} has no CRS; assuming EPSG:4326.')\n", + " gdf = gdf.set_crs(4326)\n", + " else:\n", + " gdf = gdf.to_crs(4326)\n", + "\n", + " gdf = gdf[gdf.geometry.notna()].copy()\n", + " gdf = gdf[~gdf.geometry.is_empty].copy()\n", + " if gdf.empty:\n", + " return {'layer_id': row['layer_id'], 'status': 'no_valid_geometry', 'features': 0}\n", + "\n", + " layer_metadata = {\n", + " 'columns': [str(c) for c in gdf.columns],\n", + " 'row_count': int(len(gdf)),\n", + " 'source_spatial_path': spatial_path,\n", + " }\n", + "\n", + " feature_rows = []\n", + " prop_cols = [c for c in gdf.columns if c != gdf.geometry.name]\n", + " for source_row, (_, record) in enumerate(gdf.iterrows(), start=1):\n", + " geom = record.geometry\n", + " if geom is None or geom.is_empty:\n", + " continue\n", + " properties = {str(col): json_safe(record[col]) for col in prop_cols}\n", + " feature_rows.append((\n", + " stable_feature_id(row['layer_id'], source_row, geom),\n", + " row['layer_id'],\n", + " row['state_code'],\n", + " source_row,\n", + " Json(properties),\n", + " json.dumps(geom.__geo_interface__),\n", + " ))\n", + "\n", + " with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute(sql.SQL('''\n", + " insert into {layers} (\n", + " layer_id, state_code, title, format, datasetid, source_url,\n", + " filename, local_path, spatial_path, metadata, loaded_at\n", + " ) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, now())\n", + " on conflict (layer_id) do update set\n", + " state_code = excluded.state_code,\n", + " title = excluded.title,\n", + " format = excluded.format,\n", + " datasetid = excluded.datasetid,\n", + " source_url = excluded.source_url,\n", + " filename = excluded.filename,\n", + " local_path = excluded.local_path,\n", + " spatial_path = excluded.spatial_path,\n", + " metadata = excluded.metadata,\n", + " loaded_at = now()\n", + " ''').format(layers=sql.SQL(LAYER_TABLE)), (\n", + " row['layer_id'], row['state_code'], row['title'], row['format'],\n", + " row.get('datasetid'), row['source_url'], row['filename'],\n", + " row['local_path'], spatial_path, Json(layer_metadata)\n", + " ))\n", + " cur.execute(sql.SQL('delete from {} where layer_id = %s').format(sql.SQL(FEATURE_TABLE)), (row['layer_id'],))\n", + " if feature_rows:\n", + " execute_values(\n", + " cur,\n", + " sql.SQL('''\n", + " insert into {features} (\n", + " feature_id, layer_id, state_code, source_row, properties, geom\n", + " ) values %s\n", + " ''').format(features=sql.SQL(FEATURE_TABLE)).as_string(conn),\n", + " feature_rows,\n", + " template='(%s, %s, %s, %s, %s, ST_Multi(ST_CollectionExtract(ST_MakeValid(ST_Force2D(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326))), 3)))',\n", + " page_size=1000,\n", + " )\n", + " cur.execute(sql.SQL('analyze {}').format(sql.SQL(FEATURE_TABLE)))\n", + " return {'layer_id': row['layer_id'], 'status': 'loaded', 'features': len(feature_rows)}\n", + "\n", + "\n", + "SKIP_ALREADY_LOADED_LAYERS = True\n", + "\n", + "load_results = []\n", + "if LOAD_TO_POSTGIS:\n", + " found = spatial_manifest[spatial_manifest['spatial_status'].eq('found')].copy()\n", + " already_loaded = set()\n", + " if SKIP_ALREADY_LOADED_LAYERS:\n", + " with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute(f'select layer_id from {LAYER_TABLE}')\n", + " already_loaded = {r[0] for r in cur.fetchall()}\n", + " print(f'Skipping {len(already_loaded)} layer(s) already in PostGIS')\n", + " for row in found.to_dict('records'):\n", + " if row['layer_id'] in already_loaded:\n", + " load_results.append({'layer_id': row['layer_id'], 'status': 'already_loaded', 'features': None})\n", + " continue\n", + " print('Loading layer:', row['state_code'], row['title'])\n", + " load_results.append(load_layer_to_postgis(row))\n", + "else:\n", + " print('LOAD_TO_POSTGIS=False; skipping spatial file load.')\n", + "\n", + "load_results = pd.DataFrame(load_results)\n", + "display(load_results)\n" + ] + }, + { + "cell_type": "markdown", + "id": "19", + "metadata": {}, + "source": [ + "## Match Data Centers to Precinct Layers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20", + "metadata": {}, + "outputs": [], + "source": [ + "MATCH_SQL = f'''\n", + "insert into {MATCH_TABLE} (\n", + " master_id, feature_id, layer_id, state_code, join_method, match_distance_m, matched_at\n", + ")\n", + "select\n", + " m.master_id,\n", + " f.feature_id,\n", + " f.layer_id,\n", + " f.state_code,\n", + " 'point_in_precinct' as join_method,\n", + " 0.0 as match_distance_m,\n", + " now()\n", + "from {MASTER_TABLE} m\n", + "join {FEATURE_TABLE} f\n", + " on m.geom && f.geom\n", + " and ST_Covers(f.geom, m.geom)\n", + "where m.geom is not null\n", + "on conflict (master_id, feature_id) do update set\n", + " join_method = excluded.join_method,\n", + " match_distance_m = excluded.match_distance_m,\n", + " matched_at = now()\n", + "'''\n", + "\n", + "if RUN_SPATIAL_MATCH:\n", + " with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute(MATCH_SQL)\n", + " print('Inserted/updated matches:', cur.rowcount)\n", + " cur.execute(sql.SQL('analyze {}').format(sql.SQL(MATCH_TABLE)))\n", + "else:\n", + " print('RUN_SPATIAL_MATCH=False; skipping point-in-precinct matching.')\n" + ] + }, + { + "cell_type": "markdown", + "id": "21", + "metadata": {}, + "source": [ + "## Match Diagnostics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22", + "metadata": {}, + "outputs": [], + "source": [ + "diagnostics_sql = f'''\n", + "with dc as (\n", + " select state, count(*) as data_centers, ST_Extent(geom) as dc_extent\n", + " from {MASTER_TABLE}\n", + " where geom is not null\n", + " group by state\n", + "), feats as (\n", + " select state_code as state, count(*) as precinct_features, ST_Extent(geom) as precinct_extent\n", + " from {FEATURE_TABLE}\n", + " where geom is not null\n", + " group by state_code\n", + "), matches as (\n", + " select state_code as state, count(distinct master_id) as matched_data_centers\n", + " from {MATCH_TABLE}\n", + " group by state_code\n", + ")\n", + "select\n", + " coalesce(dc.state, feats.state, matches.state) as state,\n", + " coalesce(dc.data_centers, 0) as data_centers,\n", + " coalesce(feats.precinct_features, 0) as precinct_features,\n", + " coalesce(matches.matched_data_centers, 0) as matched_data_centers,\n", + " dc.dc_extent::text as data_center_extent,\n", + " feats.precinct_extent::text as precinct_extent\n", + "from dc\n", + "full join feats on feats.state = dc.state\n", + "full join matches on matches.state = coalesce(dc.state, feats.state)\n", + "order by state\n", + "'''\n", + "\n", + "nearest_sql = f'''\n", + "select\n", + " m.master_id,\n", + " m.name,\n", + " m.city,\n", + " m.state,\n", + " nearest.layer_id,\n", + " nearest.feature_id,\n", + " nearest.distance_m\n", + "from {MASTER_TABLE} m\n", + "left join {MATCH_TABLE} matched on matched.master_id = m.master_id\n", + "left join lateral (\n", + " select\n", + " f.layer_id,\n", + " f.feature_id,\n", + " ST_Distance(m.geom::geography, f.geom::geography) as distance_m\n", + " from {FEATURE_TABLE} f\n", + " where f.geom is not null\n", + " order by m.geom <-> f.geom\n", + " limit 1\n", + ") nearest on true\n", + "where m.geom is not null\n", + " and matched.master_id is null\n", + "order by nearest.distance_m nulls last\n", + "limit 50\n", + "'''\n", + "\n", + "with get_conn() as conn:\n", + " diagnostics = pd.read_sql_query(diagnostics_sql, conn)\n", + " nearest_unmatched = pd.read_sql_query(nearest_sql, conn)\n", + "\n", + "display(diagnostics)\n", + "display(nearest_unmatched)\n", + "\n", + "if diagnostics['precinct_features'].sum() == 0:\n", + " print('PostGIS has zero RDH precinct features loaded. Run the download, unpack, and load cells before matching.')\n", + "elif diagnostics['matched_data_centers'].sum() == 0:\n", + " print('Precinct features exist, but no point-in-polygon matches were found. Check the extents and nearest unmatched distances above.')\n" + ] + }, + { + "cell_type": "markdown", + "id": "23", + "metadata": {}, + "source": [ + "## Optional Nearest-Precinct Fallback" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24", + "metadata": {}, + "outputs": [], + "source": [ + "NEAREST_FALLBACK_SQL = f'''\n", + "insert into {MATCH_TABLE} (\n", + " master_id, feature_id, layer_id, state_code, join_method, match_distance_m, matched_at\n", + ")\n", + "with candidates as (\n", + " select m.master_id, m.geom, upper(m.state) as state_code\n", + " from {MASTER_TABLE} m\n", + " where m.geom is not null\n", + " and m.state is not null\n", + " and upper(m.state) in (select distinct state_code from {FEATURE_TABLE} where geom is not null)\n", + " and not exists (\n", + " select 1 from {MATCH_TABLE} existing where existing.master_id = m.master_id\n", + " )\n", + ")\n", + "select\n", + " c.master_id,\n", + " nearest.feature_id,\n", + " nearest.layer_id,\n", + " nearest.state_code,\n", + " 'nearest_precinct_within_threshold' as join_method,\n", + " nearest.distance_m,\n", + " now()\n", + "from candidates c\n", + "join lateral (\n", + " select\n", + " f.feature_id,\n", + " f.layer_id,\n", + " f.state_code,\n", + " ST_Distance(c.geom::geography, f.geom::geography) as distance_m\n", + " from {FEATURE_TABLE} f\n", + " where f.geom is not null\n", + " and f.state_code = c.state_code\n", + " order by c.geom <-> f.geom\n", + " limit 1\n", + ") nearest on true\n", + "where nearest.distance_m <= %s\n", + "on conflict (master_id, feature_id) do update set\n", + " join_method = excluded.join_method,\n", + " match_distance_m = excluded.match_distance_m,\n", + " matched_at = now()\n", + "'''\n", + "\n", + "if RUN_SPATIAL_MATCH and RUN_NEAREST_PRECINCT_FALLBACK:\n", + " with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute(NEAREST_FALLBACK_SQL, (NEAREST_PRECINCT_MAX_DISTANCE_M,))\n", + " print('Inserted/updated nearest fallback matches:', cur.rowcount)\n", + " cur.execute(sql.SQL('analyze {}').format(sql.SQL(MATCH_TABLE)))\n", + "else:\n", + " print('Nearest fallback disabled.')\n" + ] + }, + { + "cell_type": "markdown", + "id": "25", + "metadata": {}, + "source": [ + "## Inspect Matches and Census-Tract Context" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26", + "metadata": {}, + "outputs": [], + "source": [ + "match_summary_sql = f'''\n", + "select\n", + " l.state_code,\n", + " l.title,\n", + " count(distinct f.feature_id) as precinct_features,\n", + " count(distinct m.master_id) as matched_data_centers\n", + "from {LAYER_TABLE} l\n", + "left join {FEATURE_TABLE} f on f.layer_id = l.layer_id\n", + "left join {MATCH_TABLE} m on m.feature_id = f.feature_id\n", + "group by l.state_code, l.title\n", + "order by l.state_code, matched_data_centers desc, l.title\n", + "'''\n", + "\n", + "sample_sql = f'''\n", + "select\n", + " m.master_id,\n", + " dc.name,\n", + " dc.city,\n", + " dc.state,\n", + " l.title as rdh_layer_title,\n", + " f.properties as precinct_properties\n", + "from {MATCH_TABLE} m\n", + "join {MASTER_TABLE} dc on dc.master_id = m.master_id\n", + "join {FEATURE_TABLE} f on f.feature_id = m.feature_id\n", + "join {LAYER_TABLE} l on l.layer_id = m.layer_id\n", + "order by dc.state, dc.city, dc.name\n", + "limit 25\n", + "'''\n", + "\n", + "with get_conn() as conn:\n", + " match_summary = pd.read_sql_query(match_summary_sql, conn)\n", + " sample_matches = pd.read_sql_query(sample_sql, conn)\n", + "\n", + "display(match_summary)\n", + "display(sample_matches)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27", + "metadata": {}, + "outputs": [], + "source": [ + "with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute('select to_regclass(%s)', (CENSUS_TRACT_TABLE,))\n", + " has_census_tract_table = cur.fetchone()[0] is not None\n", + "\n", + "if has_census_tract_table:\n", + " tract_context_sql = f'''\n", + " select\n", + " dc.master_id,\n", + " dc.name,\n", + " dc.city,\n", + " dc.state,\n", + " dc.geoid as census_tract_geoid,\n", + " count(pm.feature_id) as precinct_layer_matches\n", + " from {MASTER_TABLE} dc\n", + " left join {CENSUS_TRACT_TABLE} ct on ct.geoid = dc.geoid\n", + " left join {MATCH_TABLE} pm on pm.master_id = dc.master_id\n", + " group by dc.master_id, dc.name, dc.city, dc.state, ct.geoid\n", + " order by precinct_layer_matches asc, dc.state, dc.city\n", + " limit 50\n", + " '''\n", + " with get_conn() as conn:\n", + " tract_context = pd.read_sql_query(tract_context_sql, conn)\n", + " display(tract_context)\n", + "else:\n", + " print(f'{CENSUS_TRACT_TABLE} not found; census tract fallback/context skipped.')\n" + ] + }, + { + "cell_type": "markdown", + "id": "28", + "metadata": {}, + "source": [ + "## Next Refinement: Tidy Vote Columns\n", + "\n", + "The RDH staging table intentionally stores each precinct row's original attributes in `properties jsonb`. Once the downloaded layers are visible, inspect `precinct_properties` above to identify vote-column patterns for the states/years you care about.\n", + "\n", + "Useful follow-up views can then extract fields like:\n", + "- precinct identifier/name\n", + "- election year\n", + "- office\n", + "- Democratic votes\n", + "- Republican votes\n", + "- total votes\n", + "- turnout or vote share\n", + "\n", + "That extraction is best added after confirming the specific RDH file families selected by the filters.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}