From 8a1a0b9aff0eb48424429a4cbb7057b39566cad2 Mon Sep 17 00:00:00 2001 From: dadams Date: Tue, 19 May 2026 22:28:23 -0700 Subject: [PATCH] open meteao data --- open_meteo_historical_data_centers.ipynb | 753 +++++++++++++++++++++++ 1 file changed, 753 insertions(+) create mode 100644 open_meteo_historical_data_centers.ipynb diff --git a/open_meteo_historical_data_centers.ipynb b/open_meteo_historical_data_centers.ipynb new file mode 100644 index 0000000..310915b --- /dev/null +++ b/open_meteo_historical_data_centers.ipynb @@ -0,0 +1,753 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": {}, + "source": [ + "# Open-Meteo Historical Climate for Master Data Centers\n", + "\n", + "This notebook builds or refreshes `public.data_center_open_meteo_historical_climate`, a one-row-per-data-center climate summary table keyed by `master_id`.\n", + "\n", + "Design goals:\n", + "- Joinable to `public.master_data_centers` through `master_id`.\n", + "- Rerunnable as new data centers are added to the master table.\n", + "- Uses the same `.env` and `PGWEB_*` connection pattern as `spatial_clustering_master_data_centers.ipynb`.\n", + "- Fetches metrics available from the Open-Meteo Historical Weather API now, and leaves explicit placeholders for NOAA/FEMA/smoke additions later.\n", + "\n", + "Open-Meteo coverage in this notebook:\n", + "\n", + "| Requested variable | Notebook field(s) | Source status |\n", + "| --- | --- | --- |\n", + "| Mean annual temperature | `mean_annual_temperature_c` | Open-Meteo |\n", + "| Mean summer temperature | `mean_summer_temperature_c` | Open-Meteo |\n", + "| Wet bulb temperature | `mean_wet_bulb_temperature_c`, `max_wet_bulb_temperature_c`, `extreme_wet_bulb_days` | Open-Meteo |\n", + "| Relative humidity | `mean_relative_humidity_pct` | Open-Meteo |\n", + "| Cooling degree days | `cooling_degree_days_c`, `annual_cooling_degree_days_c_mean` | Derived from Open-Meteo |\n", + "| Extreme heat days | `extreme_heat_days`, `annual_extreme_heat_days_mean` | Derived from Open-Meteo |\n", + "| Diurnal temperature range | `mean_diurnal_temperature_range_c` | Derived from Open-Meteo |\n", + "| Precipitation variability | `precipitation_total_mm`, `annual_precipitation_cv`, `wet_day_precipitation_p95_mm` | Derived from Open-Meteo |\n", + "| Drought severity index | `drought_severity_index_source_status` | External source needed |\n", + "| Windstorm frequency | `windstorm_days`, `annual_windstorm_days_mean` | Open-Meteo proxy using wind gust threshold |\n", + "| Wildfire smoke exposure | `wildfire_smoke_exposure_source_status` | External source needed |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "import time\n", + "import urllib.error\n", + "import urllib.parse\n", + "import urllib.request\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import psycopg2\n", + "from psycopg2 import sql\n", + "from psycopg2.extras import execute_values\n", + "\n", + "pd.set_option('display.max_columns', 100)\n", + "pd.set_option('display.max_rows', 120)\n", + "\n", + "print('pandas:', pd.__version__)\n", + "print('numpy:', np.__version__)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "def load_env_file(env_path: str = '.env') -> None:\n", + " p = Path(env_path)\n", + " if not p.exists():\n", + " print(f'No {env_path} file found in {Path.cwd()}')\n", + " return\n", + "\n", + " loaded = 0\n", + " for raw_line in p.read_text(encoding='utf-8').splitlines():\n", + " line = raw_line.strip()\n", + " if not line or line.startswith('#') or '=' not in line:\n", + " continue\n", + " key, value = line.split('=', 1)\n", + " key = key.strip()\n", + " value = value.strip().strip('\"').strip(\"'\")\n", + " if key and key not in os.environ:\n", + " os.environ[key] = value\n", + " loaded += 1\n", + " print(f'Loaded {loaded} env var(s) from {env_path}')\n", + "\n", + "\n", + "def require_env(keys):\n", + " missing = [k for k in keys if not os.getenv(k)]\n", + " if missing:\n", + " raise EnvironmentError(\n", + " 'Missing required env vars in notebook kernel: ' + ', '.join(missing) +\n", + " '.\\nSet them in this notebook, or add them to a .env file in this folder.'\n", + " )\n", + "\n", + "\n", + "load_env_file('.env')\n", + "required_keys = ['PGWEB_HOST', 'PGWEB_PORT', 'PGWEB_USER', 'PGWEB_PASSWORD']\n", + "require_env(required_keys)\n", + "\n", + "DB_NAME = os.getenv('PGDATABASE', 'data_centers')\n", + "MASTER_TABLE = 'public.master_data_centers'\n", + "CLIMATE_TABLE = 'public.data_center_open_meteo_historical_climate'\n", + "\n", + "\n", + "def get_conn():\n", + " return psycopg2.connect(\n", + " host=os.environ['PGWEB_HOST'],\n", + " port=os.environ['PGWEB_PORT'],\n", + " user=os.environ['PGWEB_USER'],\n", + " password=os.environ['PGWEB_PASSWORD'],\n", + " dbname=\"data_centers\",\n", + " )\n", + "\n", + "\n", + "with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute('select current_database(), current_user')\n", + " db, usr = cur.fetchone()\n", + " print('Connected to DB:', db)\n", + " print('As user:', usr)\n", + " cur.execute('create extension if not exists postgis')\n", + " cur.execute('select to_regclass(%s)', (MASTER_TABLE,))\n", + " if cur.fetchone()[0] is None:\n", + " raise RuntimeError(f'{MASTER_TABLE} does not exist. Run build_master_data_centers.py first.')\n", + " cur.execute(sql.SQL('select count(*) from {}').format(sql.SQL(MASTER_TABLE)))\n", + " print(f'{MASTER_TABLE} rows:', f'{cur.fetchone()[0]:,}')\n" + ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": {}, + "source": [ + "## Parameters\n", + "\n", + "The default climate normal period is 1991-2020. Open-Meteo can go farther back, but this period is a standard baseline and keeps each API response moderate.\n", + "\n", + "Set `REFRESH_EXISTING = True` when you want to recompute existing rows after changing thresholds, variables, dates, or source model. Leave it `False` to only pick up new `master_id` values added since the last run.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": {}, + "outputs": [], + "source": [ + "OPEN_METEO_ARCHIVE_URL = 'https://archive-api.open-meteo.com/v1/archive'\n", + "OPEN_METEO_MODEL = 'era5'\n", + "\n", + "START_DATE = '1991-01-01'\n", + "END_DATE = '2020-12-31'\n", + "TIMEZONE = 'UTC'\n", + "\n", + "REFRESH_EXISTING = False\n", + "DRY_RUN = False\n", + "MAX_POINTS = None # use a small integer for testing, or None for all eligible data centers\n", + "BATCH_SIZE = 1 # Public Open-Meteo archive limits are tight for 30-year daily histories\n", + "REQUEST_SLEEP_SECONDS = 70.0\n", + "REQUEST_TIMEOUT_SECONDS = 90\n", + "MAX_RETRIES = 8\n", + "RATE_LIMIT_SLEEP_SECONDS = 120\n", + "\n", + "CDD_BASE_C = 18.3\n", + "EXTREME_HEAT_THRESHOLD_C = 35.0\n", + "EXTREME_WET_BULB_THRESHOLD_C = 28.0\n", + "WINDSTORM_GUST_THRESHOLD_MS = 17.2 # roughly 38.5 mph; adjust for your risk definition\n", + "WET_DAY_THRESHOLD_MM = 1.0\n", + "\n", + "DAILY_VARIABLES = [\n", + " 'temperature_2m_mean',\n", + " 'temperature_2m_max',\n", + " 'temperature_2m_min',\n", + " 'relative_humidity_2m_mean',\n", + " 'wet_bulb_temperature_2m_mean',\n", + " 'wet_bulb_temperature_2m_max',\n", + " 'precipitation_sum',\n", + " 'wind_gusts_10m_max',\n", + " 'soil_moisture_0_to_100cm_mean',\n", + "]\n", + "\n", + "print(f'Period: {START_DATE} to {END_DATE}')\n", + "print(f'Target table: {CLIMATE_TABLE}')\n", + "print('Refresh existing rows:', REFRESH_EXISTING)\n" + ] + }, + { + "cell_type": "markdown", + "id": "5", + "metadata": {}, + "source": [ + "## Create Target Table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": {}, + "outputs": [], + "source": [ + "CREATE_CLIMATE_TABLE_SQL = f'''\n", + "create table if not exists {CLIMATE_TABLE} (\n", + " master_id text primary key references public.master_data_centers(master_id) on delete cascade,\n", + " source text,\n", + " name text,\n", + " operator text,\n", + " city text,\n", + " state text,\n", + " country text,\n", + " longitude double precision not null,\n", + " latitude double precision not null,\n", + " geom geometry(Point, 4326),\n", + "\n", + " open_meteo_model text not null,\n", + " climate_period_start date not null,\n", + " climate_period_end date not null,\n", + " api_timezone text not null,\n", + " api_grid_latitude double precision,\n", + " api_grid_longitude double precision,\n", + " api_elevation_m double precision,\n", + " api_utc_offset_seconds integer,\n", + " observation_days integer not null,\n", + " observation_years integer not null,\n", + " api_generationtime_ms double precision,\n", + "\n", + " mean_annual_temperature_c double precision,\n", + " mean_summer_temperature_c double precision,\n", + " max_daily_temperature_c double precision,\n", + " mean_wet_bulb_temperature_c double precision,\n", + " max_wet_bulb_temperature_c double precision,\n", + " mean_relative_humidity_pct double precision,\n", + " cooling_degree_days_c double precision,\n", + " annual_cooling_degree_days_c_mean double precision,\n", + " extreme_heat_days integer,\n", + " annual_extreme_heat_days_mean double precision,\n", + " extreme_wet_bulb_days integer,\n", + " mean_diurnal_temperature_range_c double precision,\n", + " precipitation_total_mm double precision,\n", + " annual_precipitation_mm_mean double precision,\n", + " annual_precipitation_cv double precision,\n", + " wet_day_precipitation_p95_mm double precision,\n", + " windstorm_days integer,\n", + " annual_windstorm_days_mean double precision,\n", + " max_wind_gust_ms double precision,\n", + " mean_soil_moisture_0_to_100cm double precision,\n", + "\n", + " drought_severity_index_source_status text not null default 'needs_external_source',\n", + " wildfire_smoke_exposure_source_status text not null default 'needs_external_source',\n", + " open_meteo_daily_variables text[] not null,\n", + " open_meteo_url text,\n", + " fetched_at timestamptz not null default now(),\n", + " updated_at timestamptz not null default now()\n", + ");\n", + "\n", + "create index if not exists data_center_open_meteo_hist_geom_gix\n", + " on {CLIMATE_TABLE} using gist (geom);\n", + "create index if not exists data_center_open_meteo_hist_state_idx\n", + " on {CLIMATE_TABLE} (state);\n", + "create index if not exists data_center_open_meteo_hist_period_idx\n", + " on {CLIMATE_TABLE} (climate_period_start, climate_period_end);\n", + "'''\n", + "\n", + "with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute(CREATE_CLIMATE_TABLE_SQL)\n", + " cur.execute(sql.SQL('select count(*) from {}').format(sql.SQL(CLIMATE_TABLE)))\n", + " existing = cur.fetchone()[0]\n", + " print(f'{CLIMATE_TABLE} currently has {existing:,} row(s)')\n" + ] + }, + { + "cell_type": "markdown", + "id": "7", + "metadata": {}, + "source": [ + "## Load Data Centers Needing Climate Metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": {}, + "outputs": [], + "source": [ + "def load_target_points(refresh_existing=False, max_points=None):\n", + " exists_filter = ''\n", + " if not refresh_existing:\n", + " exists_filter = f'''\n", + " and not exists (\n", + " select 1\n", + " from {CLIMATE_TABLE} c\n", + " where c.master_id = m.master_id\n", + " and c.open_meteo_model = %s\n", + " and c.climate_period_start = %s::date\n", + " and c.climate_period_end = %s::date\n", + " )\n", + " '''\n", + " params = [OPEN_METEO_MODEL, START_DATE, END_DATE]\n", + " else:\n", + " params = []\n", + "\n", + " limit_sql = ''\n", + " if max_points is not None:\n", + " limit_sql = 'limit %s'\n", + " params.append(int(max_points))\n", + "\n", + " query = f'''\n", + " select\n", + " m.master_id,\n", + " m.source,\n", + " m.name,\n", + " m.operator,\n", + " m.city,\n", + " m.state,\n", + " m.country,\n", + " m.longitude,\n", + " m.latitude,\n", + " ST_AsText(m.geom) as geom_wkt\n", + " from {MASTER_TABLE} m\n", + " where m.longitude is not null\n", + " and m.latitude is not null\n", + " and m.geom is not null\n", + " {exists_filter}\n", + " order by m.master_id\n", + " {limit_sql}\n", + " '''\n", + " with get_conn() as conn:\n", + " return pd.read_sql_query(query, conn, params=params)\n", + "\n", + "\n", + "targets = load_target_points(refresh_existing=REFRESH_EXISTING, max_points=MAX_POINTS)\n", + "print(f'Target data centers to fetch: {len(targets):,}')\n", + "display(targets.head())\n" + ] + }, + { + "cell_type": "markdown", + "id": "9", + "metadata": {}, + "source": [ + "## Fetch and Summarize Open-Meteo Daily Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10", + "metadata": {}, + "outputs": [], + "source": [ + "def format_coordinate_param(values):\n", + " if np.isscalar(values):\n", + " return f'{float(values):.6f}'\n", + " return ','.join(f'{float(value):.6f}' for value in values)\n", + "\n", + "\n", + "def build_open_meteo_url(latitude, longitude):\n", + " params = {\n", + " 'latitude': format_coordinate_param(latitude),\n", + " 'longitude': format_coordinate_param(longitude),\n", + " 'start_date': START_DATE,\n", + " 'end_date': END_DATE,\n", + " 'daily': ','.join(DAILY_VARIABLES),\n", + " 'temperature_unit': 'celsius',\n", + " 'wind_speed_unit': 'ms',\n", + " 'precipitation_unit': 'mm',\n", + " 'timezone': TIMEZONE,\n", + " 'models': OPEN_METEO_MODEL,\n", + " }\n", + " return OPEN_METEO_ARCHIVE_URL + '?' + urllib.parse.urlencode(params, safe=',')\n", + "\n", + "\n", + "def retry_after_seconds(exc):\n", + " value = exc.headers.get('Retry-After') if getattr(exc, 'headers', None) else None\n", + " if value:\n", + " try:\n", + " return max(1, int(value))\n", + " except ValueError:\n", + " pass\n", + " return None\n", + "\n", + "\n", + "def fetch_json(url):\n", + " last_error = None\n", + " for attempt in range(1, MAX_RETRIES + 1):\n", + " try:\n", + " request = urllib.request.Request(url, headers={'User-Agent': 'data-center-climate-notebook/1.0'})\n", + " with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT_SECONDS) as response:\n", + " return json.loads(response.read().decode('utf-8'))\n", + " except urllib.error.HTTPError as exc:\n", + " last_error = exc\n", + " try:\n", + " error_body = exc.read().decode('utf-8')\n", + " except Exception:\n", + " error_body = ''\n", + " if exc.code == 400:\n", + " raise RuntimeError(f'HTTP 400 Bad Request: {error_body or exc.reason}')\n", + " if exc.code == 429:\n", + " if 'hourly api request limit exceeded' in error_body.lower():\n", + " raise RuntimeError(f'HTTP 429 Hourly API limit: {error_body}')\n", + " sleep_for = retry_after_seconds(exc) or min(RATE_LIMIT_SLEEP_SECONDS * attempt, 900)\n", + " detail = f': {error_body}' if error_body else ''\n", + " print(f'Open-Meteo rate limit on attempt {attempt}/{MAX_RETRIES}{detail}; sleeping {sleep_for}s')\n", + " else:\n", + " sleep_for = min(2 ** attempt, 60)\n", + " print(f'HTTP error on attempt {attempt}/{MAX_RETRIES}: {exc}; sleeping {sleep_for}s')\n", + " time.sleep(sleep_for)\n", + " except (urllib.error.URLError, TimeoutError, json.JSONDecodeError) as exc:\n", + " last_error = exc\n", + " sleep_for = min(2 ** attempt, 60)\n", + " print(f'Fetch failed on attempt {attempt}/{MAX_RETRIES}: {exc}; sleeping {sleep_for}s')\n", + " time.sleep(sleep_for)\n", + " raise RuntimeError(f'Open-Meteo request failed after {MAX_RETRIES} attempt(s): {last_error}')\n", + "\n", + "\n", + "def summarize_daily_weather(payload, dc_row, request_url):\n", + " daily = payload.get('daily', {})\n", + " if 'time' not in daily:\n", + " reason = payload.get('reason') or 'Open-Meteo response did not include daily.time'\n", + " raise RuntimeError(reason)\n", + "\n", + " weather = pd.DataFrame(daily)\n", + " weather['time'] = pd.to_datetime(weather['time'])\n", + " weather['year'] = weather['time'].dt.year\n", + " weather['month'] = weather['time'].dt.month\n", + "\n", + " numeric_cols = [c for c in weather.columns if c not in {'time', 'year', 'month'}]\n", + " for col in numeric_cols:\n", + " weather[col] = pd.to_numeric(weather[col], errors='coerce')\n", + "\n", + " weather['cooling_degree_day_c'] = (weather['temperature_2m_mean'] - CDD_BASE_C).clip(lower=0)\n", + " weather['extreme_heat_day'] = weather['temperature_2m_max'] >= EXTREME_HEAT_THRESHOLD_C\n", + " weather['extreme_wet_bulb_day'] = weather['wet_bulb_temperature_2m_max'] >= EXTREME_WET_BULB_THRESHOLD_C\n", + " weather['diurnal_temperature_range_c'] = weather['temperature_2m_max'] - weather['temperature_2m_min']\n", + " weather['windstorm_day'] = weather['wind_gusts_10m_max'] >= WINDSTORM_GUST_THRESHOLD_MS\n", + " wet_days = weather.loc[weather['precipitation_sum'] >= WET_DAY_THRESHOLD_MM, 'precipitation_sum']\n", + "\n", + " annual = (\n", + " weather.groupby('year', as_index=False)\n", + " .agg(\n", + " annual_mean_temperature_c=('temperature_2m_mean', 'mean'),\n", + " annual_cooling_degree_days_c=('cooling_degree_day_c', 'sum'),\n", + " annual_extreme_heat_days=('extreme_heat_day', 'sum'),\n", + " annual_precipitation_mm=('precipitation_sum', 'sum'),\n", + " annual_windstorm_days=('windstorm_day', 'sum'),\n", + " )\n", + " )\n", + " summer = weather.loc[weather['month'].isin([6, 7, 8])]\n", + "\n", + " annual_precip = annual['annual_precipitation_mm'].dropna()\n", + " precip_cv = np.nan\n", + " if len(annual_precip) > 1 and annual_precip.mean() != 0:\n", + " precip_cv = float(annual_precip.std(ddof=1) / annual_precip.mean())\n", + "\n", + " return {\n", + " 'master_id': dc_row.master_id,\n", + " 'source': dc_row.source,\n", + " 'name': dc_row.name,\n", + " 'operator': dc_row.operator,\n", + " 'city': dc_row.city,\n", + " 'state': dc_row.state,\n", + " 'country': dc_row.country,\n", + " 'longitude': float(dc_row.longitude),\n", + " 'latitude': float(dc_row.latitude),\n", + " 'open_meteo_model': OPEN_METEO_MODEL,\n", + " 'climate_period_start': START_DATE,\n", + " 'climate_period_end': END_DATE,\n", + " 'api_timezone': payload.get('timezone', TIMEZONE),\n", + " 'api_grid_latitude': payload.get('latitude'),\n", + " 'api_grid_longitude': payload.get('longitude'),\n", + " 'api_elevation_m': payload.get('elevation'),\n", + " 'api_utc_offset_seconds': payload.get('utc_offset_seconds'),\n", + " 'observation_days': int(len(weather)),\n", + " 'observation_years': int(weather['year'].nunique()),\n", + " 'api_generationtime_ms': payload.get('generationtime_ms'),\n", + " 'mean_annual_temperature_c': float(annual['annual_mean_temperature_c'].mean()),\n", + " 'mean_summer_temperature_c': float(summer['temperature_2m_mean'].mean()) if len(summer) else np.nan,\n", + " 'max_daily_temperature_c': float(weather['temperature_2m_max'].max()),\n", + " 'mean_wet_bulb_temperature_c': float(weather['wet_bulb_temperature_2m_mean'].mean()),\n", + " 'max_wet_bulb_temperature_c': float(weather['wet_bulb_temperature_2m_max'].max()),\n", + " 'mean_relative_humidity_pct': float(weather['relative_humidity_2m_mean'].mean()),\n", + " 'cooling_degree_days_c': float(weather['cooling_degree_day_c'].sum()),\n", + " 'annual_cooling_degree_days_c_mean': float(annual['annual_cooling_degree_days_c'].mean()),\n", + " 'extreme_heat_days': int(weather['extreme_heat_day'].sum()),\n", + " 'annual_extreme_heat_days_mean': float(annual['annual_extreme_heat_days'].mean()),\n", + " 'extreme_wet_bulb_days': int(weather['extreme_wet_bulb_day'].sum()),\n", + " 'mean_diurnal_temperature_range_c': float(weather['diurnal_temperature_range_c'].mean()),\n", + " 'precipitation_total_mm': float(weather['precipitation_sum'].sum()),\n", + " 'annual_precipitation_mm_mean': float(annual['annual_precipitation_mm'].mean()),\n", + " 'annual_precipitation_cv': precip_cv,\n", + " 'wet_day_precipitation_p95_mm': float(wet_days.quantile(0.95)) if len(wet_days) else 0.0,\n", + " 'windstorm_days': int(weather['windstorm_day'].sum()),\n", + " 'annual_windstorm_days_mean': float(annual['annual_windstorm_days'].mean()),\n", + " 'max_wind_gust_ms': float(weather['wind_gusts_10m_max'].max()),\n", + " 'mean_soil_moisture_0_to_100cm': float(weather['soil_moisture_0_to_100cm_mean'].mean()),\n", + " 'drought_severity_index_source_status': 'needs_external_source',\n", + " 'wildfire_smoke_exposure_source_status': 'needs_external_source',\n", + " 'open_meteo_daily_variables': DAILY_VARIABLES,\n", + " 'open_meteo_url': request_url,\n", + " }\n", + "\n", + "\n", + "def is_request_too_large_error(exc):\n", + " text = str(exc).lower()\n", + " return 'http 400' in text and 'requests too much data' in text\n", + "\n", + "\n", + "def fetch_batch_summaries(batch):\n", + " url = build_open_meteo_url(batch['latitude'].to_list(), batch['longitude'].to_list())\n", + " try:\n", + " payload = fetch_json(url)\n", + " payloads = payload if isinstance(payload, list) else [payload]\n", + " if len(payloads) != len(batch):\n", + " raise RuntimeError(f'Open-Meteo returned {len(payloads)} payload(s) for {len(batch)} requested location(s)')\n", + " rows = [\n", + " summarize_daily_weather(location_payload, dc_row, url)\n", + " for dc_row, location_payload in zip(batch.itertuples(index=False), payloads)\n", + " ]\n", + " return rows, []\n", + " except Exception as exc:\n", + " if is_request_too_large_error(exc) and len(batch) > 1:\n", + " midpoint = len(batch) // 2\n", + " print(f'Batch of {len(batch)} locations is too large; splitting into {midpoint} + {len(batch) - midpoint}')\n", + " left_rows, left_errors = fetch_batch_summaries(batch.iloc[:midpoint].copy())\n", + " time.sleep(REQUEST_SLEEP_SECONDS)\n", + " right_rows, right_errors = fetch_batch_summaries(batch.iloc[midpoint:].copy())\n", + " return left_rows + right_rows, left_errors + right_errors\n", + " batch_errors = [\n", + " {'master_id': dc_row.master_id, 'error': str(exc), 'url': url}\n", + " for dc_row in batch.itertuples(index=False)\n", + " ]\n", + " return [], batch_errors\n", + "\n", + "\n", + "if targets.empty:\n", + " climate_rows = []\n", + " climate = pd.DataFrame()\n", + " print('No new target rows. Set REFRESH_EXISTING=True to recompute existing rows.')\n", + "else:\n", + " climate_rows = []\n", + " errors = []\n", + " total = len(targets)\n", + " checkpoint_path = Path('output/open_meteo_historical_climate_checkpoint.csv')\n", + " checkpoint_path.parent.mkdir(exist_ok=True)\n", + "\n", + " for batch_start in range(0, total, BATCH_SIZE):\n", + " batch = targets.iloc[batch_start:batch_start + BATCH_SIZE].copy()\n", + " batch_end = batch_start + len(batch)\n", + " batch_rows, batch_errors = fetch_batch_summaries(batch)\n", + " climate_rows.extend(batch_rows)\n", + " errors.extend(batch_errors)\n", + " if batch_errors:\n", + " print(f'ERROR batch {batch_start + 1:,}-{batch_end:,}: {batch_errors[0][\"error\"]}')\n", + " if '429' in batch_errors[0]['error'] or 'Too Many Requests' in batch_errors[0]['error'] or 'Hourly API limit' in batch_errors[0]['error']:\n", + " print('Open-Meteo rate limit reached; stopping this run so successes can be saved before retrying later.')\n", + " break\n", + "\n", + " if climate_rows:\n", + " pd.DataFrame(climate_rows).to_csv(checkpoint_path, index=False)\n", + " print(f'Checkpointed {len(climate_rows):,} row(s) to {checkpoint_path}')\n", + " print(f'Processed {batch_end:,}/{total:,}; successes={len(climate_rows):,}; errors={len(errors):,}')\n", + " time.sleep(REQUEST_SLEEP_SECONDS)\n", + "\n", + " climate = pd.DataFrame(climate_rows)\n", + " error_log = pd.DataFrame(errors)\n", + " print(f'Summarized {len(climate):,} climate row(s); errors={len(error_log):,}')\n", + " display(climate.head())\n", + " if not error_log.empty:\n", + " display(error_log)\n" + ] + }, + { + "cell_type": "markdown", + "id": "11", + "metadata": {}, + "source": [ + "## Upsert to PostGIS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": {}, + "outputs": [], + "source": [ + "UPSERT_COLUMNS = [\n", + " 'master_id', 'source', 'name', 'operator', 'city', 'state', 'country', 'longitude', 'latitude',\n", + " 'open_meteo_model', 'climate_period_start', 'climate_period_end', 'api_timezone',\n", + " 'api_grid_latitude', 'api_grid_longitude', 'api_elevation_m', 'api_utc_offset_seconds',\n", + " 'observation_days', 'observation_years', 'api_generationtime_ms',\n", + " 'mean_annual_temperature_c', 'mean_summer_temperature_c', 'max_daily_temperature_c',\n", + " 'mean_wet_bulb_temperature_c', 'max_wet_bulb_temperature_c', 'mean_relative_humidity_pct',\n", + " 'cooling_degree_days_c', 'annual_cooling_degree_days_c_mean',\n", + " 'extreme_heat_days', 'annual_extreme_heat_days_mean', 'extreme_wet_bulb_days',\n", + " 'mean_diurnal_temperature_range_c', 'precipitation_total_mm',\n", + " 'annual_precipitation_mm_mean', 'annual_precipitation_cv', 'wet_day_precipitation_p95_mm',\n", + " 'windstorm_days', 'annual_windstorm_days_mean', 'max_wind_gust_ms',\n", + " 'mean_soil_moisture_0_to_100cm', 'drought_severity_index_source_status',\n", + " 'wildfire_smoke_exposure_source_status', 'open_meteo_daily_variables', 'open_meteo_url',\n", + "]\n", + "\n", + "\n", + "def clean_db_value(value):\n", + " if isinstance(value, (list, tuple)):\n", + " return list(value)\n", + " if isinstance(value, float) and np.isnan(value):\n", + " return None\n", + " if pd.isna(value):\n", + " return None\n", + " return value\n", + "\n", + "\n", + "def upsert_climate_rows(rows):\n", + " if not rows:\n", + " print('No rows to upsert.')\n", + " return\n", + "\n", + " values = [\n", + " tuple(clean_db_value(row.get(col)) for col in UPSERT_COLUMNS)\n", + " for row in rows\n", + " ]\n", + " col_sql = sql.SQL(', ').join(map(sql.Identifier, UPSERT_COLUMNS))\n", + " update_assignments = sql.SQL(', ').join(\n", + " sql.SQL('{} = excluded.{}').format(sql.Identifier(col), sql.Identifier(col))\n", + " for col in UPSERT_COLUMNS\n", + " if col != 'master_id'\n", + " )\n", + "\n", + " insert_sql = sql.SQL('''\n", + " insert into {table} ({cols}, geom, updated_at)\n", + " values %s\n", + " on conflict (master_id) do update set\n", + " {updates},\n", + " geom = excluded.geom,\n", + " fetched_at = now(),\n", + " updated_at = now()\n", + " ''').format(\n", + " table=sql.SQL(CLIMATE_TABLE),\n", + " cols=col_sql,\n", + " updates=update_assignments,\n", + " )\n", + "\n", + " template = '(' + ', '.join(['%s'] * len(UPSERT_COLUMNS)) + ', ST_SetSRID(ST_MakePoint(%s, %s), 4326), now())'\n", + " values_with_geom = [row + (row[UPSERT_COLUMNS.index('longitude')], row[UPSERT_COLUMNS.index('latitude')]) for row in values]\n", + "\n", + " with get_conn() as conn:\n", + " with conn.cursor() as cur:\n", + " execute_values(cur, insert_sql.as_string(conn), values_with_geom, template=template, page_size=500)\n", + " cur.execute(sql.SQL('analyze {}').format(sql.SQL(CLIMATE_TABLE)))\n", + " print(f'Upserted {len(rows):,} row(s) into {CLIMATE_TABLE}')\n", + "\n", + "\n", + "if DRY_RUN:\n", + " print('DRY_RUN=True; database was not modified.')\n", + "else:\n", + " upsert_climate_rows(climate_rows)\n" + ] + }, + { + "cell_type": "markdown", + "id": "13", + "metadata": {}, + "source": [ + "## Inspect Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": {}, + "outputs": [], + "source": [ + "summary_query = f'''\n", + "select\n", + " count(*) as rows,\n", + " min(climate_period_start) as min_period_start,\n", + " max(climate_period_end) as max_period_end,\n", + " count(*) filter (where drought_severity_index_source_status = 'needs_external_source') as drought_external_needed,\n", + " count(*) filter (where wildfire_smoke_exposure_source_status = 'needs_external_source') as smoke_external_needed\n", + "from {CLIMATE_TABLE}\n", + "where open_meteo_model = %s\n", + " and climate_period_start = %s::date\n", + " and climate_period_end = %s::date\n", + "'''\n", + "\n", + "sample_query = f'''\n", + "select\n", + " c.master_id,\n", + " c.name,\n", + " c.state,\n", + " c.mean_annual_temperature_c,\n", + " c.mean_summer_temperature_c,\n", + " c.cooling_degree_days_c,\n", + " c.extreme_heat_days,\n", + " c.annual_precipitation_cv,\n", + " c.windstorm_days\n", + "from {CLIMATE_TABLE} c\n", + "order by c.cooling_degree_days_c desc nulls last\n", + "limit 20\n", + "'''\n", + "\n", + "with get_conn() as conn:\n", + " summary = pd.read_sql_query(summary_query, conn, params=(OPEN_METEO_MODEL, START_DATE, END_DATE))\n", + " sample = pd.read_sql_query(sample_query, conn)\n", + "\n", + "display(summary)\n", + "display(sample)\n" + ] + }, + { + "cell_type": "markdown", + "id": "15", + "metadata": {}, + "source": [ + "## Notes for the Next Data Sources\n", + "\n", + "Open-Meteo gets us the weather/climate burden metrics and a wind-gust proxy for windstorm frequency. These still need better external sources:\n", + "\n", + "- Drought severity index: NOAA/NCEI climate division or gridMET/USDM-derived drought measures are better candidates.\n", + "- Wildfire smoke exposure: NOAA HMS smoke, EPA AirNow/monitor-derived PM2.5, or satellite smoke products are better candidates.\n", + "- Windstorm frequency: the current field is a reanalysis gust threshold proxy. FEMA/NCEI storm events can be added later as observed hazard-event counts.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}