diff --git a/.claude/MONDAY_CHECKLIST.md b/.claude/MONDAY_CHECKLIST.md index fb4948c..4cba94f 100644 --- a/.claude/MONDAY_CHECKLIST.md +++ b/.claude/MONDAY_CHECKLIST.md @@ -115,3 +115,42 @@ Update [`output/facility_fuel_pending_narrative.txt`](../output/facility_fuel_pe once facility-fuel is actually ingested. Replace the "Pending" framing with the real row count, period range, and any column-mapping notes from Step 3. Mirror the format of `operating_generator_capacity_sample.txt`. + +## New endpoint added — SEDS (State Energy Data System) + +Wired up 2026-05-17. Endpoint: `seds` (annual frequency, +`https://api.eia.gov/v2/seds/data/`). Probed live, columns verified, smoke +test of 50 rows landed in `public.energy_eia_seds_flat` with all typed +columns populated. + +**Verified JSON keys** (no sector field — sector is encoded in `seriesId`): +`period` (YYYY), `seriesId`, `seriesDescription`, `stateId`, +`stateDescription`, `value`, `unit`. + +**Total volume:** ~2.57M rows across 65 years (1960–2024), ~40k rows/year. +Ingested year-by-year via the generalized `fetch_eia_pages_by_period` to +stay under EIA's 503 threshold (same pattern as the monthly endpoints). + +**What to verify Monday:** + +```sql +-- Expected: ~2.5M+ rows, 1960 → 2024 +select count(*), min(year), max(year) +from public.energy_eia_seds_flat; + +-- Spot-check that typed columns landed (not all NULL) +select period, year, series_id, state_id, value, unit +from public.energy_eia_seds_flat +order by random() +limit 5; +``` + +If row count is way under 2.5M, suspect a mid-run failure — check the log +for `503` errors on the `seds` endpoint and re-run with +`python3 ingest_eia_energy_layers.py --category state_energy --endpoint seds`. + +**Product/API docs for reference:** + +- Product page: https://www.eia.gov/state/seds/ +- Technical notes: https://www.eia.gov/state/seds/seds-technical-notes-complete.php +- API documentation: https://www.eia.gov/opendata/documentation.php diff --git a/ingest_eia_energy_layers.py b/ingest_eia_energy_layers.py index 0ff5b62..3f6a8b3 100644 --- a/ingest_eia_energy_layers.py +++ b/ingest_eia_energy_layers.py @@ -62,17 +62,39 @@ EIA_DATASETS = { "electricity/facility-fuel", ], }, + "state_energy": { + "category": "state_energy", + "endpoints": [ + # State Energy Data System (SEDS): annual state-level production, + # consumption, price, and expenditure across all energy sources. + "seds", + ], + }, } # Extra data fields (the EIA `data[N]=` query params) each endpoint needs. # operating-generator-capacity returns only id columns by default; latitude/longitude # must be requested explicitly. facility-fuel returns only id columns; generation -# values must be requested explicitly. +# values must be requested explicitly. seds returns only id columns; the numeric +# value column must be requested explicitly. EIA_DATASET_DATA_FIELDS = { "electricity/operating-generator-capacity": ["latitude", "longitude"], "electricity/facility-fuel": ["generation", "gross-generation"], + "seds": ["value"], } +# Frequency for each endpoint. Drives how period range is discovered and how +# pagination iterates. Endpoints not listed default to "monthly". +EIA_DATASET_FREQUENCY = { + "electricity/operating-generator-capacity": "monthly", + "electricity/facility-fuel": "monthly", + "seds": "annual", +} + + +def endpoint_frequency(endpoint: str) -> str: + return EIA_DATASET_FREQUENCY.get(endpoint, "monthly") + # Endpoints that do not reliably support retry with ad-hoc data[] field requests. EIA_NO_RETRY_EXTRA_FIELDS = { } @@ -135,12 +157,28 @@ def iter_months(start: str, end: str): y += 1 -def discover_period_range(endpoint: str) -> tuple: - """Return (earliest, latest) 'YYYY-MM' period strings for an endpoint. +def iter_years(start: str, end: str): + """Yield 'YYYY' strings from start to end inclusive.""" + sy = int(start[:4]) + ey = int(end[:4]) + for y in range(sy, ey + 1): + yield f"{y:04d}" - Forces frequency=monthly so endpoints that also publish annual/quarterly - series (e.g. facility-fuel) don't return non-monthly period formats that - break iter_months. Routes through query_eia_api for retry/backoff coverage. + +def iter_periods(frequency: str, start: str, end: str): + if frequency == "annual": + yield from iter_years(start, end) + else: + yield from iter_months(start, end) + + +def discover_period_range(endpoint: str, frequency: str = "monthly") -> tuple: + """Return (earliest, latest) period strings for an endpoint. + + Forces an explicit frequency so endpoints that publish multiple frequencies + (e.g. facility-fuel monthly+annual) return periods in the expected format. + Monthly endpoints get 'YYYY-MM'; annual endpoints (e.g. SEDS) get 'YYYY'. + Routes through query_eia_api for retry/backoff coverage. """ def _one(direction: str) -> str: data = query_eia_api( @@ -150,7 +188,7 @@ def discover_period_range(endpoint: str) -> tuple: "sort[0][column]": "period", "sort[0][direction]": direction, }, - query_params={"frequency": "monthly"}, + query_params={"frequency": frequency}, ) rows = (data or {}).get("response", {}).get("data", []) if not rows: @@ -287,33 +325,35 @@ def fetch_eia_pages( offset += len(page_records) -def fetch_eia_pages_by_month( +def fetch_eia_pages_by_period( endpoint: str, + frequency: str, earliest: str, latest: str, max_records: int = 0, extra_data_fields: Optional[List[str]] = None, ) -> Any: - """Yield pages across months, querying one month at a time. + """Yield pages across periods, querying one period (month or year) at a time. EIA's bulk endpoints serve large offsets slowly and return frequent 503s - under sustained load. Filtering by &frequency=monthly&start=X&end=X keeps - each query small (~17k–28k rows per month for operating-generator-capacity) - and dramatically reduces failure rate and wall time. + under sustained load. Filtering by &frequency=F&start=X&end=X keeps each + query small (~17k–28k rows per month for operating-generator-capacity, + ~40k rows per year for SEDS) and dramatically reduces failure rate and + wall time. """ yielded = 0 - for month in iter_months(earliest, latest): + for period in iter_periods(frequency, earliest, latest): if max_records > 0 and yielded >= max_records: return remaining = max_records - yielded if max_records > 0 else 0 - month_params = {"frequency": "monthly", "start": month, "end": month} + period_params = {"frequency": frequency, "start": period, "end": period} for page_records, used_extra_fields in fetch_eia_pages( endpoint, max_records=remaining, extra_data_fields=extra_data_fields, - query_params=month_params, + query_params=period_params, ): - yield page_records, used_extra_fields, month + yield page_records, used_extra_fields, period yielded += len(page_records) if max_records > 0 and yielded >= max_records: return @@ -363,26 +403,28 @@ def import_layer_to_postgis(dataset: EIADataset, table_name: str, max_records: i conn = connect_db() try: extra_fields = EIA_DATASET_DATA_FIELDS.get(dataset.api_endpoint) + frequency = endpoint_frequency(dataset.api_endpoint) - earliest, latest = discover_period_range(dataset.api_endpoint) - print(f" period range: {earliest} -> {latest}") + earliest, latest = discover_period_range(dataset.api_endpoint, frequency) + print(f" period range ({frequency}): {earliest} -> {latest}") count = 0 geo_count = 0 initialized = False - current_month: Optional[str] = None + current_period: Optional[str] = None - for page_records, used_extra_fields, month in fetch_eia_pages_by_month( + for page_records, used_extra_fields, period in fetch_eia_pages_by_period( dataset.api_endpoint, + frequency=frequency, earliest=earliest, latest=latest, max_records=max_records, extra_data_fields=extra_fields, ): - if month != current_month: - if current_month is not None: - print(f" progress: {count} rows ingested through {current_month}") - current_month = month + if period != current_period: + if current_period is not None: + print(f" progress: {count} rows ingested through {current_period}") + current_period = period if not initialized: with conn: with conn.cursor() as cur: @@ -780,7 +822,8 @@ def build_flat_tables(conn): where table_schema='public' and table_name in ( 'energy_eia_electricity_operating_generator_capacity', - 'energy_eia_electricity_facility_fuel' + 'energy_eia_electricity_facility_fuel', + 'energy_eia_seds' ) """ ) @@ -916,6 +959,49 @@ def build_flat_tables(conn): ) cur.execute("analyze public.energy_eia_facility_fuel_flat") + if "energy_eia_seds" in available: + # SEDS column mapping verified 2026-05-17 via length=5 probe of + # https://api.eia.gov/v2/seds/data/. Confirmed keys: period (YYYY), + # seriesId, seriesDescription, stateId, stateDescription, value, unit. + # No sector field — sector is encoded in seriesId. + cur.execute("drop table if exists public.energy_eia_seds_flat") + cur.execute( + r""" + create table public.energy_eia_seds_flat as + select + gid, + properties->>'period' as period, + case + when (properties->>'period') ~ '^[0-9]{4}$' + then (properties->>'period')::integer + end as year, + properties->>'seriesId' as series_id, + properties->>'seriesDescription' as series_description, + properties->>'stateId' as state_id, + properties->>'stateDescription' as state_name, + case + when (properties->>'value') ~ '^-?[0-9]+(\.[0-9]+)?$' + then (properties->>'value')::double precision + end as value, + properties->>'unit' as unit, + properties as raw_properties + from public.energy_eia_seds + """ + ) + cur.execute( + "create index energy_eia_seds_flat_state_id_idx " + "on public.energy_eia_seds_flat (state_id)" + ) + cur.execute( + "create index energy_eia_seds_flat_series_id_idx " + "on public.energy_eia_seds_flat (series_id)" + ) + cur.execute( + "create index energy_eia_seds_flat_year_idx " + "on public.energy_eia_seds_flat (year)" + ) + cur.execute("analyze public.energy_eia_seds_flat") + def prune_stale_layer_versions(conn) -> int: """Drop superseded EIA layer tables and remove stale catalog rows. @@ -1034,6 +1120,7 @@ def prune_unselected_layers(conn, selected_table_names: List[str]) -> int: FINAL_FLAT_TABLES = ( "energy_eia_operating_generator_capacity_flat", "energy_eia_facility_fuel_flat", + "energy_eia_seds_flat", ) @@ -1076,7 +1163,7 @@ def parse_args(): ) parser.add_argument( "--category", - choices=["power", "all"], + choices=["power", "state_energy", "all"], default="power", help="Infrastructure category to ingest.", )