Add EIA SEDS ingestion support

This commit is contained in:
2026-05-17 18:52:29 -07:00
parent 614b10b43f
commit 48f23af5b0
2 changed files with 152 additions and 26 deletions

View File

@@ -115,3 +115,42 @@ Update [`output/facility_fuel_pending_narrative.txt`](../output/facility_fuel_pe
once facility-fuel is actually ingested. Replace the "Pending" framing with
the real row count, period range, and any column-mapping notes from Step 3.
Mirror the format of `operating_generator_capacity_sample.txt`.
## New endpoint added — SEDS (State Energy Data System)
Wired up 2026-05-17. Endpoint: `seds` (annual frequency,
`https://api.eia.gov/v2/seds/data/`). Probed live, columns verified, smoke
test of 50 rows landed in `public.energy_eia_seds_flat` with all typed
columns populated.
**Verified JSON keys** (no sector field — sector is encoded in `seriesId`):
`period` (YYYY), `seriesId`, `seriesDescription`, `stateId`,
`stateDescription`, `value`, `unit`.
**Total volume:** ~2.57M rows across 65 years (1960–2024), ~40k rows/year.
Ingested year-by-year via the generalized `fetch_eia_pages_by_period` to
stay under EIA's 503 threshold (same pattern as the monthly endpoints).
**What to verify Monday:**
```sql
-- Expected: ~2.5M+ rows, 1960 → 2024
select count(*), min(year), max(year)
from public.energy_eia_seds_flat;
-- Spot-check that typed columns landed (not all NULL)
select period, year, series_id, state_id, value, unit
from public.energy_eia_seds_flat
order by random()
limit 5;
```
If the row count is well under 2.5M, suspect a mid-run failure — check the log
for `503` errors on the `seds` endpoint and re-run with
`python3 ingest_eia_energy_layers.py --category state_energy --endpoint seds`.
**Product/API docs for reference:**
- Product page: https://www.eia.gov/state/seds/
- Technical notes: https://www.eia.gov/state/seds/seds-technical-notes-complete.php
- API documentation: https://www.eia.gov/opendata/documentation.php

View File

@@ -62,17 +62,39 @@ EIA_DATASETS = {
"electricity/facility-fuel",
],
},
"state_energy": {
"category": "state_energy",
"endpoints": [
# State Energy Data System (SEDS): annual state-level production,
# consumption, price, and expenditure across all energy sources.
"seds",
],
},
}
# Extra data fields (the EIA `data[N]=` query params) each endpoint needs.
# Every endpoint listed here returns only id columns by default, so the
# columns we actually want must be requested explicitly:
#   - operating-generator-capacity: latitude/longitude
#   - facility-fuel: the generation values
#   - seds: the numeric value column
EIA_DATASET_DATA_FIELDS = {
"electricity/operating-generator-capacity": ["latitude", "longitude"],
"electricity/facility-fuel": ["generation", "gross-generation"],
"seds": ["value"],
}
# Frequency for each endpoint. Drives how period range is discovered and how
# pagination iterates: "monthly" endpoints are walked one 'YYYY-MM' period at
# a time, "annual" endpoints one 'YYYY' period at a time.
# Endpoints not listed default to "monthly".
EIA_DATASET_FREQUENCY = {
"electricity/operating-generator-capacity": "monthly",
"electricity/facility-fuel": "monthly",
"seds": "annual",
}
def endpoint_frequency(endpoint: str) -> str:
    """Look up the reporting frequency configured for an EIA endpoint.

    Args:
        endpoint: EIA API route, e.g. ``"seds"`` or
            ``"electricity/facility-fuel"``.

    Returns:
        The frequency registered in ``EIA_DATASET_FREQUENCY``, or
        ``"monthly"`` when the endpoint has no entry there.
    """
    try:
        return EIA_DATASET_FREQUENCY[endpoint]
    except KeyError:
        # Unregistered endpoints fall back to the monthly default.
        return "monthly"
# Endpoints that do not reliably support retry with ad-hoc data[] field requests.
# Currently empty — no endpoints are listed.
# NOTE(review): `{}` is an empty dict literal, not a set. Membership tests
# behave the same while it is empty, but confirm the intended container type
# before adding entries.
EIA_NO_RETRY_EXTRA_FIELDS = {
}
@@ -135,12 +157,28 @@ def iter_months(start: str, end: str):
y += 1
def discover_period_range(endpoint: str) -> tuple:
"""Return (earliest, latest) 'YYYY-MM' period strings for an endpoint.
def iter_years(start: str, end: str):
    """Generate zero-padded 'YYYY' labels for every year in a closed range.

    Only the first four characters of each bound are parsed, so longer
    period strings such as 'YYYY-MM' are accepted as well.

    Args:
        start: First period of the range; its leading four characters are
            read as the starting year.
        end: Last period of the range (inclusive); its leading four
            characters are read as the final year.

    Yields:
        One four-digit year string per year, ascending. Nothing is yielded
        when the starting year is later than the final year.

    Raises:
        ValueError: If the leading characters of either bound are not an
            integer (raised when iteration begins).
    """
    year = int(start[:4])
    final_year = int(end[:4])
    while year <= final_year:
        yield format(year, "04d")
        year += 1
Forces frequency=monthly so endpoints that also publish annual/quarterly
series (e.g. facility-fuel) don't return non-monthly period formats that
break iter_months. Routes through query_eia_api for retry/backoff coverage.
def iter_periods(frequency: str, start: str, end: str):
    """Yield every period label between ``start`` and ``end`` inclusive.

    Args:
        frequency: ``"annual"`` steps year by year via ``iter_years``; any
            other value steps month by month via ``iter_months``.
        start: First period of the range.
        end: Last period of the range (inclusive).

    Yields:
        Period strings produced by the generator chosen for ``frequency``.
    """
    # Choose the stepping generator once, then delegate to it wholesale.
    period_source = iter_years if frequency == "annual" else iter_months
    yield from period_source(start, end)
def discover_period_range(endpoint: str, frequency: str = "monthly") -> tuple:
"""Return (earliest, latest) period strings for an endpoint.
Forces an explicit frequency so endpoints that publish multiple frequencies
(e.g. facility-fuel monthly+annual) return periods in the expected format.
Monthly endpoints get 'YYYY-MM'; annual endpoints (e.g. SEDS) get 'YYYY'.
Routes through query_eia_api for retry/backoff coverage.
"""
def _one(direction: str) -> str:
data = query_eia_api(
@@ -150,7 +188,7 @@ def discover_period_range(endpoint: str) -> tuple:
"sort[0][column]": "period",
"sort[0][direction]": direction,
},
query_params={"frequency": "monthly"},
query_params={"frequency": frequency},
)
rows = (data or {}).get("response", {}).get("data", [])
if not rows:
@@ -287,33 +325,35 @@ def fetch_eia_pages(
offset += len(page_records)
def fetch_eia_pages_by_month(
def fetch_eia_pages_by_period(
endpoint: str,
frequency: str,
earliest: str,
latest: str,
max_records: int = 0,
extra_data_fields: Optional[List[str]] = None,
) -> Any:
"""Yield pages across months, querying one month at a time.
"""Yield pages across periods, querying one period (month or year) at a time.
EIA's bulk endpoints serve large offsets slowly and return frequent 503s
under sustained load. Filtering by &frequency=monthly&start=X&end=X keeps
each query small (~17k–28k rows per month for operating-generator-capacity)
and dramatically reduces failure rate and wall time.
under sustained load. Filtering by &frequency=F&start=X&end=X keeps each
query small (~17k–28k rows per month for operating-generator-capacity,
~40k rows per year for SEDS) and dramatically reduces failure rate and
wall time.
"""
yielded = 0
for month in iter_months(earliest, latest):
for period in iter_periods(frequency, earliest, latest):
if max_records > 0 and yielded >= max_records:
return
remaining = max_records - yielded if max_records > 0 else 0
month_params = {"frequency": "monthly", "start": month, "end": month}
period_params = {"frequency": frequency, "start": period, "end": period}
for page_records, used_extra_fields in fetch_eia_pages(
endpoint,
max_records=remaining,
extra_data_fields=extra_data_fields,
query_params=month_params,
query_params=period_params,
):
yield page_records, used_extra_fields, month
yield page_records, used_extra_fields, period
yielded += len(page_records)
if max_records > 0 and yielded >= max_records:
return
@@ -363,26 +403,28 @@ def import_layer_to_postgis(dataset: EIADataset, table_name: str, max_records: i
conn = connect_db()
try:
extra_fields = EIA_DATASET_DATA_FIELDS.get(dataset.api_endpoint)
frequency = endpoint_frequency(dataset.api_endpoint)
earliest, latest = discover_period_range(dataset.api_endpoint)
print(f" period range: {earliest} -> {latest}")
earliest, latest = discover_period_range(dataset.api_endpoint, frequency)
print(f" period range ({frequency}): {earliest} -> {latest}")
count = 0
geo_count = 0
initialized = False
current_month: Optional[str] = None
current_period: Optional[str] = None
for page_records, used_extra_fields, month in fetch_eia_pages_by_month(
for page_records, used_extra_fields, period in fetch_eia_pages_by_period(
dataset.api_endpoint,
frequency=frequency,
earliest=earliest,
latest=latest,
max_records=max_records,
extra_data_fields=extra_fields,
):
if month != current_month:
if current_month is not None:
print(f" progress: {count} rows ingested through {current_month}")
current_month = month
if period != current_period:
if current_period is not None:
print(f" progress: {count} rows ingested through {current_period}")
current_period = period
if not initialized:
with conn:
with conn.cursor() as cur:
@@ -780,7 +822,8 @@ def build_flat_tables(conn):
where table_schema='public'
and table_name in (
'energy_eia_electricity_operating_generator_capacity',
'energy_eia_electricity_facility_fuel'
'energy_eia_electricity_facility_fuel',
'energy_eia_seds'
)
"""
)
@@ -916,6 +959,49 @@ def build_flat_tables(conn):
)
cur.execute("analyze public.energy_eia_facility_fuel_flat")
if "energy_eia_seds" in available:
# SEDS column mapping verified 2026-05-17 via length=5 probe of
# https://api.eia.gov/v2/seds/data/. Confirmed keys: period (YYYY),
# seriesId, seriesDescription, stateId, stateDescription, value, unit.
# No sector field — sector is encoded in seriesId.
cur.execute("drop table if exists public.energy_eia_seds_flat")
cur.execute(
r"""
create table public.energy_eia_seds_flat as
select
gid,
properties->>'period' as period,
case
when (properties->>'period') ~ '^[0-9]{4}$'
then (properties->>'period')::integer
end as year,
properties->>'seriesId' as series_id,
properties->>'seriesDescription' as series_description,
properties->>'stateId' as state_id,
properties->>'stateDescription' as state_name,
case
when (properties->>'value') ~ '^-?[0-9]+(\.[0-9]+)?$'
then (properties->>'value')::double precision
end as value,
properties->>'unit' as unit,
properties as raw_properties
from public.energy_eia_seds
"""
)
cur.execute(
"create index energy_eia_seds_flat_state_id_idx "
"on public.energy_eia_seds_flat (state_id)"
)
cur.execute(
"create index energy_eia_seds_flat_series_id_idx "
"on public.energy_eia_seds_flat (series_id)"
)
cur.execute(
"create index energy_eia_seds_flat_year_idx "
"on public.energy_eia_seds_flat (year)"
)
cur.execute("analyze public.energy_eia_seds_flat")
def prune_stale_layer_versions(conn) -> int:
"""Drop superseded EIA layer tables and remove stale catalog rows.
@@ -1034,6 +1120,7 @@ def prune_unselected_layers(conn, selected_table_names: List[str]) -> int:
# Names of the flattened `*_flat` tables in the `public` schema, one per
# ingested EIA endpoint (generator capacity, facility fuel, SEDS).
# NOTE(review): the consumers of this tuple are outside the visible diff —
# presumably the tables treated as final outputs of a run; confirm before
# relying on that.
FINAL_FLAT_TABLES = (
"energy_eia_operating_generator_capacity_flat",
"energy_eia_facility_fuel_flat",
"energy_eia_seds_flat",
)
@@ -1076,7 +1163,7 @@ def parse_args():
)
parser.add_argument(
"--category",
choices=["power", "all"],
choices=["power", "state_energy", "all"],
default="power",
help="Infrastructure category to ingest.",
)