Add EIA SEDS ingestion support
This commit is contained in:
@@ -115,3 +115,42 @@ Update [`output/facility_fuel_pending_narrative.txt`](../output/facility_fuel_pe
|
|||||||
once facility-fuel is actually ingested. Replace the "Pending" framing with
|
once facility-fuel is actually ingested. Replace the "Pending" framing with
|
||||||
the real row count, period range, and any column-mapping notes from Step 3.
|
the real row count, period range, and any column-mapping notes from Step 3.
|
||||||
Mirror the format of `operating_generator_capacity_sample.txt`.
|
Mirror the format of `operating_generator_capacity_sample.txt`.
|
||||||
|
|
||||||
|
## New endpoint added — SEDS (State Energy Data System)
|
||||||
|
|
||||||
|
Wired up 2026-05-17. Endpoint: `seds` (annual frequency,
|
||||||
|
`https://api.eia.gov/v2/seds/data/`). Probed live, columns verified, smoke
|
||||||
|
test of 50 rows landed in `public.energy_eia_seds_flat` with all typed
|
||||||
|
columns populated.
|
||||||
|
|
||||||
|
**Verified JSON keys** (no sector field — sector is encoded in `seriesId`):
|
||||||
|
`period` (YYYY), `seriesId`, `seriesDescription`, `stateId`,
|
||||||
|
`stateDescription`, `value`, `unit`.
|
||||||
|
|
||||||
|
**Total volume:** ~2.57M rows across 65 years (1960–2024), ~40k rows/year.
|
||||||
|
Ingested year-by-year via the generalized `fetch_eia_pages_by_period` to
|
||||||
|
stay under EIA's 503 threshold (same pattern as the monthly endpoints).
|
||||||
|
|
||||||
|
**What to verify Monday:**
|
||||||
|
|
||||||
|
```sql
|
||||||
|
-- Expected: ~2.5M+ rows, 1960 → 2024
|
||||||
|
select count(*), min(year), max(year)
|
||||||
|
from public.energy_eia_seds_flat;
|
||||||
|
|
||||||
|
-- Spot-check that typed columns landed (not all NULL)
|
||||||
|
select period, year, series_id, state_id, value, unit
|
||||||
|
from public.energy_eia_seds_flat
|
||||||
|
order by random()
|
||||||
|
limit 5;
|
||||||
|
```
|
||||||
|
|
||||||
|
If row count is way under 2.5M, suspect a mid-run failure — check the log
|
||||||
|
for `503` errors on the `seds` endpoint and re-run with
|
||||||
|
`python3 ingest_eia_energy_layers.py --category state_energy --endpoint seds`.
|
||||||
|
|
||||||
|
**Product/API docs for reference:**
|
||||||
|
|
||||||
|
- Product page: https://www.eia.gov/state/seds/
|
||||||
|
- Technical notes: https://www.eia.gov/state/seds/seds-technical-notes-complete.php
|
||||||
|
- API documentation: https://www.eia.gov/opendata/documentation.php
|
||||||
|
|||||||
@@ -62,17 +62,39 @@ EIA_DATASETS = {
|
|||||||
"electricity/facility-fuel",
|
"electricity/facility-fuel",
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
|
"state_energy": {
|
||||||
|
"category": "state_energy",
|
||||||
|
"endpoints": [
|
||||||
|
# State Energy Data System (SEDS): annual state-level production,
|
||||||
|
# consumption, price, and expenditure across all energy sources.
|
||||||
|
"seds",
|
||||||
|
],
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
# Extra data fields (the EIA `data[N]=` query params) each endpoint needs.
|
# Extra data fields (the EIA `data[N]=` query params) each endpoint needs.
|
||||||
# operating-generator-capacity returns only id columns by default; latitude/longitude
|
# operating-generator-capacity returns only id columns by default; latitude/longitude
|
||||||
# must be requested explicitly. facility-fuel returns only id columns; generation
|
# must be requested explicitly. facility-fuel returns only id columns; generation
|
||||||
# values must be requested explicitly.
|
# values must be requested explicitly. seds returns only id columns; the numeric
|
||||||
|
# value column must be requested explicitly.
|
||||||
EIA_DATASET_DATA_FIELDS = {
|
EIA_DATASET_DATA_FIELDS = {
|
||||||
"electricity/operating-generator-capacity": ["latitude", "longitude"],
|
"electricity/operating-generator-capacity": ["latitude", "longitude"],
|
||||||
"electricity/facility-fuel": ["generation", "gross-generation"],
|
"electricity/facility-fuel": ["generation", "gross-generation"],
|
||||||
|
"seds": ["value"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Frequency for each endpoint. Drives how period range is discovered and how
|
||||||
|
# pagination iterates. Endpoints not listed default to "monthly".
|
||||||
|
EIA_DATASET_FREQUENCY = {
|
||||||
|
"electricity/operating-generator-capacity": "monthly",
|
||||||
|
"electricity/facility-fuel": "monthly",
|
||||||
|
"seds": "annual",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def endpoint_frequency(endpoint: str) -> str:
|
||||||
|
return EIA_DATASET_FREQUENCY.get(endpoint, "monthly")
|
||||||
|
|
||||||
# Endpoints that do not reliably support retry with ad-hoc data[] field requests.
|
# Endpoints that do not reliably support retry with ad-hoc data[] field requests.
|
||||||
EIA_NO_RETRY_EXTRA_FIELDS = {
|
EIA_NO_RETRY_EXTRA_FIELDS = {
|
||||||
}
|
}
|
||||||
@@ -135,12 +157,28 @@ def iter_months(start: str, end: str):
|
|||||||
y += 1
|
y += 1
|
||||||
|
|
||||||
|
|
||||||
def discover_period_range(endpoint: str) -> tuple:
|
def iter_years(start: str, end: str):
|
||||||
"""Return (earliest, latest) 'YYYY-MM' period strings for an endpoint.
|
"""Yield 'YYYY' strings from start to end inclusive."""
|
||||||
|
sy = int(start[:4])
|
||||||
|
ey = int(end[:4])
|
||||||
|
for y in range(sy, ey + 1):
|
||||||
|
yield f"{y:04d}"
|
||||||
|
|
||||||
Forces frequency=monthly so endpoints that also publish annual/quarterly
|
|
||||||
series (e.g. facility-fuel) don't return non-monthly period formats that
|
def iter_periods(frequency: str, start: str, end: str):
|
||||||
break iter_months. Routes through query_eia_api for retry/backoff coverage.
|
if frequency == "annual":
|
||||||
|
yield from iter_years(start, end)
|
||||||
|
else:
|
||||||
|
yield from iter_months(start, end)
|
||||||
|
|
||||||
|
|
||||||
|
def discover_period_range(endpoint: str, frequency: str = "monthly") -> tuple:
|
||||||
|
"""Return (earliest, latest) period strings for an endpoint.
|
||||||
|
|
||||||
|
Forces an explicit frequency so endpoints that publish multiple frequencies
|
||||||
|
(e.g. facility-fuel monthly+annual) return periods in the expected format.
|
||||||
|
Monthly endpoints get 'YYYY-MM'; annual endpoints (e.g. SEDS) get 'YYYY'.
|
||||||
|
Routes through query_eia_api for retry/backoff coverage.
|
||||||
"""
|
"""
|
||||||
def _one(direction: str) -> str:
|
def _one(direction: str) -> str:
|
||||||
data = query_eia_api(
|
data = query_eia_api(
|
||||||
@@ -150,7 +188,7 @@ def discover_period_range(endpoint: str) -> tuple:
|
|||||||
"sort[0][column]": "period",
|
"sort[0][column]": "period",
|
||||||
"sort[0][direction]": direction,
|
"sort[0][direction]": direction,
|
||||||
},
|
},
|
||||||
query_params={"frequency": "monthly"},
|
query_params={"frequency": frequency},
|
||||||
)
|
)
|
||||||
rows = (data or {}).get("response", {}).get("data", [])
|
rows = (data or {}).get("response", {}).get("data", [])
|
||||||
if not rows:
|
if not rows:
|
||||||
@@ -287,33 +325,35 @@ def fetch_eia_pages(
|
|||||||
offset += len(page_records)
|
offset += len(page_records)
|
||||||
|
|
||||||
|
|
||||||
def fetch_eia_pages_by_month(
|
def fetch_eia_pages_by_period(
|
||||||
endpoint: str,
|
endpoint: str,
|
||||||
|
frequency: str,
|
||||||
earliest: str,
|
earliest: str,
|
||||||
latest: str,
|
latest: str,
|
||||||
max_records: int = 0,
|
max_records: int = 0,
|
||||||
extra_data_fields: Optional[List[str]] = None,
|
extra_data_fields: Optional[List[str]] = None,
|
||||||
) -> Any:
|
) -> Any:
|
||||||
"""Yield pages across months, querying one month at a time.
|
"""Yield pages across periods, querying one period (month or year) at a time.
|
||||||
|
|
||||||
EIA's bulk endpoints serve large offsets slowly and return frequent 503s
|
EIA's bulk endpoints serve large offsets slowly and return frequent 503s
|
||||||
under sustained load. Filtering by &frequency=monthly&start=X&end=X keeps
|
under sustained load. Filtering by &frequency=F&start=X&end=X keeps each
|
||||||
each query small (~17k–28k rows per month for operating-generator-capacity)
|
query small (~17k–28k rows per month for operating-generator-capacity,
|
||||||
and dramatically reduces failure rate and wall time.
|
~40k rows per year for SEDS) and dramatically reduces failure rate and
|
||||||
|
wall time.
|
||||||
"""
|
"""
|
||||||
yielded = 0
|
yielded = 0
|
||||||
for month in iter_months(earliest, latest):
|
for period in iter_periods(frequency, earliest, latest):
|
||||||
if max_records > 0 and yielded >= max_records:
|
if max_records > 0 and yielded >= max_records:
|
||||||
return
|
return
|
||||||
remaining = max_records - yielded if max_records > 0 else 0
|
remaining = max_records - yielded if max_records > 0 else 0
|
||||||
month_params = {"frequency": "monthly", "start": month, "end": month}
|
period_params = {"frequency": frequency, "start": period, "end": period}
|
||||||
for page_records, used_extra_fields in fetch_eia_pages(
|
for page_records, used_extra_fields in fetch_eia_pages(
|
||||||
endpoint,
|
endpoint,
|
||||||
max_records=remaining,
|
max_records=remaining,
|
||||||
extra_data_fields=extra_data_fields,
|
extra_data_fields=extra_data_fields,
|
||||||
query_params=month_params,
|
query_params=period_params,
|
||||||
):
|
):
|
||||||
yield page_records, used_extra_fields, month
|
yield page_records, used_extra_fields, period
|
||||||
yielded += len(page_records)
|
yielded += len(page_records)
|
||||||
if max_records > 0 and yielded >= max_records:
|
if max_records > 0 and yielded >= max_records:
|
||||||
return
|
return
|
||||||
@@ -363,26 +403,28 @@ def import_layer_to_postgis(dataset: EIADataset, table_name: str, max_records: i
|
|||||||
conn = connect_db()
|
conn = connect_db()
|
||||||
try:
|
try:
|
||||||
extra_fields = EIA_DATASET_DATA_FIELDS.get(dataset.api_endpoint)
|
extra_fields = EIA_DATASET_DATA_FIELDS.get(dataset.api_endpoint)
|
||||||
|
frequency = endpoint_frequency(dataset.api_endpoint)
|
||||||
|
|
||||||
earliest, latest = discover_period_range(dataset.api_endpoint)
|
earliest, latest = discover_period_range(dataset.api_endpoint, frequency)
|
||||||
print(f" period range: {earliest} -> {latest}")
|
print(f" period range ({frequency}): {earliest} -> {latest}")
|
||||||
|
|
||||||
count = 0
|
count = 0
|
||||||
geo_count = 0
|
geo_count = 0
|
||||||
initialized = False
|
initialized = False
|
||||||
current_month: Optional[str] = None
|
current_period: Optional[str] = None
|
||||||
|
|
||||||
for page_records, used_extra_fields, month in fetch_eia_pages_by_month(
|
for page_records, used_extra_fields, period in fetch_eia_pages_by_period(
|
||||||
dataset.api_endpoint,
|
dataset.api_endpoint,
|
||||||
|
frequency=frequency,
|
||||||
earliest=earliest,
|
earliest=earliest,
|
||||||
latest=latest,
|
latest=latest,
|
||||||
max_records=max_records,
|
max_records=max_records,
|
||||||
extra_data_fields=extra_fields,
|
extra_data_fields=extra_fields,
|
||||||
):
|
):
|
||||||
if month != current_month:
|
if period != current_period:
|
||||||
if current_month is not None:
|
if current_period is not None:
|
||||||
print(f" progress: {count} rows ingested through {current_month}")
|
print(f" progress: {count} rows ingested through {current_period}")
|
||||||
current_month = month
|
current_period = period
|
||||||
if not initialized:
|
if not initialized:
|
||||||
with conn:
|
with conn:
|
||||||
with conn.cursor() as cur:
|
with conn.cursor() as cur:
|
||||||
@@ -780,7 +822,8 @@ def build_flat_tables(conn):
|
|||||||
where table_schema='public'
|
where table_schema='public'
|
||||||
and table_name in (
|
and table_name in (
|
||||||
'energy_eia_electricity_operating_generator_capacity',
|
'energy_eia_electricity_operating_generator_capacity',
|
||||||
'energy_eia_electricity_facility_fuel'
|
'energy_eia_electricity_facility_fuel',
|
||||||
|
'energy_eia_seds'
|
||||||
)
|
)
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
@@ -916,6 +959,49 @@ def build_flat_tables(conn):
|
|||||||
)
|
)
|
||||||
cur.execute("analyze public.energy_eia_facility_fuel_flat")
|
cur.execute("analyze public.energy_eia_facility_fuel_flat")
|
||||||
|
|
||||||
|
if "energy_eia_seds" in available:
|
||||||
|
# SEDS column mapping verified 2026-05-17 via length=5 probe of
|
||||||
|
# https://api.eia.gov/v2/seds/data/. Confirmed keys: period (YYYY),
|
||||||
|
# seriesId, seriesDescription, stateId, stateDescription, value, unit.
|
||||||
|
# No sector field — sector is encoded in seriesId.
|
||||||
|
cur.execute("drop table if exists public.energy_eia_seds_flat")
|
||||||
|
cur.execute(
|
||||||
|
r"""
|
||||||
|
create table public.energy_eia_seds_flat as
|
||||||
|
select
|
||||||
|
gid,
|
||||||
|
properties->>'period' as period,
|
||||||
|
case
|
||||||
|
when (properties->>'period') ~ '^[0-9]{4}$'
|
||||||
|
then (properties->>'period')::integer
|
||||||
|
end as year,
|
||||||
|
properties->>'seriesId' as series_id,
|
||||||
|
properties->>'seriesDescription' as series_description,
|
||||||
|
properties->>'stateId' as state_id,
|
||||||
|
properties->>'stateDescription' as state_name,
|
||||||
|
case
|
||||||
|
when (properties->>'value') ~ '^-?[0-9]+(\.[0-9]+)?$'
|
||||||
|
then (properties->>'value')::double precision
|
||||||
|
end as value,
|
||||||
|
properties->>'unit' as unit,
|
||||||
|
properties as raw_properties
|
||||||
|
from public.energy_eia_seds
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
cur.execute(
|
||||||
|
"create index energy_eia_seds_flat_state_id_idx "
|
||||||
|
"on public.energy_eia_seds_flat (state_id)"
|
||||||
|
)
|
||||||
|
cur.execute(
|
||||||
|
"create index energy_eia_seds_flat_series_id_idx "
|
||||||
|
"on public.energy_eia_seds_flat (series_id)"
|
||||||
|
)
|
||||||
|
cur.execute(
|
||||||
|
"create index energy_eia_seds_flat_year_idx "
|
||||||
|
"on public.energy_eia_seds_flat (year)"
|
||||||
|
)
|
||||||
|
cur.execute("analyze public.energy_eia_seds_flat")
|
||||||
|
|
||||||
|
|
||||||
def prune_stale_layer_versions(conn) -> int:
|
def prune_stale_layer_versions(conn) -> int:
|
||||||
"""Drop superseded EIA layer tables and remove stale catalog rows.
|
"""Drop superseded EIA layer tables and remove stale catalog rows.
|
||||||
@@ -1034,6 +1120,7 @@ def prune_unselected_layers(conn, selected_table_names: List[str]) -> int:
|
|||||||
FINAL_FLAT_TABLES = (
|
FINAL_FLAT_TABLES = (
|
||||||
"energy_eia_operating_generator_capacity_flat",
|
"energy_eia_operating_generator_capacity_flat",
|
||||||
"energy_eia_facility_fuel_flat",
|
"energy_eia_facility_fuel_flat",
|
||||||
|
"energy_eia_seds_flat",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -1076,7 +1163,7 @@ def parse_args():
|
|||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--category",
|
"--category",
|
||||||
choices=["power", "all"],
|
choices=["power", "state_energy", "all"],
|
||||||
default="power",
|
default="power",
|
||||||
help="Infrastructure category to ingest.",
|
help="Infrastructure category to ingest.",
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user