Add EIA SEDS ingestion support
This commit is contained in:
@@ -62,17 +62,39 @@ EIA_DATASETS = {
|
||||
"electricity/facility-fuel",
|
||||
],
|
||||
},
|
||||
"state_energy": {
|
||||
"category": "state_energy",
|
||||
"endpoints": [
|
||||
# State Energy Data System (SEDS): annual state-level production,
|
||||
# consumption, price, and expenditure across all energy sources.
|
||||
"seds",
|
||||
],
|
||||
},
|
||||
}
|
||||
|
||||
# Extra data fields (the EIA `data[N]=` query params) each endpoint needs.
|
||||
# operating-generator-capacity returns only id columns by default; latitude/longitude
|
||||
# must be requested explicitly. facility-fuel returns only id columns; generation
|
||||
# values must be requested explicitly.
|
||||
# values must be requested explicitly. seds returns only id columns; the numeric
|
||||
# value column must be requested explicitly.
|
||||
EIA_DATASET_DATA_FIELDS = {
|
||||
"electricity/operating-generator-capacity": ["latitude", "longitude"],
|
||||
"electricity/facility-fuel": ["generation", "gross-generation"],
|
||||
"seds": ["value"],
|
||||
}
|
||||
|
||||
# Frequency for each endpoint. Drives how period range is discovered and how
|
||||
# pagination iterates. Endpoints not listed default to "monthly".
|
||||
EIA_DATASET_FREQUENCY = {
|
||||
"electricity/operating-generator-capacity": "monthly",
|
||||
"electricity/facility-fuel": "monthly",
|
||||
"seds": "annual",
|
||||
}
|
||||
|
||||
|
||||
def endpoint_frequency(endpoint: str) -> str:
|
||||
return EIA_DATASET_FREQUENCY.get(endpoint, "monthly")
|
||||
|
||||
# Endpoints that do not reliably support retry with ad-hoc data[] field requests.
|
||||
EIA_NO_RETRY_EXTRA_FIELDS = {
|
||||
}
|
||||
@@ -135,12 +157,28 @@ def iter_months(start: str, end: str):
|
||||
y += 1
|
||||
|
||||
|
||||
def discover_period_range(endpoint: str) -> tuple:
|
||||
"""Return (earliest, latest) 'YYYY-MM' period strings for an endpoint.
|
||||
def iter_years(start: str, end: str):
|
||||
"""Yield 'YYYY' strings from start to end inclusive."""
|
||||
sy = int(start[:4])
|
||||
ey = int(end[:4])
|
||||
for y in range(sy, ey + 1):
|
||||
yield f"{y:04d}"
|
||||
|
||||
Forces frequency=monthly so endpoints that also publish annual/quarterly
|
||||
series (e.g. facility-fuel) don't return non-monthly period formats that
|
||||
break iter_months. Routes through query_eia_api for retry/backoff coverage.
|
||||
|
||||
def iter_periods(frequency: str, start: str, end: str):
|
||||
if frequency == "annual":
|
||||
yield from iter_years(start, end)
|
||||
else:
|
||||
yield from iter_months(start, end)
|
||||
|
||||
|
||||
def discover_period_range(endpoint: str, frequency: str = "monthly") -> tuple:
|
||||
"""Return (earliest, latest) period strings for an endpoint.
|
||||
|
||||
Forces an explicit frequency so endpoints that publish multiple frequencies
|
||||
(e.g. facility-fuel monthly+annual) return periods in the expected format.
|
||||
Monthly endpoints get 'YYYY-MM'; annual endpoints (e.g. SEDS) get 'YYYY'.
|
||||
Routes through query_eia_api for retry/backoff coverage.
|
||||
"""
|
||||
def _one(direction: str) -> str:
|
||||
data = query_eia_api(
|
||||
@@ -150,7 +188,7 @@ def discover_period_range(endpoint: str) -> tuple:
|
||||
"sort[0][column]": "period",
|
||||
"sort[0][direction]": direction,
|
||||
},
|
||||
query_params={"frequency": "monthly"},
|
||||
query_params={"frequency": frequency},
|
||||
)
|
||||
rows = (data or {}).get("response", {}).get("data", [])
|
||||
if not rows:
|
||||
@@ -287,33 +325,35 @@ def fetch_eia_pages(
|
||||
offset += len(page_records)
|
||||
|
||||
|
||||
def fetch_eia_pages_by_month(
|
||||
def fetch_eia_pages_by_period(
|
||||
endpoint: str,
|
||||
frequency: str,
|
||||
earliest: str,
|
||||
latest: str,
|
||||
max_records: int = 0,
|
||||
extra_data_fields: Optional[List[str]] = None,
|
||||
) -> Any:
|
||||
"""Yield pages across months, querying one month at a time.
|
||||
"""Yield pages across periods, querying one period (month or year) at a time.
|
||||
|
||||
EIA's bulk endpoints serve large offsets slowly and return frequent 503s
|
||||
under sustained load. Filtering by &frequency=monthly&start=X&end=X keeps
|
||||
each query small (~17k–28k rows per month for operating-generator-capacity)
|
||||
and dramatically reduces failure rate and wall time.
|
||||
under sustained load. Filtering by &frequency=F&start=X&end=X keeps each
|
||||
query small (~17k–28k rows per month for operating-generator-capacity,
|
||||
~40k rows per year for SEDS) and dramatically reduces failure rate and
|
||||
wall time.
|
||||
"""
|
||||
yielded = 0
|
||||
for month in iter_months(earliest, latest):
|
||||
for period in iter_periods(frequency, earliest, latest):
|
||||
if max_records > 0 and yielded >= max_records:
|
||||
return
|
||||
remaining = max_records - yielded if max_records > 0 else 0
|
||||
month_params = {"frequency": "monthly", "start": month, "end": month}
|
||||
period_params = {"frequency": frequency, "start": period, "end": period}
|
||||
for page_records, used_extra_fields in fetch_eia_pages(
|
||||
endpoint,
|
||||
max_records=remaining,
|
||||
extra_data_fields=extra_data_fields,
|
||||
query_params=month_params,
|
||||
query_params=period_params,
|
||||
):
|
||||
yield page_records, used_extra_fields, month
|
||||
yield page_records, used_extra_fields, period
|
||||
yielded += len(page_records)
|
||||
if max_records > 0 and yielded >= max_records:
|
||||
return
|
||||
@@ -363,26 +403,28 @@ def import_layer_to_postgis(dataset: EIADataset, table_name: str, max_records: i
|
||||
conn = connect_db()
|
||||
try:
|
||||
extra_fields = EIA_DATASET_DATA_FIELDS.get(dataset.api_endpoint)
|
||||
frequency = endpoint_frequency(dataset.api_endpoint)
|
||||
|
||||
earliest, latest = discover_period_range(dataset.api_endpoint)
|
||||
print(f" period range: {earliest} -> {latest}")
|
||||
earliest, latest = discover_period_range(dataset.api_endpoint, frequency)
|
||||
print(f" period range ({frequency}): {earliest} -> {latest}")
|
||||
|
||||
count = 0
|
||||
geo_count = 0
|
||||
initialized = False
|
||||
current_month: Optional[str] = None
|
||||
current_period: Optional[str] = None
|
||||
|
||||
for page_records, used_extra_fields, month in fetch_eia_pages_by_month(
|
||||
for page_records, used_extra_fields, period in fetch_eia_pages_by_period(
|
||||
dataset.api_endpoint,
|
||||
frequency=frequency,
|
||||
earliest=earliest,
|
||||
latest=latest,
|
||||
max_records=max_records,
|
||||
extra_data_fields=extra_fields,
|
||||
):
|
||||
if month != current_month:
|
||||
if current_month is not None:
|
||||
print(f" progress: {count} rows ingested through {current_month}")
|
||||
current_month = month
|
||||
if period != current_period:
|
||||
if current_period is not None:
|
||||
print(f" progress: {count} rows ingested through {current_period}")
|
||||
current_period = period
|
||||
if not initialized:
|
||||
with conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -780,7 +822,8 @@ def build_flat_tables(conn):
|
||||
where table_schema='public'
|
||||
and table_name in (
|
||||
'energy_eia_electricity_operating_generator_capacity',
|
||||
'energy_eia_electricity_facility_fuel'
|
||||
'energy_eia_electricity_facility_fuel',
|
||||
'energy_eia_seds'
|
||||
)
|
||||
"""
|
||||
)
|
||||
@@ -916,6 +959,49 @@ def build_flat_tables(conn):
|
||||
)
|
||||
cur.execute("analyze public.energy_eia_facility_fuel_flat")
|
||||
|
||||
if "energy_eia_seds" in available:
|
||||
# SEDS column mapping verified 2026-05-17 via length=5 probe of
|
||||
# https://api.eia.gov/v2/seds/data/. Confirmed keys: period (YYYY),
|
||||
# seriesId, seriesDescription, stateId, stateDescription, value, unit.
|
||||
# No sector field — sector is encoded in seriesId.
|
||||
cur.execute("drop table if exists public.energy_eia_seds_flat")
|
||||
cur.execute(
|
||||
r"""
|
||||
create table public.energy_eia_seds_flat as
|
||||
select
|
||||
gid,
|
||||
properties->>'period' as period,
|
||||
case
|
||||
when (properties->>'period') ~ '^[0-9]{4}$'
|
||||
then (properties->>'period')::integer
|
||||
end as year,
|
||||
properties->>'seriesId' as series_id,
|
||||
properties->>'seriesDescription' as series_description,
|
||||
properties->>'stateId' as state_id,
|
||||
properties->>'stateDescription' as state_name,
|
||||
case
|
||||
when (properties->>'value') ~ '^-?[0-9]+(\.[0-9]+)?$'
|
||||
then (properties->>'value')::double precision
|
||||
end as value,
|
||||
properties->>'unit' as unit,
|
||||
properties as raw_properties
|
||||
from public.energy_eia_seds
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"create index energy_eia_seds_flat_state_id_idx "
|
||||
"on public.energy_eia_seds_flat (state_id)"
|
||||
)
|
||||
cur.execute(
|
||||
"create index energy_eia_seds_flat_series_id_idx "
|
||||
"on public.energy_eia_seds_flat (series_id)"
|
||||
)
|
||||
cur.execute(
|
||||
"create index energy_eia_seds_flat_year_idx "
|
||||
"on public.energy_eia_seds_flat (year)"
|
||||
)
|
||||
cur.execute("analyze public.energy_eia_seds_flat")
|
||||
|
||||
|
||||
def prune_stale_layer_versions(conn) -> int:
|
||||
"""Drop superseded EIA layer tables and remove stale catalog rows.
|
||||
@@ -1034,6 +1120,7 @@ def prune_unselected_layers(conn, selected_table_names: List[str]) -> int:
|
||||
FINAL_FLAT_TABLES = (
|
||||
"energy_eia_operating_generator_capacity_flat",
|
||||
"energy_eia_facility_fuel_flat",
|
||||
"energy_eia_seds_flat",
|
||||
)
|
||||
|
||||
|
||||
@@ -1076,7 +1163,7 @@ def parse_args():
|
||||
)
|
||||
parser.add_argument(
|
||||
"--category",
|
||||
choices=["power", "all"],
|
||||
choices=["power", "state_energy", "all"],
|
||||
default="power",
|
||||
help="Infrastructure category to ingest.",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user