Add EIA SEDS ingestion support

2026-05-17 18:52:29 -07:00
parent 614b10b43f
commit 48f23af5b0
2 changed files with 152 additions and 26 deletions
--- a/.claude/MONDAY_CHECKLIST.md
+++ b/.claude/MONDAY_CHECKLIST.md
@@ -115,3 +115,42 @@ Update [`output/facility_fuel_pending_narrative.txt`](../output/facility_fuel_pe
 once facility-fuel is actually ingested. Replace the "Pending" framing with
 the real row count, period range, and any column-mapping notes from Step 3.
 Mirror the format of `operating_generator_capacity_sample.txt`.
+
+## New endpoint added — SEDS (State Energy Data System)
+
+Wired up 2026-05-17. Endpoint: `seds` (annual frequency,
+`https://api.eia.gov/v2/seds/data/`). Probed live, columns verified, smoke
+test of 50 rows landed in `public.energy_eia_seds_flat` with all typed
+columns populated.
+
+**Verified JSON keys** (no sector field — sector is encoded in `seriesId`):
+`period` (YYYY), `seriesId`, `seriesDescription`, `stateId`,
+`stateDescription`, `value`, `unit`.
+
+**Total volume:** ~2.57M rows across 65 years (1960–2024), ~40k rows/year.
+Ingested one year per query via the generalized `fetch_eia_pages_by_period`, which
+keeps each request small enough to avoid EIA 503s (same pattern as the monthly endpoints).
+
+**What to verify Monday:**
+
+```sql
+-- Expected: count ~2.57M rows, min(year) = 1960, max(year) = 2024
+select count(*), min(year), max(year)
+from public.energy_eia_seds_flat;
+
+-- Spot-check 5 random rows: typed columns (year, value) should not all be NULL
+select period, year, series_id, state_id, value, unit
+from public.energy_eia_seds_flat
+order by random()
+limit 5;
+```
+
+If the row count is well below ~2.5M, suspect a mid-run failure — check the log
+for `503` errors on the `seds` endpoint and re-run with
+`python3 ingest_eia_energy_layers.py --category state_energy --endpoint seds`.
+
+**Product/API docs for reference:**
+
+- Product page: https://www.eia.gov/state/seds/
+- Technical notes: https://www.eia.gov/state/seds/seds-technical-notes-complete.php
+- API documentation: https://www.eia.gov/opendata/documentation.php
--- a/ingest_eia_energy_layers.py
+++ b/ingest_eia_energy_layers.py
@@ -62,17 +62,39 @@ EIA_DATASETS = {
            "electricity/facility-fuel",
        ],
    },
+    "state_energy": {
+        "category": "state_energy",
+        "endpoints": [
+            # State Energy Data System (SEDS): annual state-level production,
+            # consumption, price, and expenditure across all energy sources.
+            "seds",
+        ],
+    },
 }

 # Extra data fields (the EIA `data[N]=` query params) each endpoint needs.
 # operating-generator-capacity returns only id columns by default; latitude/longitude
 # must be requested explicitly. facility-fuel returns only id columns; generation
-# values must be requested explicitly.
+# values must be requested explicitly. seds returns only id columns; the numeric
+# value column must be requested explicitly.
 EIA_DATASET_DATA_FIELDS = {
    "electricity/operating-generator-capacity": ["latitude", "longitude"],
    "electricity/facility-fuel": ["generation", "gross-generation"],
+    "seds": ["value"],
 }

+# Frequency per endpoint; drives period-range discovery and pagination. Unlisted
+# endpoints default to "monthly"; iter_periods treats any non-"annual" value as monthly.
+EIA_DATASET_FREQUENCY = {
+    "electricity/operating-generator-capacity": "monthly",
+    "electricity/facility-fuel": "monthly",
+    "seds": "annual",
+}
+
+
+def endpoint_frequency(endpoint: str) -> str:
+    return EIA_DATASET_FREQUENCY.get(endpoint, "monthly")
+
 # Endpoints that do not reliably support retry with ad-hoc data[] field requests.
 EIA_NO_RETRY_EXTRA_FIELDS = {
 }
@@ -135,12 +157,28 @@ def iter_months(start: str, end: str):
            y += 1


-def discover_period_range(endpoint: str) -> tuple:
-    """Return (earliest, latest) 'YYYY-MM' period strings for an endpoint.
+def iter_years(start: str, end: str):
+    """Yield 'YYYY' strings from start to end inclusive."""
+    sy = int(start[:4])
+    ey = int(end[:4])
+    for y in range(sy, ey + 1):
+        yield f"{y:04d}"

-    Forces frequency=monthly so endpoints that also publish annual/quarterly
-    series (e.g. facility-fuel) don't return non-monthly period formats that
-    break iter_months. Routes through query_eia_api for retry/backoff coverage.
+
+def iter_periods(frequency: str, start: str, end: str):
+    if frequency == "annual":
+        yield from iter_years(start, end)
+    else:
+        yield from iter_months(start, end)
+
+
+def discover_period_range(endpoint: str, frequency: str = "monthly") -> tuple:
+    """Return (earliest, latest) period strings for an endpoint.
+
+    Forces an explicit frequency so endpoints that publish multiple frequencies
+    (e.g. facility-fuel monthly+annual) return periods in the expected format.
+    Monthly endpoints get 'YYYY-MM'; annual endpoints (e.g. SEDS) get 'YYYY'.
+    Routes through query_eia_api for retry/backoff coverage.
    """
    def _one(direction: str) -> str:
        data = query_eia_api(
@@ -150,7 +188,7 @@ def discover_period_range(endpoint: str) -> tuple:
                "sort[0][column]": "period",
                "sort[0][direction]": direction,
            },
-            query_params={"frequency": "monthly"},
+            query_params={"frequency": frequency},
        )
        rows = (data or {}).get("response", {}).get("data", [])
        if not rows:
@@ -287,33 +325,35 @@ def fetch_eia_pages(
        offset += len(page_records)


-def fetch_eia_pages_by_month(
+def fetch_eia_pages_by_period(
    endpoint: str,
+    frequency: str,
    earliest: str,
    latest: str,
    max_records: int = 0,
    extra_data_fields: Optional[List[str]] = None,
 ) -> Any:
-    """Yield pages across months, querying one month at a time.
+    """Yield pages across periods, querying one period (month or year) at a time.

    EIA's bulk endpoints serve large offsets slowly and return frequent 503s
-    under sustained load. Filtering by &frequency=monthly&start=X&end=X keeps
-    each query small (~17k–28k rows per month for operating-generator-capacity)
-    and dramatically reduces failure rate and wall time.
+    under sustained load. Filtering by &frequency=F&start=X&end=X keeps each
+    query small (~17k–28k rows per month for operating-generator-capacity,
+    ~40k rows per year for SEDS) and dramatically reduces failure rate and
+    wall time.
    """
    yielded = 0
-    for month in iter_months(earliest, latest):
+    for period in iter_periods(frequency, earliest, latest):
        if max_records > 0 and yielded >= max_records:
            return
        remaining = max_records - yielded if max_records > 0 else 0
-        month_params = {"frequency": "monthly", "start": month, "end": month}
+        period_params = {"frequency": frequency, "start": period, "end": period}
        for page_records, used_extra_fields in fetch_eia_pages(
            endpoint,
            max_records=remaining,
            extra_data_fields=extra_data_fields,
-            query_params=month_params,
+            query_params=period_params,
        ):
-            yield page_records, used_extra_fields, month
+            yield page_records, used_extra_fields, period
            yielded += len(page_records)
            if max_records > 0 and yielded >= max_records:
                return
@@ -363,26 +403,28 @@ def import_layer_to_postgis(dataset: EIADataset, table_name: str, max_records: i
    conn = connect_db()
    try:
        extra_fields = EIA_DATASET_DATA_FIELDS.get(dataset.api_endpoint)
+        frequency = endpoint_frequency(dataset.api_endpoint)

-        earliest, latest = discover_period_range(dataset.api_endpoint)
-        print(f"  period range: {earliest} -> {latest}")
+        earliest, latest = discover_period_range(dataset.api_endpoint, frequency)
+        print(f"  period range ({frequency}): {earliest} -> {latest}")

        count = 0
        geo_count = 0
        initialized = False
-        current_month: Optional[str] = None
+        current_period: Optional[str] = None

-        for page_records, used_extra_fields, month in fetch_eia_pages_by_month(
+        for page_records, used_extra_fields, period in fetch_eia_pages_by_period(
            dataset.api_endpoint,
+            frequency=frequency,
            earliest=earliest,
            latest=latest,
            max_records=max_records,
            extra_data_fields=extra_fields,
        ):
-            if month != current_month:
-                if current_month is not None:
-                    print(f"  progress: {count} rows ingested through {current_month}")
-                current_month = month
+            if period != current_period:
+                if current_period is not None:
+                    print(f"  progress: {count} rows ingested through {current_period}")
+                current_period = period
            if not initialized:
                with conn:
                    with conn.cursor() as cur:
@@ -780,7 +822,8 @@ def build_flat_tables(conn):
            where table_schema='public'
              and table_name in (
                'energy_eia_electricity_operating_generator_capacity',
-                'energy_eia_electricity_facility_fuel'
+                'energy_eia_electricity_facility_fuel',
+                'energy_eia_seds'
              )
            """
        )
@@ -916,6 +959,49 @@ def build_flat_tables(conn):
                )
                cur.execute("analyze public.energy_eia_facility_fuel_flat")

+            if "energy_eia_seds" in available:
+                # SEDS column mapping verified 2026-05-17 via length=5 probe of
+                # https://api.eia.gov/v2/seds/data/. Confirmed keys: period (YYYY),
+                # seriesId, seriesDescription, stateId, stateDescription, value, unit.
+                # No sector field — sector is encoded in seriesId.
+                cur.execute("drop table if exists public.energy_eia_seds_flat")
+                cur.execute(
+                    r"""
+                    create table public.energy_eia_seds_flat as
+                    select
+                        gid,
+                        properties->>'period' as period,
+                        case
+                            when (properties->>'period') ~ '^[0-9]{4}$'
+                            then (properties->>'period')::integer
+                        end as year,
+                        properties->>'seriesId' as series_id,
+                        properties->>'seriesDescription' as series_description,
+                        properties->>'stateId' as state_id,
+                        properties->>'stateDescription' as state_name,
+                        case
+                            when (properties->>'value') ~ '^-?[0-9]+(\.[0-9]+)?$'
+                            then (properties->>'value')::double precision
+                        end as value,
+                        properties->>'unit' as unit,
+                        properties as raw_properties
+                    from public.energy_eia_seds
+                    """
+                )
+                cur.execute(
+                    "create index energy_eia_seds_flat_state_id_idx "
+                    "on public.energy_eia_seds_flat (state_id)"
+                )
+                cur.execute(
+                    "create index energy_eia_seds_flat_series_id_idx "
+                    "on public.energy_eia_seds_flat (series_id)"
+                )
+                cur.execute(
+                    "create index energy_eia_seds_flat_year_idx "
+                    "on public.energy_eia_seds_flat (year)"
+                )
+                cur.execute("analyze public.energy_eia_seds_flat")
+

 def prune_stale_layer_versions(conn) -> int:
    """Drop superseded EIA layer tables and remove stale catalog rows.
@@ -1034,6 +1120,7 @@ def prune_unselected_layers(conn, selected_table_names: List[str]) -> int:
 FINAL_FLAT_TABLES = (
    "energy_eia_operating_generator_capacity_flat",
    "energy_eia_facility_fuel_flat",
+    "energy_eia_seds_flat",
 )


@@ -1076,7 +1163,7 @@ def parse_args():
    )
    parser.add_argument(
        "--category",
-        choices=["power", "all"],
+        choices=["power", "state_energy", "all"],
        default="power",
        help="Infrastructure category to ingest.",
    )