Add utility rate tracker loader

2026-05-17 18:52:55 -07:00
parent 48f23af5b0
commit 8fcbb18e37
3 changed files with 153 additions and 8 deletions
--- a/.dropboxignore
+++ b/.dropboxignore
@@ -3,3 +3,6 @@
 .venv/
 __pycache__/
 _pycache__/
+new/
+internet_cables/
+.claude/
--- a/.gitignore
+++ b/.gitignore
@@ -74,3 +74,7 @@ ENV/
 # OS files
 .DS_Store
 Thumbs.db
+
+# Local data directories (CSV, JSON, Excel, etc.)
+new/
+internet_cables/
--- a/postgis_table_loader.ipynb
+++ b/postgis_table_loader.ipynb
@@ -348,15 +348,139 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Example for the opposition cases table.\n",
-    "FILE_PATH = 'new/Opposition_Cases_Geocoded.csv'\n",
-    "TARGET_TABLE = 'public.opposition_cases_geocoded'\n",
+    "# Utility rate tracker loader: per-utility rows from the 'List of Utilities' sheet, tagged with state_code/state_id for state-level data-center analyses.\n",
+    "FILE_PATH = 'new/MarchUtilityRateTrackerTable-Downloadable-Excel.xlsx'\n",
+    "SHEET_NAME = 'List of Utilities'\n",
+    "TARGET_TABLE = 'public.utility_rate_tracker_2025_2028'\n",
    "IF_EXISTS = 'replace'  # replace | append | fail\n",
    "\n",
-    "# This file uses state abbreviations in the state column and includes lon/lat.\n",
-    "df = read_tabular(FILE_PATH)\n",
+    "STATE_NAME_TO_CODE = {\n",
+    "    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',\n",
+    "    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'District Of Columbia': 'DC',\n",
+    "    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL',\n",
+    "    'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',\n",
+    "    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',\n",
+    "    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',\n",
+    "    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',\n",
+    "    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',\n",
+    "    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',\n",
+    "    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN',\n",
+    "    'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',\n",
+    "    'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'\n",
+    "}\n",
+    "\n",
+    "\n",
+    "def parse_effective_date(value):\n",
+    "    if pd.isna(value):\n",
+    "        return pd.NaT\n",
+    "    text = str(value).strip()\n",
+    "    if not text or text.upper() == 'N/A':\n",
+    "        return pd.NaT\n",
+    "\n",
+    "    numeric = pd.to_numeric(text, errors='coerce')\n",
+    "    if pd.notna(numeric) and numeric >= 20000 and numeric <= 70000:\n",
+    "        return pd.to_datetime(numeric, unit='D', origin='1899-12-30', errors='coerce')\n",
+    "\n",
+    "    return pd.to_datetime(text, errors='coerce')\n",
+    "\n",
+    "\n",
+    "def detect_utility_header_row(raw_sheet: pd.DataFrame) -> int:\n",
+    "    required = {'utility provider', 'state', 'electric or gas bill'}\n",
+    "    max_scan = min(len(raw_sheet), 20)\n",
+    "    for i in range(max_scan):\n",
+    "        row_values = {\n",
+    "            str(v).strip().lower()\n",
+    "            for v in raw_sheet.iloc[i].tolist()\n",
+    "            if pd.notna(v) and str(v).strip()\n",
+    "        }\n",
+    "        if required.issubset(row_values):\n",
+    "            return i\n",
+    "    raise ValueError('Could not detect utility table header row in workbook')\n",
+    "\n",
+    "\n",
+    "def build_utility_rate_tracker_frame(xlsx_path: str) -> pd.DataFrame:\n",
+    "    # Read without a fixed header so we can detect the real header row robustly.\n",
+    "    raw_sheet = pd.read_excel(\n",
+    "        xlsx_path,\n",
+    "        sheet_name=SHEET_NAME,\n",
+    "        header=None,\n",
+    "        engine='openpyxl',\n",
+    "    )\n",
+    "    header_row = detect_utility_header_row(raw_sheet)\n",
+    "\n",
+    "    raw = pd.read_excel(\n",
+    "        xlsx_path,\n",
+    "        sheet_name=SHEET_NAME,\n",
+    "        header=header_row,\n",
+    "        engine='openpyxl',\n",
+    "    )\n",
+    "\n",
+    "    # Some files include leading blank/unnamed columns before the real table.\n",
+    "    raw = raw.loc[:, ~raw.columns.astype(str).str.startswith('Unnamed')]\n",
+    "\n",
+    "    source_to_target = {\n",
+    "        'Utility Provider': 'utility_provider',\n",
+    "        'State': 'state_name',\n",
+    "        'Electric or gas bill': 'service_type',\n",
+    "        '# of customers': 'customer_count',\n",
+    "        'Total revenue increase, 2025–2028': 'total_revenue_increase_2025_2028',\n",
+    "        'Time Period': 'time_period',\n",
+    "        'Monthly increase amount': 'monthly_increase_amount',\n",
+    "        'Monthly % increase': 'monthly_pct_increase',\n",
+    "        'Effective date': 'effective_date_raw',\n",
+    "        'Status': 'status',\n",
+    "    }\n",
+    "    missing = [c for c in source_to_target if c not in raw.columns]\n",
+    "    if missing:\n",
+    "        raise ValueError(f'Missing expected utility columns: {missing}')\n",
+    "\n",
+    "    raw = raw[list(source_to_target.keys())].rename(columns=source_to_target)\n",
+    "\n",
+    "    for col in ['utility_provider', 'state_name', 'service_type', 'time_period', 'status']:\n",
+    "        raw[col] = raw[col].astype(str).str.strip()\n",
+    "\n",
+    "    raw = raw[raw['utility_provider'].notna() & (raw['utility_provider'] != '')]\n",
+    "    raw = raw[raw['utility_provider'].str.lower() != 'nan']\n",
+    "\n",
+    "    raw['state_name'] = raw['state_name'].str.title()\n",
+    "    raw['state_code'] = raw['state_name'].map(STATE_NAME_TO_CODE)\n",
+    "    raw['state_id'] = raw['state_code'].map(STATE_FIPS)\n",
+    "\n",
+    "    raw['customer_count'] = pd.to_numeric(raw['customer_count'], errors='coerce')\n",
+    "    raw['total_revenue_increase_2025_2028'] = pd.to_numeric(raw['total_revenue_increase_2025_2028'], errors='coerce')\n",
+    "    raw['monthly_increase_amount'] = pd.to_numeric(raw['monthly_increase_amount'], errors='coerce')\n",
+    "\n",
+    "    pct = pd.to_numeric(raw['monthly_pct_increase'], errors='coerce')\n",
+    "    raw['monthly_pct_increase_ratio'] = pct.where((pct <= 1) | pct.isna(), pct / 100.0)\n",
+    "\n",
+    "    raw['effective_date'] = raw['effective_date_raw'].apply(parse_effective_date)\n",
+    "    raw['source_file'] = Path(xlsx_path).name\n",
+    "\n",
+    "    ordered_cols = [\n",
+    "        'utility_provider',\n",
+    "        'state_name',\n",
+    "        'state_code',\n",
+    "        'state_id',\n",
+    "        'service_type',\n",
+    "        'customer_count',\n",
+    "        'total_revenue_increase_2025_2028',\n",
+    "        'time_period',\n",
+    "        'monthly_increase_amount',\n",
+    "        'monthly_pct_increase_ratio',\n",
+    "        'effective_date',\n",
+    "        'effective_date_raw',\n",
+    "        'status',\n",
+    "        'source_file',\n",
+    "    ]\n",
+    "    return raw[ordered_cols]\n",
+    "\n",
+    "\n",
+    "df = build_utility_rate_tracker_frame(FILE_PATH)\n",
    "print('Rows:', len(df), 'Cols:', len(df.columns))\n",
-    "display(df[['id', 'state', 'state_id', 'lon', 'lat']].head(5))\n",
+    "print('States:', df['state_code'].nunique())\n",
+    "print('Service types:', sorted(df['service_type'].dropna().unique().tolist()))\n",
+    "print('Rows missing state_code:', int(df['state_code'].isna().sum()))\n",
+    "display(df.head(10))\n",
    "\n",
    "load_dataframe_to_postgis(df, TARGET_TABLE, if_exists=IF_EXISTS)"
   ]
@@ -368,8 +492,11 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Optional: this table has lon/lat columns, so build geometry with those names.\n",
-    "add_point_geometry(TARGET_TABLE, lon_col='lon', lat_col='lat', geom_col='geom', srid=4326)"
+    "# QA: show any state names that did not map to USPS abbreviations.\n",
+    "unmatched = df[df['state_code'].isna()][['state_name', 'utility_provider', 'service_type', 'time_period', 'status']].copy()\n",
+    "print('Unmatched rows:', len(unmatched))\n",
+    "if len(unmatched):\n",
+    "    display(unmatched.drop_duplicates().sort_values(['state_name', 'utility_provider']).reset_index(drop=True))"
   ]
  },
  {
@@ -378,6 +505,17 @@
   "id": "9",
   "metadata": {},
   "outputs": [],
+   "source": [
+    "# Optional: only applies to tables with lon/lat columns; the utility rate tracker table has none, so this stays commented out.\n",
+    "# add_point_geometry(TARGET_TABLE, lon_col='lon', lat_col='lat', geom_col='geom', srid=4326)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10",
+   "metadata": {},
+   "outputs": [],
   "source": [
    "# Quick sanity check: show row count and latest tables in public schema.\n",
    "with get_conn() as conn:\n",