Add utility rate tracker loader

This commit is contained in:
2026-05-17 18:52:55 -07:00
parent 48f23af5b0
commit 8fcbb18e37
3 changed files with 153 additions and 8 deletions

View File

@@ -3,3 +3,6 @@
.venv/
__pycache__/
_pycache__/
new/
internet_cables/
.claude/

4
.gitignore vendored
View File

@@ -74,3 +74,7 @@ ENV/
# OS files
.DS_Store
Thumbs.db
# Data csv, json, etc.
new/
internet_cables/

View File

@@ -348,15 +348,139 @@
"metadata": {},
"outputs": [],
"source": [
"# Example for the opposition cases table.\n",
"FILE_PATH = 'new/Opposition_Cases_Geocoded.csv'\n",
"TARGET_TABLE = 'public.opposition_cases_geocoded'\n",
"# Utility rate tracker loader: research-ready state-level table for data-center analyses.\n",
"FILE_PATH = 'new/MarchUtilityRateTrackerTable-Downloadable-Excel.xlsx'\n",
"SHEET_NAME = 'List of Utilities'\n",
"TARGET_TABLE = 'public.utility_rate_tracker_2025_2028'\n",
"IF_EXISTS = 'replace' # replace | append | fail\n",
"\n",
"# This file uses state abbreviations in the state column and includes lon/lat.\n",
"df = read_tabular(FILE_PATH)\n",
"STATE_NAME_TO_CODE = {\n",
" 'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',\n",
" 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'District Of Columbia': 'DC',\n",
" 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL',\n",
" 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',\n",
" 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',\n",
" 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',\n",
" 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',\n",
" 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',\n",
" 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',\n",
" 'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN',\n",
" 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',\n",
" 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'\n",
"}\n",
"\n",
"\n",
"def parse_effective_date(value):\n",
" if pd.isna(value):\n",
" return pd.NaT\n",
" text = str(value).strip()\n",
" if not text or text.upper() == 'N/A':\n",
" return pd.NaT\n",
"\n",
" numeric = pd.to_numeric(text, errors='coerce')\n",
" if pd.notna(numeric) and numeric >= 20000 and numeric <= 70000:\n",
" return pd.to_datetime(numeric, unit='D', origin='1899-12-30', errors='coerce')\n",
"\n",
" return pd.to_datetime(text, errors='coerce')\n",
"\n",
"\n",
"def detect_utility_header_row(raw_sheet: pd.DataFrame) -> int:\n",
" required = {'utility provider', 'state', 'electric or gas bill'}\n",
" max_scan = min(len(raw_sheet), 20)\n",
" for i in range(max_scan):\n",
" row_values = {\n",
" str(v).strip().lower()\n",
" for v in raw_sheet.iloc[i].tolist()\n",
" if pd.notna(v) and str(v).strip()\n",
" }\n",
" if required.issubset(row_values):\n",
" return i\n",
" raise ValueError('Could not detect utility table header row in workbook')\n",
"\n",
"\n",
"def build_utility_rate_tracker_frame(xlsx_path: str) -> pd.DataFrame:\n",
" # Read without a fixed header so we can detect the real header row robustly.\n",
" raw_sheet = pd.read_excel(\n",
" xlsx_path,\n",
" sheet_name=SHEET_NAME,\n",
" header=None,\n",
" engine='openpyxl',\n",
" )\n",
" header_row = detect_utility_header_row(raw_sheet)\n",
"\n",
" raw = pd.read_excel(\n",
" xlsx_path,\n",
" sheet_name=SHEET_NAME,\n",
" header=header_row,\n",
" engine='openpyxl',\n",
" )\n",
"\n",
" # Some files include leading blank/unnamed columns before the real table.\n",
" raw = raw.loc[:, ~raw.columns.astype(str).str.startswith('Unnamed')]\n",
"\n",
" source_to_target = {\n",
" 'Utility Provider': 'utility_provider',\n",
" 'State': 'state_name',\n",
" 'Electric or gas bill': 'service_type',\n",
" '# of customers': 'customer_count',\n",
" 'Total revenue increase, 20252028': 'total_revenue_increase_2025_2028',\n",
" 'Time Period': 'time_period',\n",
" 'Monthly increase amount': 'monthly_increase_amount',\n",
" 'Monthly % increase': 'monthly_pct_increase',\n",
" 'Effective date': 'effective_date_raw',\n",
" 'Status': 'status',\n",
" }\n",
" missing = [c for c in source_to_target if c not in raw.columns]\n",
" if missing:\n",
" raise ValueError(f'Missing expected utility columns: {missing}')\n",
"\n",
" raw = raw[list(source_to_target.keys())].rename(columns=source_to_target)\n",
"\n",
" for col in ['utility_provider', 'state_name', 'service_type', 'time_period', 'status']:\n",
" raw[col] = raw[col].astype(str).str.strip()\n",
"\n",
" raw = raw[raw['utility_provider'].notna() & (raw['utility_provider'] != '')]\n",
" raw = raw[raw['utility_provider'].str.lower() != 'nan']\n",
"\n",
" raw['state_name'] = raw['state_name'].str.title()\n",
" raw['state_code'] = raw['state_name'].map(STATE_NAME_TO_CODE)\n",
" raw['state_id'] = raw['state_code'].map(STATE_FIPS)\n",
"\n",
" raw['customer_count'] = pd.to_numeric(raw['customer_count'], errors='coerce')\n",
" raw['total_revenue_increase_2025_2028'] = pd.to_numeric(raw['total_revenue_increase_2025_2028'], errors='coerce')\n",
" raw['monthly_increase_amount'] = pd.to_numeric(raw['monthly_increase_amount'], errors='coerce')\n",
"\n",
" pct = pd.to_numeric(raw['monthly_pct_increase'], errors='coerce')\n",
" raw['monthly_pct_increase_ratio'] = pct.where((pct <= 1) | pct.isna(), pct / 100.0)\n",
"\n",
" raw['effective_date'] = raw['effective_date_raw'].apply(parse_effective_date)\n",
" raw['source_file'] = Path(xlsx_path).name\n",
"\n",
" ordered_cols = [\n",
" 'utility_provider',\n",
" 'state_name',\n",
" 'state_code',\n",
" 'state_id',\n",
" 'service_type',\n",
" 'customer_count',\n",
" 'total_revenue_increase_2025_2028',\n",
" 'time_period',\n",
" 'monthly_increase_amount',\n",
" 'monthly_pct_increase_ratio',\n",
" 'effective_date',\n",
" 'effective_date_raw',\n",
" 'status',\n",
" 'source_file',\n",
" ]\n",
" return raw[ordered_cols]\n",
"\n",
"\n",
"df = build_utility_rate_tracker_frame(FILE_PATH)\n",
"print('Rows:', len(df), 'Cols:', len(df.columns))\n",
"display(df[['id', 'state', 'state_id', 'lon', 'lat']].head(5))\n",
"print('States:', df['state_code'].nunique())\n",
"print('Service types:', sorted(df['service_type'].dropna().unique().tolist()))\n",
"print('Rows missing state_code:', int(df['state_code'].isna().sum()))\n",
"display(df.head(10))\n",
"\n",
"load_dataframe_to_postgis(df, TARGET_TABLE, if_exists=IF_EXISTS)"
]
@@ -368,8 +492,11 @@
"metadata": {},
"outputs": [],
"source": [
"# Optional: this table has lon/lat columns, so build geometry with those names.\n",
"add_point_geometry(TARGET_TABLE, lon_col='lon', lat_col='lat', geom_col='geom', srid=4326)"
"# QA: show any state names that did not map to USPS abbreviations.\n",
"unmatched = df[df['state_code'].isna()][['state_name', 'utility_provider', 'service_type', 'time_period', 'status']].copy()\n",
"print('Unmatched rows:', len(unmatched))\n",
"if len(unmatched):\n",
" display(unmatched.drop_duplicates().sort_values(['state_name', 'utility_provider']).reset_index(drop=True))"
]
},
{
@@ -378,6 +505,17 @@
"id": "9",
"metadata": {},
"outputs": [],
"source": [
"# Optional: this table has lon/lat columns, so build geometry with those names.\n",
"# add_point_geometry(TARGET_TABLE, lon_col='lon', lat_col='lat', geom_col='geom', srid=4326)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10",
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check: show row count and latest tables in public schema.\n",
"with get_conn() as conn:\n",