Add utility rate tracker loader
This commit is contained in:
@@ -348,15 +348,139 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# Utility rate tracker loader: research-ready state-level table for data-center analyses.
# Exactly one set of loader settings is defined so FILE_PATH / TARGET_TABLE are never
# silently overwritten, and the workbook is only read by the dedicated loader below.
FILE_PATH = 'new/MarchUtilityRateTrackerTable-Downloadable-Excel.xlsx'
SHEET_NAME = 'List of Utilities'
TARGET_TABLE = 'public.utility_rate_tracker_2025_2028'
IF_EXISTS = 'replace'  # replace | append | fail

# Full state name -> two-letter state code.
# Keys are written in str.title() form (note 'District Of Columbia') because the
# loader title-cases the state column before looking names up in this mapping.
STATE_NAME_TO_CODE = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'District Of Columbia': 'DC',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL',
    'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN',
    'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',
    'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}
|
||||
"\n",
|
||||
"\n",
|
def parse_effective_date(value, *, min_serial: float = 20000, max_serial: float = 70000):
    """Parse one raw "Effective date" cell into a pandas timestamp.

    Args:
        value: Raw cell value (text, number, timestamp, or a missing value).
        min_serial: Smallest number treated as a spreadsheet day-count date.
        max_serial: Largest number treated as a spreadsheet day-count date.

    Returns:
        A ``pd.Timestamp`` when the value can be interpreted as a date,
        otherwise ``pd.NaT`` (missing, blank, 'N/A', or unparseable input).
    """
    if pd.isna(value):
        return pd.NaT

    text = str(value).strip()
    if not text or text.upper() == 'N/A':
        return pd.NaT

    # A bare number inside the serial window is read as a day count from
    # 1899-12-30 (with the default bounds that is roughly 1954 through 2091).
    # Numbers outside the window, e.g. a plain year, fall through to the
    # generic parser below.
    numeric = pd.to_numeric(text, errors='coerce')
    if pd.notna(numeric) and min_serial <= numeric <= max_serial:
        return pd.to_datetime(numeric, unit='D', origin='1899-12-30', errors='coerce')

    # Anything else is handed to pandas' parser; unparseable text becomes NaT.
    return pd.to_datetime(text, errors='coerce')
|
||||
"\n",
|
||||
"\n",
|
def detect_utility_header_row(raw_sheet: pd.DataFrame, max_scan_rows: int = 20) -> int:
    """Return the positional index of the row that holds the utility table header.

    A row counts as the header when its non-empty cells, compared after
    stripping whitespace and lower-casing, include all of 'utility provider',
    'state' and 'electric or gas bill'.

    Args:
        raw_sheet: Sheet contents read without a header row.
        max_scan_rows: How many leading rows to inspect before giving up.

    Returns:
        Zero-based position of the first matching row.

    Raises:
        ValueError: If no row within the scanned range contains every
            required header label.
    """
    required = {'utility provider', 'state', 'electric or gas bill'}
    rows_to_scan = min(len(raw_sheet), max_scan_rows)

    for position in range(rows_to_scan):
        # Normalise the row's cells so spacing/casing differences in the
        # workbook do not defeat the match; blank and missing cells are ignored.
        labels = {
            str(cell).strip().lower()
            for cell in raw_sheet.iloc[position].tolist()
            if pd.notna(cell) and str(cell).strip()
        }
        if required.issubset(labels):
            return position

    raise ValueError('Could not detect utility table header row in workbook')
|
||||
"\n",
|
||||
"\n",
|
def build_utility_rate_tracker_frame(xlsx_path: str) -> pd.DataFrame:
    """Read the utility rate tracker workbook and return a cleaned, typed frame.

    The sheet named by ``SHEET_NAME`` is read, its header row is located with
    ``detect_utility_header_row``, the expected columns are renamed to
    snake_case, text columns are trimmed, numeric columns are coerced, state
    names are mapped to codes/ids, and the effective date is parsed.

    Args:
        xlsx_path: Path to the ``.xlsx`` workbook.

    Returns:
        A DataFrame with one row per utility entry and the columns listed in
        ``ordered_cols`` below, in that order.

    Raises:
        ValueError: If the header row cannot be detected or an expected
            source column is missing from the sheet.
    """
    # Open the workbook once and parse it twice: first without a header so the
    # real header row can be detected, then again using that row as the header.
    with pd.ExcelFile(xlsx_path, engine='openpyxl') as workbook:
        raw_sheet = workbook.parse(sheet_name=SHEET_NAME, header=None)
        header_row = detect_utility_header_row(raw_sheet)
        raw = workbook.parse(sheet_name=SHEET_NAME, header=header_row)

    # Some files include leading blank/unnamed columns before the real table.
    raw = raw.loc[:, ~raw.columns.astype(str).str.startswith('Unnamed')]

    source_to_target = {
        'Utility Provider': 'utility_provider',
        'State': 'state_name',
        'Electric or gas bill': 'service_type',
        '# of customers': 'customer_count',
        'Total revenue increase, 2025–2028': 'total_revenue_increase_2025_2028',
        'Time Period': 'time_period',
        'Monthly increase amount': 'monthly_increase_amount',
        'Monthly % increase': 'monthly_pct_increase',
        'Effective date': 'effective_date_raw',
        'Status': 'status',
    }

    # Resolve headers ignoring case and surrounding whitespace, the same way
    # the header row is detected, so a stray space cannot cause a false
    # "missing column" error after detection has already succeeded.
    actual_by_key = {str(col).strip().lower(): col for col in raw.columns}
    resolved = {}
    missing = []
    for source_name, target_name in source_to_target.items():
        actual = actual_by_key.get(source_name.lower())
        if actual is None:
            missing.append(source_name)
        else:
            resolved[actual] = target_name
    if missing:
        raise ValueError(f'Missing expected utility columns: {missing}')

    raw = raw[list(resolved)].rename(columns=resolved)

    for col in ['utility_provider', 'state_name', 'service_type', 'time_period', 'status']:
        # Trim text but keep missing cells missing; a plain astype(str) would
        # turn them into the literal string 'nan' in the loaded table.
        values = raw[col].astype(object)
        raw[col] = values.where(values.isna(), values.astype(str).str.strip())

    # Drop rows without a provider (missing, blank, or the text 'nan').
    provider = raw['utility_provider']
    has_provider = provider.notna() & (provider != '') & (provider.str.lower() != 'nan')
    # .copy() so the column assignments below write to an independent frame.
    raw = raw[has_provider].copy()

    raw['state_name'] = raw['state_name'].str.title()
    raw['state_code'] = raw['state_name'].map(STATE_NAME_TO_CODE)
    # NOTE(review): STATE_FIPS is not defined in this cell; presumably it is
    # created earlier in the notebook and keyed by state code. Confirm.
    raw['state_id'] = raw['state_code'].map(STATE_FIPS)

    for col in ['customer_count', 'total_revenue_increase_2025_2028', 'monthly_increase_amount']:
        raw[col] = pd.to_numeric(raw[col], errors='coerce')

    # Heuristic: magnitudes up to 1 are taken to already be ratios; larger
    # magnitudes are taken to be percentage points and divided by 100. The
    # magnitude is used so negative percentage points are scaled as well.
    pct = pd.to_numeric(raw['monthly_pct_increase'], errors='coerce')
    raw['monthly_pct_increase_ratio'] = pct.where((pct.abs() <= 1) | pct.isna(), pct / 100.0)

    raw['effective_date'] = raw['effective_date_raw'].apply(parse_effective_date)
    raw['source_file'] = Path(xlsx_path).name

    ordered_cols = [
        'utility_provider',
        'state_name',
        'state_code',
        'state_id',
        'service_type',
        'customer_count',
        'total_revenue_increase_2025_2028',
        'time_period',
        'monthly_increase_amount',
        'monthly_pct_increase_ratio',
        'effective_date',
        'effective_date_raw',
        'status',
        'source_file',
    ]
    return raw[ordered_cols]
|
||||
"\n",
|
||||
"\n",
|
# Build the cleaned frame, print quick QA summaries, then load it into the target table.
df = build_utility_rate_tracker_frame(FILE_PATH)
print('Rows:', len(df), 'Cols:', len(df.columns))
# Only columns produced by build_utility_rate_tracker_frame are referenced here;
# the frame has no 'id', 'state', 'lon' or 'lat' columns to preview.
print('States:', df['state_code'].nunique())
print('Service types:', sorted(df['service_type'].dropna().unique().tolist()))
print('Rows missing state_code:', int(df['state_code'].isna().sum()))
display(df.head(10))

load_dataframe_to_postgis(df, TARGET_TABLE, if_exists=IF_EXISTS)
|
||||
]
|
||||
@@ -368,8 +492,11 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# QA: show any state names that did not map to USPS abbreviations.
# (No point geometry is built in this cell: the loaded frame has no lon/lat columns.)
qa_columns = ['state_name', 'utility_provider', 'service_type', 'time_period', 'status']
unmatched = df[df['state_code'].isna()][qa_columns].copy()
print('Unmatched rows:', len(unmatched))
if len(unmatched):
    # De-duplicate and sort so repeated entries for the same utility show once.
    display(
        unmatched.drop_duplicates()
        .sort_values(['state_name', 'utility_provider'])
        .reset_index(drop=True)
    )
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -378,6 +505,17 @@
|
||||
"id": "9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Optional: only applies when the loaded table has lon/lat columns (the utility rate tracker frame has none).\n",
|
||||
"# add_point_geometry(TARGET_TABLE, lon_col='lon', lat_col='lat', geom_col='geom', srid=4326)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "10",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Quick sanity check: show row count and latest tables in public schema.\n",
|
||||
"with get_conn() as conn:\n",
|
||||
|
||||
Reference in New Issue
Block a user