From 8fcbb18e37d57efa98e8b274d507f10b41f9307b Mon Sep 17 00:00:00 2001
From: dadams
Date: Sun, 17 May 2026 18:52:55 -0700
Subject: [PATCH] Add utility rate tracker loader

---
 .dropboxignore             |   3 +
 .gitignore                 |   4 +
 postgis_table_loader.ipynb | 169 +++++++++++++++++++++++++++++++++++--
 3 files changed, 168 insertions(+), 8 deletions(-)

diff --git a/.dropboxignore b/.dropboxignore
index c912509..84e5620 100644
--- a/.dropboxignore
+++ b/.dropboxignore
@@ -3,3 +3,6 @@
 .venv/
 __pycache__/
 _pycache__/
+new/
+internet_cables/
+.claude/
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 2b964dd..b809d00 100644
--- a/.gitignore
+++ b/.gitignore
@@ -74,3 +74,7 @@ ENV/
 # OS files
 .DS_Store
 Thumbs.db
+
+# Data csv, json, etc.
+new/
+internet_cables/
\ No newline at end of file
diff --git a/postgis_table_loader.ipynb b/postgis_table_loader.ipynb
index 5e78491..5ef7f99 100644
--- a/postgis_table_loader.ipynb
+++ b/postgis_table_loader.ipynb
@@ -348,15 +348,154 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Example for the opposition cases table.\n",
-    "FILE_PATH = 'new/Opposition_Cases_Geocoded.csv'\n",
-    "TARGET_TABLE = 'public.opposition_cases_geocoded'\n",
+    "# Utility rate tracker loader: research-ready state-level table for data-center analyses.\n",
+    "FILE_PATH = 'new/MarchUtilityRateTrackerTable-Downloadable-Excel.xlsx'\n",
+    "SHEET_NAME = 'List of Utilities'\n",
+    "TARGET_TABLE = 'public.utility_rate_tracker_2025_2028'\n",
     "IF_EXISTS = 'replace' # replace | append | fail\n",
     "\n",
-    "# This file uses state abbreviations in the state column and includes lon/lat.\n",
-    "df = read_tabular(FILE_PATH)\n",
+    "STATE_NAME_TO_CODE = {\n",
+    "    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',\n",
+    "    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'District Of Columbia': 'DC',\n",
+    "    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL',\n",
+    "    'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',\n",
+    "    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',\n",
+    "    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',\n",
+    "    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',\n",
+    "    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',\n",
+    "    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',\n",
+    "    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN',\n",
+    "    'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',\n",
+    "    'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'\n",
+    "}\n",
+    "\n",
+    "\n",
+    "def parse_effective_date(value):\n",
+    "    \"\"\"Parse one raw 'Effective date' cell; return NaT when blank, 'N/A', or unparseable.\"\"\"\n",
+    "    if pd.isna(value):\n",
+    "        return pd.NaT\n",
+    "    text = str(value).strip()\n",
+    "    if not text or text.upper() == 'N/A':\n",
+    "        return pd.NaT\n",
+    "\n",
+    "    # Bare numbers in this range are treated as Excel serial day numbers (days since 1899-12-30).\n",
+    "    numeric = pd.to_numeric(text, errors='coerce')\n",
+    "    if pd.notna(numeric) and 20000 <= numeric <= 70000:\n",
+    "        return pd.to_datetime(numeric, unit='D', origin='1899-12-30', errors='coerce')\n",
+    "\n",
+    "    return pd.to_datetime(text, errors='coerce')\n",
+    "\n",
+    "\n",
+    "def detect_utility_header_row(raw_sheet: pd.DataFrame) -> int:\n",
+    "    \"\"\"Return the 0-based position of the first row, among the first 20, holding all required headers.\n",
+    "\n",
+    "    Matching ignores case and surrounding whitespace. Raises ValueError when no scanned row qualifies.\n",
+    "    \"\"\"\n",
+    "    required = {'utility provider', 'state', 'electric or gas bill'}\n",
+    "    max_scan = min(len(raw_sheet), 20)\n",
+    "    for i in range(max_scan):\n",
+    "        row_values = {\n",
+    "            str(v).strip().lower()\n",
+    "            for v in raw_sheet.iloc[i].tolist()\n",
+    "            if pd.notna(v) and str(v).strip()\n",
+    "        }\n",
+    "        if required.issubset(row_values):\n",
+    "            return i\n",
+    "    raise ValueError('Could not detect utility table header row in workbook')\n",
+    "\n",
+    "\n",
+    "def build_utility_rate_tracker_frame(xlsx_path: str) -> pd.DataFrame:\n",
+    "    \"\"\"Read the SHEET_NAME sheet of xlsx_path and return the cleaned utility rate tracker rows.\n",
+    "\n",
+    "    Raises ValueError when the header row cannot be found or an expected column is missing.\n",
+    "    \"\"\"\n",
+    "    # Read without a fixed header so we can detect the real header row robustly.\n",
+    "    raw_sheet = pd.read_excel(\n",
+    "        xlsx_path,\n",
+    "        sheet_name=SHEET_NAME,\n",
+    "        header=None,\n",
+    "        engine='openpyxl',\n",
+    "    )\n",
+    "    header_row = detect_utility_header_row(raw_sheet)\n",
+    "\n",
+    "    raw = pd.read_excel(\n",
+    "        xlsx_path,\n",
+    "        sheet_name=SHEET_NAME,\n",
+    "        header=header_row,\n",
+    "        engine='openpyxl',\n",
+    "    )\n",
+    "\n",
+    "    # Header detection ignores surrounding whitespace, so strip it before exact-name selection too.\n",
+    "    raw.columns = raw.columns.astype(str).str.strip()\n",
+    "\n",
+    "    # Some files include leading blank/unnamed columns before the real table.\n",
+    "    raw = raw.loc[:, ~raw.columns.str.startswith('Unnamed')]\n",
+    "\n",
+    "    source_to_target = {\n",
+    "        'Utility Provider': 'utility_provider',\n",
+    "        'State': 'state_name',\n",
+    "        'Electric or gas bill': 'service_type',\n",
+    "        '# of customers': 'customer_count',\n",
+    "        'Total revenue increase, 2025–2028': 'total_revenue_increase_2025_2028',\n",
+    "        'Time Period': 'time_period',\n",
+    "        'Monthly increase amount': 'monthly_increase_amount',\n",
+    "        'Monthly % increase': 'monthly_pct_increase',\n",
+    "        'Effective date': 'effective_date_raw',\n",
+    "        'Status': 'status',\n",
+    "    }\n",
+    "    missing = [c for c in source_to_target if c not in raw.columns]\n",
+    "    if missing:\n",
+    "        raise ValueError(f'Missing expected utility columns: {missing}')\n",
+    "\n",
+    "    raw = raw[list(source_to_target.keys())].rename(columns=source_to_target)\n",
+    "\n",
+    "    for col in ['utility_provider', 'state_name', 'service_type', 'time_period', 'status']:\n",
+    "        # Strip only real values: astype(str) on a blank cell would store the literal text 'nan'.\n",
+    "        raw[col] = raw[col].where(raw[col].isna(), raw[col].astype(str).str.strip())\n",
+    "        raw[col] = raw[col].mask(raw[col] == '')\n",
+    "\n",
+    "    raw = raw[raw['utility_provider'].notna()].copy()\n",
+    "\n",
+    "    raw['state_name'] = raw['state_name'].str.title()\n",
+    "    raw['state_code'] = raw['state_name'].map(STATE_NAME_TO_CODE)\n",
+    "    raw['state_id'] = raw['state_code'].map(STATE_FIPS)\n",
+    "\n",
+    "    raw['customer_count'] = pd.to_numeric(raw['customer_count'], errors='coerce')\n",
+    "    raw['total_revenue_increase_2025_2028'] = pd.to_numeric(raw['total_revenue_increase_2025_2028'], errors='coerce')\n",
+    "    raw['monthly_increase_amount'] = pd.to_numeric(raw['monthly_increase_amount'], errors='coerce')\n",
+    "\n",
+    "    pct = pd.to_numeric(raw['monthly_pct_increase'], errors='coerce')\n",
+    "    # Magnitudes up to 1 are kept as ratios; larger ones (including decreases) are whole percents.\n",
+    "    raw['monthly_pct_increase_ratio'] = pct.where((pct.abs() <= 1) | pct.isna(), pct / 100.0)\n",
+    "\n",
+    "    raw['effective_date'] = raw['effective_date_raw'].apply(parse_effective_date)\n",
+    "    raw['source_file'] = Path(xlsx_path).name\n",
+    "\n",
+    "    ordered_cols = [\n",
+    "        'utility_provider',\n",
+    "        'state_name',\n",
+    "        'state_code',\n",
+    "        'state_id',\n",
+    "        'service_type',\n",
+    "        'customer_count',\n",
+    "        'total_revenue_increase_2025_2028',\n",
+    "        'time_period',\n",
+    "        'monthly_increase_amount',\n",
+    "        'monthly_pct_increase_ratio',\n",
+    "        'effective_date',\n",
+    "        'effective_date_raw',\n",
+    "        'status',\n",
+    "        'source_file',\n",
+    "    ]\n",
+    "    return raw[ordered_cols]\n",
+    "\n",
+    "\n",
+    "df = build_utility_rate_tracker_frame(FILE_PATH)\n",
     "print('Rows:', len(df), 'Cols:', len(df.columns))\n",
-    "display(df[['id', 'state', 'state_id', 'lon', 'lat']].head(5))\n",
+    "print('States:', df['state_code'].nunique())\n",
+    "print('Service types:', sorted(df['service_type'].dropna().unique().tolist()))\n",
+    "print('Rows missing state_code:', int(df['state_code'].isna().sum()))\n",
+    "display(df.head(10))\n",
     "\n",
     "load_dataframe_to_postgis(df, TARGET_TABLE, if_exists=IF_EXISTS)"
    ]
@@ -368,8 +507,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Optional: this table has lon/lat columns, so build geometry with those names.\n",
-    "add_point_geometry(TARGET_TABLE, lon_col='lon', lat_col='lat', geom_col='geom', srid=4326)"
+    "# QA: show any state names that did not map to USPS abbreviations.\n",
+    "unmatched = df[df['state_code'].isna()][['state_name', 'utility_provider', 'service_type', 'time_period', 'status']].copy()\n",
+    "print('Unmatched rows:', len(unmatched))\n",
+    "if len(unmatched):\n",
+    "    display(unmatched.drop_duplicates().sort_values(['state_name', 'utility_provider']).reset_index(drop=True))"
    ]
   },
   {
@@ -378,6 +520,17 @@
    "id": "9",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "# Not applicable to the utility rate tracker: it is state-level and has no lon/lat columns.\n",
+    "# For tables that do have them: add_point_geometry(TARGET_TABLE, lon_col='lon', lat_col='lat', geom_col='geom', srid=4326)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10",
+   "metadata": {},
+   "outputs": [],
    "source": [
     "# Quick sanity check: show row count and latest tables in public schema.\n",
     "with get_conn() as conn:\n",