Add operator x dominant workforce industry crosstab script and CSVs

Cross-tabs normalized data-center operator (owner) against the leading ACS 2024 workforce industry of each enrichment geography (ZCTA and census tract). Emits raw-count and row-percentage CSVs for both geographies. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-28 20:57:58 -07:00
parent 6cb7c0334c
commit 2c50c969bf
5 changed files with 1112 additions and 0 deletions
--- a/scripts/crosstab_operator_industry.py
+++ b/scripts/crosstab_operator_industry.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+"""Cross-tab data-center operator (owner) x dominant ZCTA/tract workforce industry.
+
+For each enrichment geography (ZCTA and census tract), unnests the `operators`
+array, normalizes operator names to merge known variants, and writes a
+operator x primary_industry crosstab to output/.
+
+`primary_industry` is the leading *workforce* industry of the geography
+(derived from ACS 2024 employment), not the data center's line of business.
+Counts are operator-appearances per geography unit (one tally per geo an
+operator appears in).
+"""
+import os
+import re
+import csv
+from pathlib import Path
+from collections import defaultdict
+
+import psycopg2
+
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+OUTPUT_DIR = PROJECT_ROOT / "output"
+
+GEOGRAPHIES = {
+    "zcta": "data_center_zcta_2024",
+    "census_tract": "data_center_census_tracts_2024",
+}
+
+
+def connect():
+    return psycopg2.connect(
+        host=os.environ["PGWEB_HOST"],
+        port=os.environ["PGWEB_PORT"],
+        user=os.environ["PGWEB_USER"],
+        password=os.environ["PGWEB_PASSWORD"],
+        dbname="data_centers",
+    )
+
+
+# Exact-match canonicalization of operator-name variants -> canonical name.
+# Only merges that are unambiguously the same company.
+ALIASES = {
+    # Amazon
+    "Amazon": "Amazon Web Services",
+    "Amazon AWS": "Amazon Web Services",
+    "Amazon Web Serivces": "Amazon Web Services",
+    "amazon web services": "Amazon Web Services",
+    "Amazon IAD75": "Amazon Web Services",
+    "Amazon IAD85": "Amazon Web Services",
+    "Amazon IAD95": "Amazon Web Services",
+    "Amazon IAD96": "Amazon Web Services",
+    "Amazon Web Services us-east-2 datacenter": "Amazon Web Services",
+    # Aligned
+    "Aligned Data Centers": "Aligned",
+    "Aligned Data Centers, LLC": "Aligned",
+    # Apple
+    "Apple Inc.": "Apple",
+    # CenturyLink (case)
+    "Centurylink": "CenturyLink",
+    # Cogent
+    "Cogent Communications, Inc.": "Cogent Communications",
+    # CoreSite
+    "CoreSite Data Center": "CoreSite",
+    "CoreSite Real Estate 1656 McCarthy, L.P.": "CoreSite",
+    # DataBank
+    "Databank": "DataBank",
+    # Digital Fortress
+    "Digital Fortress, Inc": "Digital Fortress",
+    # EdgeConneX
+    "EdgeConnex": "EdgeConneX",
+    # Google
+    "Google LLC": "Google",
+    # H5
+    "H5 Data Centers": "H5",
+    # Lumen (incl. former CenturyLink brand kept separate)
+    "Lumen Technologies": "Lumen",
+    # Meta
+    "Meta, Inc.": "Meta",
+    "Facebook": "Meta",
+    # Microsoft
+    "Microsoft Azure": "Microsoft",
+    # NTT
+    "NTT Global Data Centers": "NTT",
+    "NTT Limited": "NTT",
+    # Prime
+    "Prime": "Prime Data Centers",
+    # QTS
+    "QTS Data Centers": "Quality Technology Services",
+    # Sabey
+    "Sabey Data Centers": "Sabey",
+    # Serverfarm
+    "Serverfarm LLC.": "Serverfarm",
+    # Stack
+    "Stack Infrastructure, Incorporated": "Stack Infrastructure",
+    # Stream
+    "Stream Data Centers": "Stream",
+    # T5
+    "T5 Data Centers": "T5",
+    # Tata
+    "TATA Communications": "Tata Communications",
+    # Vantage
+    "Vantage Data Centers": "Vantage",
+    # Verizon
+    "Verizon Business": "Verizon",
+    "Verizon Wireless": "Verizon",
+    # 1547 / fifteenfortyseven
+    "fifteenfortyseven Critical Systems Realty": "1547 Critical Systems Realty",
+}
+
+
+def normalize(op: str) -> str:
+    op = re.sub(r"\s+", " ", op).strip()
+    return ALIASES.get(op, op)
+
+
+def fetch(conn, table):
+    """Return list of (normalized_operator, primary_industry) appearance rows."""
+    rows = []
+    with conn.cursor() as cur:
+        cur.execute(
+            f"SELECT primary_industry, unnest(operators) AS operator "
+            f"FROM {table} WHERE operators IS NOT NULL"
+        )
+        for primary_industry, operator in cur.fetchall():
+            if operator is None:
+                continue
+            rows.append((normalize(operator), primary_industry or "(unknown)"))
+    return rows
+
+
+def build_crosstab(rows):
+    industries = sorted({pi for _, pi in rows})
+    counts = defaultdict(lambda: defaultdict(int))
+    totals = defaultdict(int)
+    for op, pi in rows:
+        counts[op][pi] += 1
+        totals[op] += 1
+    operators = sorted(counts, key=lambda o: (-totals[o], o))
+    return operators, industries, counts, totals
+
+
+def write_csv(path, operators, industries, counts, totals):
+    with open(path, "w", newline="") as f:
+        w = csv.writer(f)
+        w.writerow(["operator", "total"] + industries)
+        for op in operators:
+            w.writerow([op, totals[op]] + [counts[op].get(pi, 0) for pi in industries])
+        # column totals row
+        col_totals = [sum(counts[op].get(pi, 0) for op in operators) for pi in industries]
+        w.writerow(["ALL", sum(totals.values())] + col_totals)
+
+
+def write_csv_rowpct(path, operators, industries, counts, totals):
+    """Same crosstab, each cell as a percent of the operator's row total."""
+    with open(path, "w", newline="") as f:
+        w = csv.writer(f)
+        w.writerow(["operator", "total"] + [f"{pi} (%)" for pi in industries])
+        for op in operators:
+            tot = totals[op]
+            pcts = [round(100 * counts[op].get(pi, 0) / tot, 1) for pi in industries]
+            w.writerow([op, tot] + pcts)
+
+
+def main():
+    OUTPUT_DIR.mkdir(exist_ok=True)
+    conn = connect()
+    try:
+        for label, table in GEOGRAPHIES.items():
+            rows = fetch(conn, table)
+            operators, industries, counts, totals = build_crosstab(rows)
+            out = OUTPUT_DIR / f"operator_industry_crosstab_{label}.csv"
+            out_pct = OUTPUT_DIR / f"operator_industry_crosstab_{label}_rowpct.csv"
+            write_csv(out, operators, industries, counts, totals)
+            write_csv_rowpct(out_pct, operators, industries, counts, totals)
+            print(
+                f"{label}: {len(operators)} operators x {len(industries)} industries, "
+                f"{sum(totals.values())} appearances -> {out.name}, {out_pct.name}"
+            )
+    finally:
+        conn.close()
+
+
+if __name__ == "__main__":
+    main()