data-centers/scripts/crosstab_operator_industry.py

#!/usr/bin/env python3
"""Cross-tab data-center operator (owner) x dominant ZCTA/tract workforce industry.

For each enrichment geography (ZCTA and census tract), unnests the `operators`
array, normalizes operator names to merge known variants, and writes a
operator x primary_industry crosstab to output/.

`primary_industry` is the leading *workforce* industry of the geography
(derived from ACS 2024 employment), not the data center's line of business.
Counts are operator-appearances per geography unit (one tally per geo an
operator appears in).
"""
import os
import re
import csv
from pathlib import Path
from collections import defaultdict

import psycopg2

PROJECT_ROOT = Path(__file__).resolve().parent.parent
OUTPUT_DIR = PROJECT_ROOT / "output"

GEOGRAPHIES = {
    "zcta": "data_center_zcta_2024",
    "census_tract": "data_center_census_tracts_2024",
}


def connect():
    return psycopg2.connect(
        host=os.environ["PGWEB_HOST"],
        port=os.environ["PGWEB_PORT"],
        user=os.environ["PGWEB_USER"],
        password=os.environ["PGWEB_PASSWORD"],
        dbname="data_centers",
    )


# Exact-match canonicalization of operator-name variants -> canonical name.
# Only merges that are unambiguously the same company.
ALIASES = {
    # Amazon
    "Amazon": "Amazon Web Services",
    "Amazon AWS": "Amazon Web Services",
    "Amazon Web Serivces": "Amazon Web Services",
    "amazon web services": "Amazon Web Services",
    "Amazon IAD75": "Amazon Web Services",
    "Amazon IAD85": "Amazon Web Services",
    "Amazon IAD95": "Amazon Web Services",
    "Amazon IAD96": "Amazon Web Services",
    "Amazon Web Services us-east-2 datacenter": "Amazon Web Services",
    # Aligned
    "Aligned Data Centers": "Aligned",
    "Aligned Data Centers, LLC": "Aligned",
    # Apple
    "Apple Inc.": "Apple",
    # CenturyLink (case)
    "Centurylink": "CenturyLink",
    # Cogent
    "Cogent Communications, Inc.": "Cogent Communications",
    # CoreSite
    "CoreSite Data Center": "CoreSite",
    "CoreSite Real Estate 1656 McCarthy, L.P.": "CoreSite",
    # DataBank
    "Databank": "DataBank",
    # Digital Fortress
    "Digital Fortress, Inc": "Digital Fortress",
    # EdgeConneX
    "EdgeConnex": "EdgeConneX",
    # Google
    "Google LLC": "Google",
    # H5
    "H5 Data Centers": "H5",
    # Lumen (incl. former CenturyLink brand kept separate)
    "Lumen Technologies": "Lumen",
    # Meta
    "Meta, Inc.": "Meta",
    "Facebook": "Meta",
    # Microsoft
    "Microsoft Azure": "Microsoft",
    # NTT
    "NTT Global Data Centers": "NTT",
    "NTT Limited": "NTT",
    # Prime
    "Prime": "Prime Data Centers",
    # QTS
    "QTS Data Centers": "Quality Technology Services",
    # Sabey
    "Sabey Data Centers": "Sabey",
    # Serverfarm
    "Serverfarm LLC.": "Serverfarm",
    # Stack
    "Stack Infrastructure, Incorporated": "Stack Infrastructure",
    # Stream
    "Stream Data Centers": "Stream",
    # T5
    "T5 Data Centers": "T5",
    # Tata
    "TATA Communications": "Tata Communications",
    # Vantage
    "Vantage Data Centers": "Vantage",
    # Verizon
    "Verizon Business": "Verizon",
    "Verizon Wireless": "Verizon",
    # 1547 / fifteenfortyseven
    "fifteenfortyseven Critical Systems Realty": "1547 Critical Systems Realty",
}


def normalize(op: str) -> str:
    op = re.sub(r"\s+", " ", op).strip()
    return ALIASES.get(op, op)


def fetch(conn, table):
    """Return list of (normalized_operator, primary_industry) appearance rows."""
    rows = []
    with conn.cursor() as cur:
        cur.execute(
            f"SELECT primary_industry, unnest(operators) AS operator "
            f"FROM {table} WHERE operators IS NOT NULL"
        )
        for primary_industry, operator in cur.fetchall():
            if operator is None:
                continue
            rows.append((normalize(operator), primary_industry or "(unknown)"))
    return rows


def build_crosstab(rows):
    industries = sorted({pi for _, pi in rows})
    counts = defaultdict(lambda: defaultdict(int))
    totals = defaultdict(int)
    for op, pi in rows:
        counts[op][pi] += 1
        totals[op] += 1
    operators = sorted(counts, key=lambda o: (-totals[o], o))
    return operators, industries, counts, totals


def write_csv(path, operators, industries, counts, totals):
    with open(path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["operator", "total"] + industries)
        for op in operators:
            w.writerow([op, totals[op]] + [counts[op].get(pi, 0) for pi in industries])
        # column totals row
        col_totals = [sum(counts[op].get(pi, 0) for op in operators) for pi in industries]
        w.writerow(["ALL", sum(totals.values())] + col_totals)


def write_csv_rowpct(path, operators, industries, counts, totals):
    """Same crosstab, each cell as a percent of the operator's row total."""
    with open(path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["operator", "total"] + [f"{pi} (%)" for pi in industries])
        for op in operators:
            tot = totals[op]
            pcts = [round(100 * counts[op].get(pi, 0) / tot, 1) for pi in industries]
            w.writerow([op, tot] + pcts)


def main():
    OUTPUT_DIR.mkdir(exist_ok=True)
    conn = connect()
    try:
        for label, table in GEOGRAPHIES.items():
            rows = fetch(conn, table)
            operators, industries, counts, totals = build_crosstab(rows)
            out = OUTPUT_DIR / f"operator_industry_crosstab_{label}.csv"
            out_pct = OUTPUT_DIR / f"operator_industry_crosstab_{label}_rowpct.csv"
            write_csv(out, operators, industries, counts, totals)
            write_csv_rowpct(out_pct, operators, industries, counts, totals)
            print(
                f"{label}: {len(operators)} operators x {len(industries)} industries, "
                f"{sum(totals.values())} appearances -> {out.name}, {out_pct.name}"
            )
    finally:
        conn.close()


if __name__ == "__main__":
    main()