#!/usr/bin/env python3 """Cross-tab data-center operator (owner) x dominant ZCTA/tract workforce industry. For each enrichment geography (ZCTA and census tract), unnests the `operators` array, normalizes operator names to merge known variants, and writes a operator x primary_industry crosstab to output/. `primary_industry` is the leading *workforce* industry of the geography (derived from ACS 2024 employment), not the data center's line of business. Counts are operator-appearances per geography unit (one tally per geo an operator appears in). """ import os import re import csv from pathlib import Path from collections import defaultdict import psycopg2 PROJECT_ROOT = Path(__file__).resolve().parent.parent OUTPUT_DIR = PROJECT_ROOT / "output" GEOGRAPHIES = { "zcta": "data_center_zcta_2024", "census_tract": "data_center_census_tracts_2024", } def connect(): return psycopg2.connect( host=os.environ["PGWEB_HOST"], port=os.environ["PGWEB_PORT"], user=os.environ["PGWEB_USER"], password=os.environ["PGWEB_PASSWORD"], dbname="data_centers", ) # Exact-match canonicalization of operator-name variants -> canonical name. # Only merges that are unambiguously the same company. ALIASES = { # Amazon "Amazon": "Amazon Web Services", "Amazon AWS": "Amazon Web Services", "Amazon Web Serivces": "Amazon Web Services", "amazon web services": "Amazon Web Services", "Amazon IAD75": "Amazon Web Services", "Amazon IAD85": "Amazon Web Services", "Amazon IAD95": "Amazon Web Services", "Amazon IAD96": "Amazon Web Services", "Amazon Web Services us-east-2 datacenter": "Amazon Web Services", # Aligned "Aligned Data Centers": "Aligned", "Aligned Data Centers, LLC": "Aligned", # Apple "Apple Inc.": "Apple", # CenturyLink (case) "Centurylink": "CenturyLink", # Cogent "Cogent Communications, Inc.": "Cogent Communications", # CoreSite "CoreSite Data Center": "CoreSite", "CoreSite Real Estate 1656 McCarthy, L.P.": "CoreSite", # DataBank "Databank": "DataBank", # Digital Fortress "Digital Fortress, Inc": "Digital Fortress", # EdgeConneX "EdgeConnex": "EdgeConneX", # Google "Google LLC": "Google", # H5 "H5 Data Centers": "H5", # Lumen (incl. former CenturyLink brand kept separate) "Lumen Technologies": "Lumen", # Meta "Meta, Inc.": "Meta", "Facebook": "Meta", # Microsoft "Microsoft Azure": "Microsoft", # NTT "NTT Global Data Centers": "NTT", "NTT Limited": "NTT", # Prime "Prime": "Prime Data Centers", # QTS "QTS Data Centers": "Quality Technology Services", # Sabey "Sabey Data Centers": "Sabey", # Serverfarm "Serverfarm LLC.": "Serverfarm", # Stack "Stack Infrastructure, Incorporated": "Stack Infrastructure", # Stream "Stream Data Centers": "Stream", # T5 "T5 Data Centers": "T5", # Tata "TATA Communications": "Tata Communications", # Vantage "Vantage Data Centers": "Vantage", # Verizon "Verizon Business": "Verizon", "Verizon Wireless": "Verizon", # 1547 / fifteenfortyseven "fifteenfortyseven Critical Systems Realty": "1547 Critical Systems Realty", } def normalize(op: str) -> str: op = re.sub(r"\s+", " ", op).strip() return ALIASES.get(op, op) def fetch(conn, table): """Return list of (normalized_operator, primary_industry) appearance rows.""" rows = [] with conn.cursor() as cur: cur.execute( f"SELECT primary_industry, unnest(operators) AS operator " f"FROM {table} WHERE operators IS NOT NULL" ) for primary_industry, operator in cur.fetchall(): if operator is None: continue rows.append((normalize(operator), primary_industry or "(unknown)")) return rows def build_crosstab(rows): industries = sorted({pi for _, pi in rows}) counts = defaultdict(lambda: defaultdict(int)) totals = defaultdict(int) for op, pi in rows: counts[op][pi] += 1 totals[op] += 1 operators = sorted(counts, key=lambda o: (-totals[o], o)) return operators, industries, counts, totals def write_csv(path, operators, industries, counts, totals): with open(path, "w", newline="") as f: w = csv.writer(f) w.writerow(["operator", "total"] + industries) for op in operators: w.writerow([op, totals[op]] + [counts[op].get(pi, 0) for pi in industries]) # column totals row col_totals = [sum(counts[op].get(pi, 0) for op in operators) for pi in industries] w.writerow(["ALL", sum(totals.values())] + col_totals) def write_csv_rowpct(path, operators, industries, counts, totals): """Same crosstab, each cell as a percent of the operator's row total.""" with open(path, "w", newline="") as f: w = csv.writer(f) w.writerow(["operator", "total"] + [f"{pi} (%)" for pi in industries]) for op in operators: tot = totals[op] pcts = [round(100 * counts[op].get(pi, 0) / tot, 1) for pi in industries] w.writerow([op, tot] + pcts) def main(): OUTPUT_DIR.mkdir(exist_ok=True) conn = connect() try: for label, table in GEOGRAPHIES.items(): rows = fetch(conn, table) operators, industries, counts, totals = build_crosstab(rows) out = OUTPUT_DIR / f"operator_industry_crosstab_{label}.csv" out_pct = OUTPUT_DIR / f"operator_industry_crosstab_{label}_rowpct.csv" write_csv(out, operators, industries, counts, totals) write_csv_rowpct(out_pct, operators, industries, counts, totals) print( f"{label}: {len(operators)} operators x {len(industries)} industries, " f"{sum(totals.values())} appearances -> {out.name}, {out_pct.name}" ) finally: conn.close() if __name__ == "__main__": main()