Files
data-centers/scripts/crosstab_operator_industry.py
dadams 2c50c969bf Add operator x dominant workforce industry crosstab script and CSVs
Cross-tabs normalized data-center operator (owner) against the leading
ACS 2024 workforce industry of each enrichment geography (ZCTA and census
tract). Emits raw-count and row-percentage CSVs for both geographies.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-28 20:57:58 -07:00

185 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""Cross-tab data-center operator (owner) x dominant ZCTA/tract workforce industry.
For each enrichment geography (ZCTA and census tract), unnests the `operators`
array, normalizes operator names to merge known variants, and writes a
operator x primary_industry crosstab to output/.
`primary_industry` is the leading *workforce* industry of the geography
(derived from ACS 2024 employment), not the data center's line of business.
Counts are operator-appearances per geography unit (one tally per geo an
operator appears in).
"""
import os
import re
import csv
from pathlib import Path
from collections import defaultdict
import psycopg2
PROJECT_ROOT = Path(__file__).resolve().parent.parent
OUTPUT_DIR = PROJECT_ROOT / "output"
GEOGRAPHIES = {
"zcta": "data_center_zcta_2024",
"census_tract": "data_center_census_tracts_2024",
}
def connect():
return psycopg2.connect(
host=os.environ["PGWEB_HOST"],
port=os.environ["PGWEB_PORT"],
user=os.environ["PGWEB_USER"],
password=os.environ["PGWEB_PASSWORD"],
dbname="data_centers",
)
# Exact-match canonicalization of operator-name variants -> canonical name.
# Only merges that are unambiguously the same company.
ALIASES = {
# Amazon
"Amazon": "Amazon Web Services",
"Amazon AWS": "Amazon Web Services",
"Amazon Web Serivces": "Amazon Web Services",
"amazon web services": "Amazon Web Services",
"Amazon IAD75": "Amazon Web Services",
"Amazon IAD85": "Amazon Web Services",
"Amazon IAD95": "Amazon Web Services",
"Amazon IAD96": "Amazon Web Services",
"Amazon Web Services us-east-2 datacenter": "Amazon Web Services",
# Aligned
"Aligned Data Centers": "Aligned",
"Aligned Data Centers, LLC": "Aligned",
# Apple
"Apple Inc.": "Apple",
# CenturyLink (case)
"Centurylink": "CenturyLink",
# Cogent
"Cogent Communications, Inc.": "Cogent Communications",
# CoreSite
"CoreSite Data Center": "CoreSite",
"CoreSite Real Estate 1656 McCarthy, L.P.": "CoreSite",
# DataBank
"Databank": "DataBank",
# Digital Fortress
"Digital Fortress, Inc": "Digital Fortress",
# EdgeConneX
"EdgeConnex": "EdgeConneX",
# Google
"Google LLC": "Google",
# H5
"H5 Data Centers": "H5",
# Lumen (incl. former CenturyLink brand kept separate)
"Lumen Technologies": "Lumen",
# Meta
"Meta, Inc.": "Meta",
"Facebook": "Meta",
# Microsoft
"Microsoft Azure": "Microsoft",
# NTT
"NTT Global Data Centers": "NTT",
"NTT Limited": "NTT",
# Prime
"Prime": "Prime Data Centers",
# QTS
"QTS Data Centers": "Quality Technology Services",
# Sabey
"Sabey Data Centers": "Sabey",
# Serverfarm
"Serverfarm LLC.": "Serverfarm",
# Stack
"Stack Infrastructure, Incorporated": "Stack Infrastructure",
# Stream
"Stream Data Centers": "Stream",
# T5
"T5 Data Centers": "T5",
# Tata
"TATA Communications": "Tata Communications",
# Vantage
"Vantage Data Centers": "Vantage",
# Verizon
"Verizon Business": "Verizon",
"Verizon Wireless": "Verizon",
# 1547 / fifteenfortyseven
"fifteenfortyseven Critical Systems Realty": "1547 Critical Systems Realty",
}
def normalize(op: str) -> str:
op = re.sub(r"\s+", " ", op).strip()
return ALIASES.get(op, op)
def fetch(conn, table):
"""Return list of (normalized_operator, primary_industry) appearance rows."""
rows = []
with conn.cursor() as cur:
cur.execute(
f"SELECT primary_industry, unnest(operators) AS operator "
f"FROM {table} WHERE operators IS NOT NULL"
)
for primary_industry, operator in cur.fetchall():
if operator is None:
continue
rows.append((normalize(operator), primary_industry or "(unknown)"))
return rows
def build_crosstab(rows):
industries = sorted({pi for _, pi in rows})
counts = defaultdict(lambda: defaultdict(int))
totals = defaultdict(int)
for op, pi in rows:
counts[op][pi] += 1
totals[op] += 1
operators = sorted(counts, key=lambda o: (-totals[o], o))
return operators, industries, counts, totals
def write_csv(path, operators, industries, counts, totals):
with open(path, "w", newline="") as f:
w = csv.writer(f)
w.writerow(["operator", "total"] + industries)
for op in operators:
w.writerow([op, totals[op]] + [counts[op].get(pi, 0) for pi in industries])
# column totals row
col_totals = [sum(counts[op].get(pi, 0) for op in operators) for pi in industries]
w.writerow(["ALL", sum(totals.values())] + col_totals)
def write_csv_rowpct(path, operators, industries, counts, totals):
"""Same crosstab, each cell as a percent of the operator's row total."""
with open(path, "w", newline="") as f:
w = csv.writer(f)
w.writerow(["operator", "total"] + [f"{pi} (%)" for pi in industries])
for op in operators:
tot = totals[op]
pcts = [round(100 * counts[op].get(pi, 0) / tot, 1) for pi in industries]
w.writerow([op, tot] + pcts)
def main():
OUTPUT_DIR.mkdir(exist_ok=True)
conn = connect()
try:
for label, table in GEOGRAPHIES.items():
rows = fetch(conn, table)
operators, industries, counts, totals = build_crosstab(rows)
out = OUTPUT_DIR / f"operator_industry_crosstab_{label}.csv"
out_pct = OUTPUT_DIR / f"operator_industry_crosstab_{label}_rowpct.csv"
write_csv(out, operators, industries, counts, totals)
write_csv_rowpct(out_pct, operators, industries, counts, totals)
print(
f"{label}: {len(operators)} operators x {len(industries)} industries, "
f"{sum(totals.values())} appearances -> {out.name}, {out_pct.name}"
)
finally:
conn.close()
if __name__ == "__main__":
main()