Add operator x dominant workforce industry crosstab script and CSVs
Cross-tabs normalized data-center operator (owner) against the leading ACS 2024 workforce industry of each enrichment geography (ZCTA and census tract). Emits raw-count and row-percentage CSVs for both geographies. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
184
scripts/crosstab_operator_industry.py
Normal file
184
scripts/crosstab_operator_industry.py
Normal file
@@ -0,0 +1,184 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Cross-tab data-center operator (owner) x dominant ZCTA/tract workforce industry.
|
||||
|
||||
For each enrichment geography (ZCTA and census tract), unnests the `operators`
|
||||
array, normalizes operator names to merge known variants, and writes a
|
||||
operator x primary_industry crosstab to output/.
|
||||
|
||||
`primary_industry` is the leading *workforce* industry of the geography
|
||||
(derived from ACS 2024 employment), not the data center's line of business.
|
||||
Counts are operator-appearances per geography unit (one tally per geo an
|
||||
operator appears in).
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import csv
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
import psycopg2
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||||
OUTPUT_DIR = PROJECT_ROOT / "output"
|
||||
|
||||
GEOGRAPHIES = {
|
||||
"zcta": "data_center_zcta_2024",
|
||||
"census_tract": "data_center_census_tracts_2024",
|
||||
}
|
||||
|
||||
|
||||
def connect():
|
||||
return psycopg2.connect(
|
||||
host=os.environ["PGWEB_HOST"],
|
||||
port=os.environ["PGWEB_PORT"],
|
||||
user=os.environ["PGWEB_USER"],
|
||||
password=os.environ["PGWEB_PASSWORD"],
|
||||
dbname="data_centers",
|
||||
)
|
||||
|
||||
|
||||
# Exact-match canonicalization of operator-name variants -> canonical name.
|
||||
# Only merges that are unambiguously the same company.
|
||||
ALIASES = {
|
||||
# Amazon
|
||||
"Amazon": "Amazon Web Services",
|
||||
"Amazon AWS": "Amazon Web Services",
|
||||
"Amazon Web Serivces": "Amazon Web Services",
|
||||
"amazon web services": "Amazon Web Services",
|
||||
"Amazon IAD75": "Amazon Web Services",
|
||||
"Amazon IAD85": "Amazon Web Services",
|
||||
"Amazon IAD95": "Amazon Web Services",
|
||||
"Amazon IAD96": "Amazon Web Services",
|
||||
"Amazon Web Services us-east-2 datacenter": "Amazon Web Services",
|
||||
# Aligned
|
||||
"Aligned Data Centers": "Aligned",
|
||||
"Aligned Data Centers, LLC": "Aligned",
|
||||
# Apple
|
||||
"Apple Inc.": "Apple",
|
||||
# CenturyLink (case)
|
||||
"Centurylink": "CenturyLink",
|
||||
# Cogent
|
||||
"Cogent Communications, Inc.": "Cogent Communications",
|
||||
# CoreSite
|
||||
"CoreSite Data Center": "CoreSite",
|
||||
"CoreSite Real Estate 1656 McCarthy, L.P.": "CoreSite",
|
||||
# DataBank
|
||||
"Databank": "DataBank",
|
||||
# Digital Fortress
|
||||
"Digital Fortress, Inc": "Digital Fortress",
|
||||
# EdgeConneX
|
||||
"EdgeConnex": "EdgeConneX",
|
||||
# Google
|
||||
"Google LLC": "Google",
|
||||
# H5
|
||||
"H5 Data Centers": "H5",
|
||||
# Lumen (incl. former CenturyLink brand kept separate)
|
||||
"Lumen Technologies": "Lumen",
|
||||
# Meta
|
||||
"Meta, Inc.": "Meta",
|
||||
"Facebook": "Meta",
|
||||
# Microsoft
|
||||
"Microsoft Azure": "Microsoft",
|
||||
# NTT
|
||||
"NTT Global Data Centers": "NTT",
|
||||
"NTT Limited": "NTT",
|
||||
# Prime
|
||||
"Prime": "Prime Data Centers",
|
||||
# QTS
|
||||
"QTS Data Centers": "Quality Technology Services",
|
||||
# Sabey
|
||||
"Sabey Data Centers": "Sabey",
|
||||
# Serverfarm
|
||||
"Serverfarm LLC.": "Serverfarm",
|
||||
# Stack
|
||||
"Stack Infrastructure, Incorporated": "Stack Infrastructure",
|
||||
# Stream
|
||||
"Stream Data Centers": "Stream",
|
||||
# T5
|
||||
"T5 Data Centers": "T5",
|
||||
# Tata
|
||||
"TATA Communications": "Tata Communications",
|
||||
# Vantage
|
||||
"Vantage Data Centers": "Vantage",
|
||||
# Verizon
|
||||
"Verizon Business": "Verizon",
|
||||
"Verizon Wireless": "Verizon",
|
||||
# 1547 / fifteenfortyseven
|
||||
"fifteenfortyseven Critical Systems Realty": "1547 Critical Systems Realty",
|
||||
}
|
||||
|
||||
|
||||
def normalize(op: str) -> str:
|
||||
op = re.sub(r"\s+", " ", op).strip()
|
||||
return ALIASES.get(op, op)
|
||||
|
||||
|
||||
def fetch(conn, table):
|
||||
"""Return list of (normalized_operator, primary_industry) appearance rows."""
|
||||
rows = []
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
f"SELECT primary_industry, unnest(operators) AS operator "
|
||||
f"FROM {table} WHERE operators IS NOT NULL"
|
||||
)
|
||||
for primary_industry, operator in cur.fetchall():
|
||||
if operator is None:
|
||||
continue
|
||||
rows.append((normalize(operator), primary_industry or "(unknown)"))
|
||||
return rows
|
||||
|
||||
|
||||
def build_crosstab(rows):
|
||||
industries = sorted({pi for _, pi in rows})
|
||||
counts = defaultdict(lambda: defaultdict(int))
|
||||
totals = defaultdict(int)
|
||||
for op, pi in rows:
|
||||
counts[op][pi] += 1
|
||||
totals[op] += 1
|
||||
operators = sorted(counts, key=lambda o: (-totals[o], o))
|
||||
return operators, industries, counts, totals
|
||||
|
||||
|
||||
def write_csv(path, operators, industries, counts, totals):
|
||||
with open(path, "w", newline="") as f:
|
||||
w = csv.writer(f)
|
||||
w.writerow(["operator", "total"] + industries)
|
||||
for op in operators:
|
||||
w.writerow([op, totals[op]] + [counts[op].get(pi, 0) for pi in industries])
|
||||
# column totals row
|
||||
col_totals = [sum(counts[op].get(pi, 0) for op in operators) for pi in industries]
|
||||
w.writerow(["ALL", sum(totals.values())] + col_totals)
|
||||
|
||||
|
||||
def write_csv_rowpct(path, operators, industries, counts, totals):
|
||||
"""Same crosstab, each cell as a percent of the operator's row total."""
|
||||
with open(path, "w", newline="") as f:
|
||||
w = csv.writer(f)
|
||||
w.writerow(["operator", "total"] + [f"{pi} (%)" for pi in industries])
|
||||
for op in operators:
|
||||
tot = totals[op]
|
||||
pcts = [round(100 * counts[op].get(pi, 0) / tot, 1) for pi in industries]
|
||||
w.writerow([op, tot] + pcts)
|
||||
|
||||
|
||||
def main():
|
||||
OUTPUT_DIR.mkdir(exist_ok=True)
|
||||
conn = connect()
|
||||
try:
|
||||
for label, table in GEOGRAPHIES.items():
|
||||
rows = fetch(conn, table)
|
||||
operators, industries, counts, totals = build_crosstab(rows)
|
||||
out = OUTPUT_DIR / f"operator_industry_crosstab_{label}.csv"
|
||||
out_pct = OUTPUT_DIR / f"operator_industry_crosstab_{label}_rowpct.csv"
|
||||
write_csv(out, operators, industries, counts, totals)
|
||||
write_csv_rowpct(out_pct, operators, industries, counts, totals)
|
||||
print(
|
||||
f"{label}: {len(operators)} operators x {len(industries)} industries, "
|
||||
f"{sum(totals.values())} appearances -> {out.name}, {out_pct.name}"
|
||||
)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user