Cross-tabs normalized data-center operator (owner) against the leading ACS 2024 workforce industry of each enrichment geography (ZCTA and census tract). Emits raw-count and row-percentage CSVs for both geographies. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
185 lines
5.9 KiB
Python
185 lines
5.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Cross-tab data-center operator (owner) x dominant ZCTA/tract workforce industry.
|
|
|
|
For each enrichment geography (ZCTA and census tract), unnests the `operators`
|
|
array, normalizes operator names to merge known variants, and writes a
|
|
operator x primary_industry crosstab to output/.
|
|
|
|
`primary_industry` is the leading *workforce* industry of the geography
|
|
(derived from ACS 2024 employment), not the data center's line of business.
|
|
Counts are operator-appearances per geography unit (one tally per geo an
|
|
operator appears in).
|
|
"""
|
|
import os
|
|
import re
|
|
import csv
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
|
|
import psycopg2
|
|
|
|
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
|
OUTPUT_DIR = PROJECT_ROOT / "output"
|
|
|
|
GEOGRAPHIES = {
|
|
"zcta": "data_center_zcta_2024",
|
|
"census_tract": "data_center_census_tracts_2024",
|
|
}
|
|
|
|
|
|
def connect():
|
|
return psycopg2.connect(
|
|
host=os.environ["PGWEB_HOST"],
|
|
port=os.environ["PGWEB_PORT"],
|
|
user=os.environ["PGWEB_USER"],
|
|
password=os.environ["PGWEB_PASSWORD"],
|
|
dbname="data_centers",
|
|
)
|
|
|
|
|
|
# Exact-match canonicalization of operator-name variants -> canonical name.
|
|
# Only merges that are unambiguously the same company.
|
|
ALIASES = {
|
|
# Amazon
|
|
"Amazon": "Amazon Web Services",
|
|
"Amazon AWS": "Amazon Web Services",
|
|
"Amazon Web Serivces": "Amazon Web Services",
|
|
"amazon web services": "Amazon Web Services",
|
|
"Amazon IAD75": "Amazon Web Services",
|
|
"Amazon IAD85": "Amazon Web Services",
|
|
"Amazon IAD95": "Amazon Web Services",
|
|
"Amazon IAD96": "Amazon Web Services",
|
|
"Amazon Web Services us-east-2 datacenter": "Amazon Web Services",
|
|
# Aligned
|
|
"Aligned Data Centers": "Aligned",
|
|
"Aligned Data Centers, LLC": "Aligned",
|
|
# Apple
|
|
"Apple Inc.": "Apple",
|
|
# CenturyLink (case)
|
|
"Centurylink": "CenturyLink",
|
|
# Cogent
|
|
"Cogent Communications, Inc.": "Cogent Communications",
|
|
# CoreSite
|
|
"CoreSite Data Center": "CoreSite",
|
|
"CoreSite Real Estate 1656 McCarthy, L.P.": "CoreSite",
|
|
# DataBank
|
|
"Databank": "DataBank",
|
|
# Digital Fortress
|
|
"Digital Fortress, Inc": "Digital Fortress",
|
|
# EdgeConneX
|
|
"EdgeConnex": "EdgeConneX",
|
|
# Google
|
|
"Google LLC": "Google",
|
|
# H5
|
|
"H5 Data Centers": "H5",
|
|
# Lumen (incl. former CenturyLink brand kept separate)
|
|
"Lumen Technologies": "Lumen",
|
|
# Meta
|
|
"Meta, Inc.": "Meta",
|
|
"Facebook": "Meta",
|
|
# Microsoft
|
|
"Microsoft Azure": "Microsoft",
|
|
# NTT
|
|
"NTT Global Data Centers": "NTT",
|
|
"NTT Limited": "NTT",
|
|
# Prime
|
|
"Prime": "Prime Data Centers",
|
|
# QTS
|
|
"QTS Data Centers": "Quality Technology Services",
|
|
# Sabey
|
|
"Sabey Data Centers": "Sabey",
|
|
# Serverfarm
|
|
"Serverfarm LLC.": "Serverfarm",
|
|
# Stack
|
|
"Stack Infrastructure, Incorporated": "Stack Infrastructure",
|
|
# Stream
|
|
"Stream Data Centers": "Stream",
|
|
# T5
|
|
"T5 Data Centers": "T5",
|
|
# Tata
|
|
"TATA Communications": "Tata Communications",
|
|
# Vantage
|
|
"Vantage Data Centers": "Vantage",
|
|
# Verizon
|
|
"Verizon Business": "Verizon",
|
|
"Verizon Wireless": "Verizon",
|
|
# 1547 / fifteenfortyseven
|
|
"fifteenfortyseven Critical Systems Realty": "1547 Critical Systems Realty",
|
|
}
|
|
|
|
|
|
def normalize(op: str) -> str:
|
|
op = re.sub(r"\s+", " ", op).strip()
|
|
return ALIASES.get(op, op)
|
|
|
|
|
|
def fetch(conn, table):
|
|
"""Return list of (normalized_operator, primary_industry) appearance rows."""
|
|
rows = []
|
|
with conn.cursor() as cur:
|
|
cur.execute(
|
|
f"SELECT primary_industry, unnest(operators) AS operator "
|
|
f"FROM {table} WHERE operators IS NOT NULL"
|
|
)
|
|
for primary_industry, operator in cur.fetchall():
|
|
if operator is None:
|
|
continue
|
|
rows.append((normalize(operator), primary_industry or "(unknown)"))
|
|
return rows
|
|
|
|
|
|
def build_crosstab(rows):
|
|
industries = sorted({pi for _, pi in rows})
|
|
counts = defaultdict(lambda: defaultdict(int))
|
|
totals = defaultdict(int)
|
|
for op, pi in rows:
|
|
counts[op][pi] += 1
|
|
totals[op] += 1
|
|
operators = sorted(counts, key=lambda o: (-totals[o], o))
|
|
return operators, industries, counts, totals
|
|
|
|
|
|
def write_csv(path, operators, industries, counts, totals):
|
|
with open(path, "w", newline="") as f:
|
|
w = csv.writer(f)
|
|
w.writerow(["operator", "total"] + industries)
|
|
for op in operators:
|
|
w.writerow([op, totals[op]] + [counts[op].get(pi, 0) for pi in industries])
|
|
# column totals row
|
|
col_totals = [sum(counts[op].get(pi, 0) for op in operators) for pi in industries]
|
|
w.writerow(["ALL", sum(totals.values())] + col_totals)
|
|
|
|
|
|
def write_csv_rowpct(path, operators, industries, counts, totals):
|
|
"""Same crosstab, each cell as a percent of the operator's row total."""
|
|
with open(path, "w", newline="") as f:
|
|
w = csv.writer(f)
|
|
w.writerow(["operator", "total"] + [f"{pi} (%)" for pi in industries])
|
|
for op in operators:
|
|
tot = totals[op]
|
|
pcts = [round(100 * counts[op].get(pi, 0) / tot, 1) for pi in industries]
|
|
w.writerow([op, tot] + pcts)
|
|
|
|
|
|
def main():
|
|
OUTPUT_DIR.mkdir(exist_ok=True)
|
|
conn = connect()
|
|
try:
|
|
for label, table in GEOGRAPHIES.items():
|
|
rows = fetch(conn, table)
|
|
operators, industries, counts, totals = build_crosstab(rows)
|
|
out = OUTPUT_DIR / f"operator_industry_crosstab_{label}.csv"
|
|
out_pct = OUTPUT_DIR / f"operator_industry_crosstab_{label}_rowpct.csv"
|
|
write_csv(out, operators, industries, counts, totals)
|
|
write_csv_rowpct(out_pct, operators, industries, counts, totals)
|
|
print(
|
|
f"{label}: {len(operators)} operators x {len(industries)} industries, "
|
|
f"{sum(totals.values())} appearances -> {out.name}, {out_pct.name}"
|
|
)
|
|
finally:
|
|
conn.close()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|