enrich_roster.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
#!/usr/bin/env python3
"""Enrich a Congress roster (119th by default) via the Congress.gov API.

Writes:
    data/<C>/members_directory.json — dict keyed by bioguide ID
    data/<C>/lis_to_bioguide.json   — Senate LIS -> bioguide crosswalk
    data/<C>/api_cache/<sha>.json   — raw cached API responses (idempotent)

Reads CONGRESS_GOV_API_KEY from ./.env (never from the CLI or an environment variable).
Standard library only. Throttled 350 ms between live requests.
"""
import argparse
import hashlib
import http.client
import json
import os
import re
import sys
import time
import unicodedata
import urllib.error
import urllib.request
from pathlib import Path
  12. ROOT = Path(__file__).resolve().parent
  13. DATA_ROOT = ROOT / "data"
  14. ENV_PATH = ROOT / ".env"
  15. API_BASE = "https://api.congress.gov/v3"
  16. THROTTLE_SEC = 0.35
  17. RETRY_BACKOFFS = (0.5, 1.0, 2.0)
  18. PARTY_MAP = {
  19. "R": "R", "Republican": "R",
  20. "D": "D", "Democratic": "D", "Democrat": "D",
  21. "I": "I", "Independent": "I", "ID": "I",
  22. }
  23. STATE_NAME_TO_CODE = {
  24. "Alabama":"AL","Alaska":"AK","Arizona":"AZ","Arkansas":"AR","California":"CA",
  25. "Colorado":"CO","Connecticut":"CT","Delaware":"DE","Florida":"FL","Georgia":"GA",
  26. "Hawaii":"HI","Idaho":"ID","Illinois":"IL","Indiana":"IN","Iowa":"IA","Kansas":"KS",
  27. "Kentucky":"KY","Louisiana":"LA","Maine":"ME","Maryland":"MD","Massachusetts":"MA",
  28. "Michigan":"MI","Minnesota":"MN","Mississippi":"MS","Missouri":"MO","Montana":"MT",
  29. "Nebraska":"NE","Nevada":"NV","New Hampshire":"NH","New Jersey":"NJ","New Mexico":"NM",
  30. "New York":"NY","North Carolina":"NC","North Dakota":"ND","Ohio":"OH","Oklahoma":"OK",
  31. "Oregon":"OR","Pennsylvania":"PA","Rhode Island":"RI","South Carolina":"SC",
  32. "South Dakota":"SD","Tennessee":"TN","Texas":"TX","Utah":"UT","Vermont":"VT",
  33. "Virginia":"VA","Washington":"WA","West Virginia":"WV","Wisconsin":"WI","Wyoming":"WY",
  34. "American Samoa":"AS","District of Columbia":"DC","Guam":"GU",
  35. "Northern Mariana Islands":"MP","Puerto Rico":"PR","Virgin Islands":"VI",
  36. }
  37. LIS_KEY_RE = re.compile(r"(?i)\b(lis|senateid|lis[_-]?id|lis[_-]?member[_-]?id)\b")
  38. def _load_api_key():
  39. if not ENV_PATH.exists():
  40. print(f"ERROR: .env not found at {ENV_PATH}", file=sys.stderr)
  41. sys.exit(2)
  42. for line in ENV_PATH.read_text().splitlines():
  43. line = line.strip()
  44. if not line or line.startswith("#") or "=" not in line:
  45. continue
  46. k, _, v = line.partition("=")
  47. if k.strip() == "CONGRESS_GOV_API_KEY":
  48. return v.strip().strip('"').strip("'")
  49. print("ERROR: CONGRESS_GOV_API_KEY not set in .env", file=sys.stderr)
  50. sys.exit(2)
  51. def _cache_path(cache_dir, url):
  52. # Cache key strips api_key so re-keying doesn't invalidate cache.
  53. clean = re.sub(r"([?&])api_key=[^&]*", r"\1", url).rstrip("?&")
  54. sha = hashlib.sha256(clean.encode("utf-8")).hexdigest()
  55. return cache_dir / f"{sha}.json"
  56. def _fetch(url, cache_dir, warnings, label="request"):
  57. cp = _cache_path(cache_dir, url)
  58. if cp.exists():
  59. try:
  60. return json.loads(cp.read_text())
  61. except Exception:
  62. pass # fall through and re-fetch
  63. last_err = None
  64. for i, backoff in enumerate((0,) + RETRY_BACKOFFS):
  65. if backoff:
  66. time.sleep(backoff)
  67. try:
  68. req = urllib.request.Request(url, headers={"Accept": "application/json", "User-Agent": "polisci-pipeline/1.0 (+enrich_roster.py)"})
  69. with urllib.request.urlopen(req, timeout=30) as resp:
  70. body = resp.read()
  71. data = json.loads(body)
  72. cp.parent.mkdir(parents=True, exist_ok=True)
  73. cp.write_text(json.dumps(data))
  74. time.sleep(THROTTLE_SEC)
  75. return data
  76. except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError, json.JSONDecodeError) as e:
  77. last_err = e
  78. warnings.append(f"{label}: {last_err}")
  79. return None
  80. def _party_letter(member):
  81. pa = member.get("partyHistory") or []
  82. if pa:
  83. last = pa[-1]
  84. for key in ("partyAbbreviation", "partyName"):
  85. v = last.get(key)
  86. if v and v in PARTY_MAP:
  87. return PARTY_MAP[v]
  88. pn = member.get("partyName") or ""
  89. return PARTY_MAP.get(pn, "I" if pn else "")
  90. def _state_code(member):
  91. s = member.get("state") or ""
  92. if len(s) == 2 and s.isupper():
  93. return s
  94. return STATE_NAME_TO_CODE.get(s, s[:2].upper() if s else "")
  95. def _latest_chamber(member):
  96. terms = _terms_list(member)
  97. if not terms:
  98. return ""
  99. def sk(t): return t.get("startYear") or 0
  100. last = sorted(terms, key=sk)[-1]
  101. return (last.get("chamber") or "").strip()
  102. def _terms_list(member):
  103. terms = member.get("terms")
  104. if isinstance(terms, dict):
  105. return terms.get("item") or []
  106. if isinstance(terms, list):
  107. return terms
  108. return []
  109. def _served_dates(member):
  110. terms = _terms_list(member)
  111. if not terms:
  112. return None, None
  113. starts, ends = [], []
  114. for t in terms:
  115. sy = t.get("startYear")
  116. ey = t.get("endYear")
  117. if sy:
  118. starts.append(f"{int(sy):04d}-01-03")
  119. if ey:
  120. ends.append(f"{int(ey):04d}-01-03")
  121. served_from = min(starts) if starts else None
  122. served_to = max(ends) if ends and len(ends) == len(starts) else None
  123. return served_from, served_to
  124. def _congress_term(member, congress):
  125. """Find the term for the target Congress; returns dict or None."""
  126. for t in _terms_list(member):
  127. if t.get("congress") == congress:
  128. return {
  129. "startYear": t.get("startYear"),
  130. "endYear": t.get("endYear"),
  131. "district": str(t["district"]) if t.get("district") is not None else None,
  132. "chamber": t.get("chamber"),
  133. }
  134. return None
  135. def _scan_for_lis(obj):
  136. """Recursively scan obj for any key matching LIS pattern; return string value or None."""
  137. if isinstance(obj, dict):
  138. for k, v in obj.items():
  139. if isinstance(k, str) and LIS_KEY_RE.search(k):
  140. if isinstance(v, str) and re.match(r"^S?\d{3,4}$", v.strip()):
  141. val = v.strip()
  142. if not val.startswith("S"):
  143. val = "S" + val.zfill(3)
  144. return val
  145. found = _scan_for_lis(v)
  146. if found:
  147. return found
  148. elif isinstance(obj, list):
  149. for it in obj:
  150. found = _scan_for_lis(it)
  151. if found:
  152. return found
  153. return None
  154. def _normalize_member(m, congress=None):
  155. bioguide = (m.get("bioguideId") or "").strip()
  156. if not bioguide:
  157. return None
  158. chamber = _latest_chamber(m)
  159. served_from, served_to = _served_dates(m)
  160. district = m.get("district")
  161. if district is not None:
  162. district = str(district)
  163. name = m.get("directOrderName") or m.get("name") or ""
  164. if not name:
  165. first = m.get("firstName") or ""
  166. last = m.get("lastName") or ""
  167. name = (first + " " + last).strip()
  168. if "," in name and not m.get("directOrderName"):
  169. parts = [p.strip() for p in name.split(",", 1)]
  170. if len(parts) == 2:
  171. name = parts[1] + " " + parts[0]
  172. photo = ((m.get("depiction") or {}).get("imageUrl")) or None
  173. # Per-Congress term — most accurate source of district, start/end year for
  174. # this Congress (matters for mid-term resignations and special-election entrants).
  175. term = _congress_term(m, congress) if congress is not None else None
  176. if term and term.get("district") is not None:
  177. district = term["district"]
  178. term_chamber = (term or {}).get("chamber") or chamber
  179. return {
  180. "bioguide": bioguide,
  181. "lis": None,
  182. "full_name": name,
  183. "party": _party_letter(m),
  184. "state": _state_code(m),
  185. "district": district if (term_chamber or "").lower().startswith("house") else None,
  186. "chamber": term_chamber,
  187. "served_from": served_from,
  188. "served_to": served_to,
  189. "congress_term": term,
  190. "death_year": m.get("deathYear"),
  191. "current_member": m.get("currentMember"),
  192. "photo_url": photo,
  193. "source": "congress.gov/v3",
  194. }
  195. def _write_partial(out_dir, directory, lis_map, warnings, note):
  196. out_dir.mkdir(parents=True, exist_ok=True)
  197. (out_dir / "members_directory.json").write_text(
  198. json.dumps(directory, indent=2, sort_keys=True))
  199. (out_dir / "lis_to_bioguide.json").write_text(
  200. json.dumps(lis_map, indent=2, sort_keys=True))
  201. print(f"enrich_roster: WARNING {note}", file=sys.stderr)
  202. def main():
  203. ap = argparse.ArgumentParser(description="Enrich Congress roster via Congress.gov API.")
  204. ap.add_argument("--congress", type=int, default=119)
  205. args = ap.parse_args()
  206. api_key = _load_api_key()
  207. out_dir = DATA_ROOT / str(args.congress)
  208. cache_dir = out_dir / "api_cache"
  209. cache_dir.mkdir(parents=True, exist_ok=True)
  210. warnings = []
  211. directory = {}
  212. # Pass 1: paginate full member list
  213. url = (f"{API_BASE}/member/congress/{args.congress}"
  214. f"?currentMember=false&limit=250&format=json&api_key={api_key}")
  215. page = 0
  216. while url:
  217. page += 1
  218. print(f"enrich_roster: fetching page {page}", file=sys.stderr)
  219. data = _fetch(url, cache_dir, warnings, label=f"page {page}")
  220. if data is None:
  221. _write_partial(out_dir, directory, {}, warnings,
  222. f"pagination failed at page {page}; partial directory written")
  223. print(f"enrich_roster: {len(directory)} members directory written; "
  224. f"0 senators with LIS resolved; {len(warnings)} warnings")
  225. return 0
  226. for m in data.get("members") or []:
  227. norm = _normalize_member(m, args.congress)
  228. if norm:
  229. directory[norm["bioguide"]] = norm
  230. nxt = ((data.get("pagination") or {}).get("next")) or None
  231. if nxt and "api_key=" not in nxt:
  232. sep = "&" if "?" in nxt else "?"
  233. nxt = f"{nxt}{sep}api_key={api_key}"
  234. url = nxt
  235. # Pass 2: LIS lookup for senators
  236. lis_map = {}
  237. senators = [b for b, e in directory.items() if (e.get("chamber") or "").lower() == "senate"]
  238. print(f"enrich_roster: resolving LIS for {len(senators)} senators", file=sys.stderr)
  239. resolved = 0
  240. for bid in senators:
  241. url = f"{API_BASE}/member/{bid}?format=json&api_key={api_key}"
  242. data = _fetch(url, cache_dir, warnings, label=f"member/{bid}")
  243. if data is None:
  244. warnings.append(f"member/{bid}: fetch failed")
  245. continue
  246. member = (data.get("member") or {})
  247. lis = _scan_for_lis(member)
  248. if lis:
  249. directory[bid]["lis"] = lis
  250. lis_map[lis] = bid
  251. resolved += 1
  252. else:
  253. warnings.append(f"member/{bid}: LIS not derivable from API")
  254. # Fallback: name+state+party match against vote-derived senate roster.
  255. # Why: Congress.gov v3 does not expose LIS reliably; name match is unique within a Congress.
  256. sen_roster_path = out_dir / "senate" / "roster.json"
  257. if sen_roster_path.exists():
  258. def _norm(s):
  259. return ''.join(c for c in unicodedata.normalize('NFKD', s or '') if not unicodedata.combining(c))
  260. def _last(n):
  261. p = re.sub(r'[.,]', '', _norm(n)).split()
  262. return p[-1].lower() if p else ''
  263. vote_sen = json.loads(sen_roster_path.read_text())
  264. idx = {}
  265. for bg in senators:
  266. e = directory[bg]
  267. idx.setdefault((_last(e.get("full_name", "")), e.get("state", ""), e.get("party", "")), []).append(bg)
  268. for lis_key, v in vote_sen.items():
  269. if re.match(r'^[A-Z]\d{6}$', lis_key):
  270. continue
  271. if lis_key in lis_map:
  272. continue
  273. k = (_last(v.get("name", "")), v.get("state", ""), v.get("party", ""))
  274. candidates = idx.get(k, [])
  275. if len(candidates) != 1:
  276. candidates = [bg for (l, s, _), bs in idx.items() for bg in bs
  277. if l == k[0] and s == k[1]]
  278. if len(candidates) == 1:
  279. bg = candidates[0]
  280. directory[bg]["lis"] = lis_key
  281. lis_map[lis_key] = bg
  282. resolved += 1
  283. out_dir.mkdir(parents=True, exist_ok=True)
  284. # Fallback: individual lookups for House bioguide IDs that appear in vote
  285. # data but are missing from the per-Congress directory. Catches people who
  286. # were members-elect (appear in opening-day quorum XML) but never seated,
  287. # e.g. Matt Gaetz in the 119th.
  288. house_roster_path = out_dir / "house" / "roster.json"
  289. rescued = 0
  290. if house_roster_path.exists():
  291. house_roster = json.loads(house_roster_path.read_text())
  292. missing = [bg for bg in house_roster
  293. if re.match(r"^[A-Z]\d{6}$", bg) and bg not in directory]
  294. if missing:
  295. print(f"enrich_roster: rescuing {len(missing)} House bioguide(s) missing from bulk directory",
  296. file=sys.stderr)
  297. for bg in missing:
  298. url = f"{API_BASE}/member/{bg}?format=json&api_key={api_key}"
  299. data = _fetch(url, cache_dir, warnings, label=f"member/{bg}")
  300. if data is None:
  301. continue
  302. member = (data.get("member") or {})
  303. norm = _normalize_member(member, args.congress)
  304. if norm:
  305. directory[bg] = norm
  306. rescued += 1
  307. # Replacement-linking pass — pair predecessor↔successor by (state, district)
  308. # within the target Congress. Heuristic: any House seat with >1 member whose
  309. # 119th term touches the Congress window. Sort by startYear (and then by
  310. # served_to is-null) to determine order.
  311. seats = {}
  312. for bg, e in directory.items():
  313. if not (e.get("chamber") or "").lower().startswith("house"):
  314. continue
  315. term = e.get("congress_term") or {}
  316. if term.get("congress") and term["congress"] != args.congress:
  317. continue # shouldn't happen, but safe
  318. state = e.get("state")
  319. district = (term.get("district") if term else None) or e.get("district")
  320. if not state or district is None:
  321. continue
  322. seats.setdefault((state, str(district)), []).append(bg)
  323. pairs = 0
  324. for key, bgs in seats.items():
  325. if len(bgs) < 2:
  326. continue
  327. def sortkey(bg):
  328. e = directory[bg]
  329. term = e.get("congress_term") or {}
  330. start = term.get("startYear") or 9999
  331. # served_to None => still serving => sort last
  332. ended = e.get("served_to") is not None
  333. return (start, 0 if ended else 1)
  334. ordered = sorted(bgs, key=sortkey)
  335. for i in range(len(ordered) - 1):
  336. pred, succ = ordered[i], ordered[i + 1]
  337. directory[pred]["replaced_by"] = succ
  338. directory[succ]["replaces"] = pred
  339. pairs += 1
  340. if pairs:
  341. print(f"enrich_roster: linked {pairs} House predecessor↔successor pair(s)",
  342. file=sys.stderr)
  343. # Per-Congress term + death_year live on the individual /member/{bg} response
  344. # (the bulk listing only carries chamber + startYear). For accurate banner
  345. # copy on replacement chains, fetch the individual record for every member
  346. # who is on either side of a replacement pair. Cached, so re-runs are free.
  347. enrich_targets = set()
  348. for bg, e in directory.items():
  349. if e.get("replaces") or e.get("replaced_by"):
  350. enrich_targets.add(bg)
  351. if enrich_targets:
  352. print(f"enrich_roster: fetching detail for {len(enrich_targets)} replacement-chain members",
  353. file=sys.stderr)
  354. for bg in sorted(enrich_targets):
  355. url = f"{API_BASE}/member/{bg}?format=json&api_key={api_key}"
  356. data = _fetch(url, cache_dir, warnings, label=f"member-detail/{bg}")
  357. if data is None:
  358. continue
  359. member = (data.get("member") or {})
  360. term = _congress_term(member, args.congress)
  361. if term:
  362. directory[bg]["congress_term"] = term
  363. # If individual endpoint reports a per-Congress district, prefer it.
  364. if term.get("district") is not None:
  365. directory[bg]["district"] = term["district"]
  366. if member.get("deathYear") is not None:
  367. directory[bg]["death_year"] = member.get("deathYear")
  368. if member.get("currentMember") is not None:
  369. directory[bg]["current_member"] = member.get("currentMember")
  370. (out_dir / "members_directory.json").write_text(
  371. json.dumps(directory, indent=2, sort_keys=True))
  372. (out_dir / "lis_to_bioguide.json").write_text(
  373. json.dumps(lis_map, indent=2, sort_keys=True))
  374. print(f"enrich_roster: {len(directory)} members directory written; "
  375. f"{resolved} senators with LIS resolved; {rescued} House rescues; "
  376. f"{pairs} replacements linked; {len(warnings)} warnings")
  377. for w in warnings[:10]:
  378. print(f" warn: {w}", file=sys.stderr)
  379. return 0
  380. if __name__ == "__main__":
  381. sys.exit(main())