enrich_roster.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. #!/usr/bin/env python3
  2. """Enrich the 119th-Congress roster via Congress.gov API.
  3. Writes:
  4. data/<C>/members_directory.json — dict keyed by bioguide
  5. data/<C>/lis_to_bioguide.json — Senate LIS -> bioguide crosswalk
  6. data/<C>/api_cache/<sha>.json — raw cached API responses (idempotent)
  7. Reads CONGRESS_GOV_API_KEY from ./.env (never CLI/env-var).
  8. Standard library only. Throttled 350 ms between live requests.
  9. """
  10. import argparse, hashlib, json, os, re, sys, time, unicodedata, urllib.error, urllib.request
  11. from pathlib import Path
  12. ROOT = Path(__file__).resolve().parent
  13. DATA_ROOT = ROOT / "data"
  14. ENV_PATH = ROOT / ".env"
  15. API_BASE = "https://api.congress.gov/v3"
  16. THROTTLE_SEC = 0.35
  17. RETRY_BACKOFFS = (0.5, 1.0, 2.0)
  18. PARTY_MAP = {
  19. "R": "R", "Republican": "R",
  20. "D": "D", "Democratic": "D", "Democrat": "D",
  21. "I": "I", "Independent": "I", "ID": "I",
  22. }
  23. STATE_NAME_TO_CODE = {
  24. "Alabama":"AL","Alaska":"AK","Arizona":"AZ","Arkansas":"AR","California":"CA",
  25. "Colorado":"CO","Connecticut":"CT","Delaware":"DE","Florida":"FL","Georgia":"GA",
  26. "Hawaii":"HI","Idaho":"ID","Illinois":"IL","Indiana":"IN","Iowa":"IA","Kansas":"KS",
  27. "Kentucky":"KY","Louisiana":"LA","Maine":"ME","Maryland":"MD","Massachusetts":"MA",
  28. "Michigan":"MI","Minnesota":"MN","Mississippi":"MS","Missouri":"MO","Montana":"MT",
  29. "Nebraska":"NE","Nevada":"NV","New Hampshire":"NH","New Jersey":"NJ","New Mexico":"NM",
  30. "New York":"NY","North Carolina":"NC","North Dakota":"ND","Ohio":"OH","Oklahoma":"OK",
  31. "Oregon":"OR","Pennsylvania":"PA","Rhode Island":"RI","South Carolina":"SC",
  32. "South Dakota":"SD","Tennessee":"TN","Texas":"TX","Utah":"UT","Vermont":"VT",
  33. "Virginia":"VA","Washington":"WA","West Virginia":"WV","Wisconsin":"WI","Wyoming":"WY",
  34. "American Samoa":"AS","District of Columbia":"DC","Guam":"GU",
  35. "Northern Mariana Islands":"MP","Puerto Rico":"PR","Virgin Islands":"VI",
  36. }
  37. LIS_KEY_RE = re.compile(r"(?i)\b(lis|senateid|lis[_-]?id|lis[_-]?member[_-]?id)\b")
  38. def _load_api_key():
  39. if not ENV_PATH.exists():
  40. print(f"ERROR: .env not found at {ENV_PATH}", file=sys.stderr)
  41. sys.exit(2)
  42. for line in ENV_PATH.read_text().splitlines():
  43. line = line.strip()
  44. if not line or line.startswith("#") or "=" not in line:
  45. continue
  46. k, _, v = line.partition("=")
  47. if k.strip() == "CONGRESS_GOV_API_KEY":
  48. return v.strip().strip('"').strip("'")
  49. print("ERROR: CONGRESS_GOV_API_KEY not set in .env", file=sys.stderr)
  50. sys.exit(2)
  51. def _cache_path(cache_dir, url):
  52. # Cache key strips api_key so re-keying doesn't invalidate cache.
  53. clean = re.sub(r"([?&])api_key=[^&]*", r"\1", url).rstrip("?&")
  54. sha = hashlib.sha256(clean.encode("utf-8")).hexdigest()
  55. return cache_dir / f"{sha}.json"
  56. def _fetch(url, cache_dir, warnings, label="request"):
  57. cp = _cache_path(cache_dir, url)
  58. if cp.exists():
  59. try:
  60. return json.loads(cp.read_text())
  61. except Exception:
  62. pass # fall through and re-fetch
  63. last_err = None
  64. for i, backoff in enumerate((0,) + RETRY_BACKOFFS):
  65. if backoff:
  66. time.sleep(backoff)
  67. try:
  68. req = urllib.request.Request(url, headers={"Accept": "application/json", "User-Agent": "polisci-pipeline/1.0 (+enrich_roster.py)"})
  69. with urllib.request.urlopen(req, timeout=30) as resp:
  70. body = resp.read()
  71. data = json.loads(body)
  72. cp.parent.mkdir(parents=True, exist_ok=True)
  73. cp.write_text(json.dumps(data))
  74. time.sleep(THROTTLE_SEC)
  75. return data
  76. except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError, json.JSONDecodeError) as e:
  77. last_err = e
  78. warnings.append(f"{label}: {last_err}")
  79. return None
  80. def _party_letter(member):
  81. pa = member.get("partyHistory") or []
  82. if pa:
  83. last = pa[-1]
  84. for key in ("partyAbbreviation", "partyName"):
  85. v = last.get(key)
  86. if v and v in PARTY_MAP:
  87. return PARTY_MAP[v]
  88. pn = member.get("partyName") or ""
  89. return PARTY_MAP.get(pn, "I" if pn else "")
  90. def _state_code(member):
  91. s = member.get("state") or ""
  92. if len(s) == 2 and s.isupper():
  93. return s
  94. return STATE_NAME_TO_CODE.get(s, s[:2].upper() if s else "")
  95. def _latest_chamber(member):
  96. terms = (member.get("terms") or {}).get("item") or member.get("terms") or []
  97. if isinstance(terms, dict):
  98. terms = terms.get("item") or []
  99. if not terms:
  100. return ""
  101. # Sort by startYear if available
  102. def sk(t): return t.get("startYear") or 0
  103. last = sorted(terms, key=sk)[-1]
  104. return (last.get("chamber") or "").strip()
  105. def _served_dates(member):
  106. terms = (member.get("terms") or {}).get("item") or member.get("terms") or []
  107. if isinstance(terms, dict):
  108. terms = terms.get("item") or []
  109. if not terms:
  110. return None, None
  111. starts = []
  112. ends = []
  113. for t in terms:
  114. sy = t.get("startYear")
  115. ey = t.get("endYear")
  116. if sy:
  117. starts.append(f"{int(sy):04d}-01-03")
  118. if ey:
  119. ends.append(f"{int(ey):04d}-01-03")
  120. served_from = min(starts) if starts else None
  121. served_to = max(ends) if ends and len(ends) == len(starts) else None
  122. return served_from, served_to
  123. def _scan_for_lis(obj):
  124. """Recursively scan obj for any key matching LIS pattern; return string value or None."""
  125. if isinstance(obj, dict):
  126. for k, v in obj.items():
  127. if isinstance(k, str) and LIS_KEY_RE.search(k):
  128. if isinstance(v, str) and re.match(r"^S?\d{3,4}$", v.strip()):
  129. val = v.strip()
  130. if not val.startswith("S"):
  131. val = "S" + val.zfill(3)
  132. return val
  133. found = _scan_for_lis(v)
  134. if found:
  135. return found
  136. elif isinstance(obj, list):
  137. for it in obj:
  138. found = _scan_for_lis(it)
  139. if found:
  140. return found
  141. return None
  142. def _normalize_member(m):
  143. bioguide = (m.get("bioguideId") or "").strip()
  144. if not bioguide:
  145. return None
  146. chamber = _latest_chamber(m)
  147. served_from, served_to = _served_dates(m)
  148. district = m.get("district")
  149. if district is not None:
  150. district = str(district)
  151. name = m.get("directOrderName") or m.get("name") or ""
  152. if not name:
  153. first = m.get("firstName") or ""
  154. last = m.get("lastName") or ""
  155. name = (first + " " + last).strip()
  156. # If name is "Last, First" prefer invertedOrderName? Use as-is otherwise.
  157. if "," in name and not m.get("directOrderName"):
  158. parts = [p.strip() for p in name.split(",", 1)]
  159. if len(parts) == 2:
  160. name = parts[1] + " " + parts[0]
  161. photo = ((m.get("depiction") or {}).get("imageUrl")) or None
  162. return {
  163. "bioguide": bioguide,
  164. "lis": None,
  165. "full_name": name,
  166. "party": _party_letter(m),
  167. "state": _state_code(m),
  168. "district": district if chamber.lower() == "house" else None,
  169. "chamber": chamber,
  170. "served_from": served_from,
  171. "served_to": served_to,
  172. "photo_url": photo,
  173. "source": "congress.gov/v3",
  174. }
  175. def _write_partial(out_dir, directory, lis_map, warnings, note):
  176. out_dir.mkdir(parents=True, exist_ok=True)
  177. (out_dir / "members_directory.json").write_text(
  178. json.dumps(directory, indent=2, sort_keys=True))
  179. (out_dir / "lis_to_bioguide.json").write_text(
  180. json.dumps(lis_map, indent=2, sort_keys=True))
  181. print(f"enrich_roster: WARNING {note}", file=sys.stderr)
  182. def main():
  183. ap = argparse.ArgumentParser(description="Enrich Congress roster via Congress.gov API.")
  184. ap.add_argument("--congress", type=int, default=119)
  185. args = ap.parse_args()
  186. api_key = _load_api_key()
  187. out_dir = DATA_ROOT / str(args.congress)
  188. cache_dir = out_dir / "api_cache"
  189. cache_dir.mkdir(parents=True, exist_ok=True)
  190. warnings = []
  191. directory = {}
  192. # Pass 1: paginate full member list
  193. url = (f"{API_BASE}/member/congress/{args.congress}"
  194. f"?currentMember=false&limit=250&format=json&api_key={api_key}")
  195. page = 0
  196. while url:
  197. page += 1
  198. print(f"enrich_roster: fetching page {page}", file=sys.stderr)
  199. data = _fetch(url, cache_dir, warnings, label=f"page {page}")
  200. if data is None:
  201. _write_partial(out_dir, directory, {}, warnings,
  202. f"pagination failed at page {page}; partial directory written")
  203. print(f"enrich_roster: {len(directory)} members directory written; "
  204. f"0 senators with LIS resolved; {len(warnings)} warnings")
  205. return 0
  206. for m in data.get("members") or []:
  207. norm = _normalize_member(m)
  208. if norm:
  209. directory[norm["bioguide"]] = norm
  210. nxt = ((data.get("pagination") or {}).get("next")) or None
  211. if nxt and "api_key=" not in nxt:
  212. sep = "&" if "?" in nxt else "?"
  213. nxt = f"{nxt}{sep}api_key={api_key}"
  214. url = nxt
  215. # Pass 2: LIS lookup for senators
  216. lis_map = {}
  217. senators = [b for b, e in directory.items() if (e.get("chamber") or "").lower() == "senate"]
  218. print(f"enrich_roster: resolving LIS for {len(senators)} senators", file=sys.stderr)
  219. resolved = 0
  220. for bid in senators:
  221. url = f"{API_BASE}/member/{bid}?format=json&api_key={api_key}"
  222. data = _fetch(url, cache_dir, warnings, label=f"member/{bid}")
  223. if data is None:
  224. warnings.append(f"member/{bid}: fetch failed")
  225. continue
  226. member = (data.get("member") or {})
  227. lis = _scan_for_lis(member)
  228. if lis:
  229. directory[bid]["lis"] = lis
  230. lis_map[lis] = bid
  231. resolved += 1
  232. else:
  233. warnings.append(f"member/{bid}: LIS not derivable from API")
  234. # Fallback: name+state+party match against vote-derived senate roster.
  235. # Why: Congress.gov v3 does not expose LIS reliably; name match is unique within a Congress.
  236. sen_roster_path = out_dir / "senate" / "roster.json"
  237. if sen_roster_path.exists():
  238. def _norm(s):
  239. return ''.join(c for c in unicodedata.normalize('NFKD', s or '') if not unicodedata.combining(c))
  240. def _last(n):
  241. p = re.sub(r'[.,]', '', _norm(n)).split()
  242. return p[-1].lower() if p else ''
  243. vote_sen = json.loads(sen_roster_path.read_text())
  244. idx = {}
  245. for bg in senators:
  246. e = directory[bg]
  247. idx.setdefault((_last(e.get("full_name", "")), e.get("state", ""), e.get("party", "")), []).append(bg)
  248. for lis_key, v in vote_sen.items():
  249. if re.match(r'^[A-Z]\d{6}$', lis_key):
  250. continue
  251. if lis_key in lis_map:
  252. continue
  253. k = (_last(v.get("name", "")), v.get("state", ""), v.get("party", ""))
  254. candidates = idx.get(k, [])
  255. if len(candidates) != 1:
  256. candidates = [bg for (l, s, _), bs in idx.items() for bg in bs
  257. if l == k[0] and s == k[1]]
  258. if len(candidates) == 1:
  259. bg = candidates[0]
  260. directory[bg]["lis"] = lis_key
  261. lis_map[lis_key] = bg
  262. resolved += 1
  263. out_dir.mkdir(parents=True, exist_ok=True)
  264. (out_dir / "members_directory.json").write_text(
  265. json.dumps(directory, indent=2, sort_keys=True))
  266. (out_dir / "lis_to_bioguide.json").write_text(
  267. json.dumps(lis_map, indent=2, sort_keys=True))
  268. print(f"enrich_roster: {len(directory)} members directory written; "
  269. f"{resolved} senators with LIS resolved; {len(warnings)} warnings")
  270. for w in warnings[:10]:
  271. print(f" warn: {w}", file=sys.stderr)
  272. return 0
  273. if __name__ == "__main__":
  274. sys.exit(main())