#!/usr/bin/env python3
"""Parse cached XML rollcalls into a unified, chamber-agnostic schema.

Outputs per (congress, chamber):

  data/<congress>/<chamber>/votes.jsonl — one JSON object per vote, schema:
    {
      "chamber": "house"|"senate",
      "year": 2025, "session": 1, "num": 47,
      "date": "2025-01-09",
      "date_raw": "9-Jan-2025",
      "bill": "H R 1234" | "S 5" | "",
      "question": "On Passage",
      "result": "Passed",
      "desc": "Short title",
      "totals": {"R":{"yea":0,"nay":0,"present":0,"nv":0}, "D":{...}, "I":{...}},
      "votes": {"M001184": "Yea", "K000389": "Nay", ...}  # id -> raw vote text
    }
    Notes: "session" is null for House votes; "date" is null when "date_raw"
    (the upstream date text, kept verbatim) cannot be parsed.

  data/<congress>/<chamber>/roster.json — {id: {name, party, state, chamber}}

Run:
  python3 parse.py --congress 119 --chamber {house|senate|both}
"""
import argparse
import glob
import json
import os
import re
import sys
import xml.etree.ElementTree as ET
  22. DATA_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
  23. MONTHS_3 = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]
  24. MONTHS_FULL = ["January","February","March","April","May","June",
  25. "July","August","September","October","November","December"]
  26. # Independents who caucus with Democrats — counted as D for party totals.
  27. DEM_CAUCUSING_INDEPENDENTS = {
  28. # Bernie Sanders (S313 in Senate XML), Angus King (S354), etc.
  29. "S313", "S354",
  30. }
  31. # Reject characters that should never appear in government XML free-text fields.
  32. # Presence indicates injection, encoding corruption, or upstream tampering.
  33. _BAD_TEXT_RE = re.compile(r"[<>\x00-\x08\x0b\x0c\x0e-\x1f]")
  34. def _safe_text(s, field, source):
  35. """Return s unchanged if clean; raise ValueError with context if it contains
  36. angle brackets or ASCII control chars (excluding \\t, \\n, \\r)."""
  37. if s and _BAD_TEXT_RE.search(s):
  38. raise ValueError(
  39. f"Unsafe characters in upstream field '{field}' from {source}: {s!r}")
  40. return s
# ---------- House parsing ----------
  42. def _t(el, tag):
  43. sub = el.find(tag)
  44. return (sub.text or "").strip() if sub is not None and sub.text else ""
  45. def _ts(el, tag, source):
  46. """Like _t but validates the extracted text via _safe_text."""
  47. return _safe_text(_t(el, tag), tag, source)
  48. def parse_house_vote(path, year, roll):
  49. try:
  50. with open(path, "rb") as f:
  51. root = ET.fromstring(f.read())
  52. except Exception:
  53. return None, {}
  54. meta = root.find("vote-metadata")
  55. if meta is None: return None, {}
  56. # Date: "3-Jan-2025"
  57. raw = _t(meta, "action-date")
  58. iso = None
  59. try:
  60. d, mo, y = raw.split("-")
  61. iso = f"{y}-{MONTHS_3.index(mo)+1:02d}-{int(d):02d}"
  62. except Exception:
  63. pass
  64. totals = {"R":{"yea":0,"nay":0,"present":0,"nv":0},
  65. "D":{"yea":0,"nay":0,"present":0,"nv":0},
  66. "I":{"yea":0,"nay":0,"present":0,"nv":0}}
  67. for pt in meta.findall("vote-totals/totals-by-party"):
  68. party = (pt.findtext("party","") or "").strip()
  69. key = ("R" if party=="Republican" else "D" if party=="Democratic" else "I")
  70. totals[key] = {
  71. "yea": int(pt.findtext("yea-total","0") or 0),
  72. "nay": int(pt.findtext("nay-total","0") or 0),
  73. "present": int(pt.findtext("present-total","0") or 0),
  74. "nv": int(pt.findtext("not-voting-total","0") or 0),
  75. }
  76. votes = {}
  77. roster_seen = {}
  78. source = f"house:{year}/{roll}"
  79. for rv in root.iter("recorded-vote"):
  80. leg = rv.find("legislator")
  81. if leg is None: continue
  82. bid = leg.get("name-id")
  83. if not bid: continue
  84. vote_el = rv.find("vote")
  85. vote = (vote_el.text or "").strip() if vote_el is not None else ""
  86. votes[bid] = _safe_text(vote, "vote", source)
  87. # Roster: store name + party + state (last write wins; fine for stable identities)
  88. roster_seen[bid] = {
  89. "name": _safe_text((leg.get("unaccented-name") or leg.text or "").strip(),
  90. "legislator-name", source),
  91. "party": leg.get("party") or "",
  92. "state": leg.get("state") or "",
  93. "chamber": "house",
  94. }
  95. record = {
  96. "chamber": "house",
  97. "year": year, "session": None, "num": roll,
  98. "date": iso, "date_raw": raw,
  99. "bill": _ts(meta, "legis-num", source),
  100. "question": _ts(meta, "vote-question", source),
  101. "result": _ts(meta, "vote-result", source),
  102. "desc": _ts(meta, "vote-desc", source),
  103. "totals": totals,
  104. "votes": votes,
  105. }
  106. return record, roster_seen
# ---------- Senate parsing ----------
  108. def parse_senate_vote(path, session, vnum):
  109. try:
  110. with open(path, "rb") as f:
  111. root = ET.fromstring(f.read())
  112. except Exception:
  113. return None, {}
  114. raw = _t(root, "vote_date")
  115. iso = None
  116. m = re.match(r"\s*(\w+)\s+(\d+),\s+(\d+)", raw)
  117. if m:
  118. mon, day, yr = m.groups()
  119. try:
  120. mi = MONTHS_FULL.index(mon) + 1
  121. iso = f"{yr}-{mi:02d}-{int(day):02d}"
  122. except Exception:
  123. pass
  124. year_int = int(iso[:4]) if iso else None
  125. totals = {"R":{"yea":0,"nay":0,"present":0,"nv":0},
  126. "D":{"yea":0,"nay":0,"present":0,"nv":0},
  127. "I":{"yea":0,"nay":0,"present":0,"nv":0}}
  128. votes = {}
  129. roster_seen = {}
  130. source = f"senate:{session}/{vnum}"
  131. for mem in root.iter("member"):
  132. lid = (mem.findtext("lis_member_id","") or "").strip()
  133. if not lid: continue
  134. party = (mem.findtext("party","") or "").strip()
  135. vc = (mem.findtext("vote_cast","") or "").strip()
  136. votes[lid] = _safe_text(vc, "vote_cast", source)
  137. name = ((mem.findtext("first_name","") or "").strip() + " " +
  138. (mem.findtext("last_name","") or "").strip()).strip()
  139. roster_seen[lid] = {
  140. "name": _safe_text(name, "member-name", source),
  141. "party": party,
  142. "state": (mem.findtext("state","") or "").strip(),
  143. "chamber": "senate",
  144. }
  145. bucket = ("yea" if vc=="Yea" else "nay" if vc=="Nay"
  146. else "present" if vc=="Present" else "nv")
  147. # Count for party totals — Dem-caucusing Independents go into D.
  148. if party == "R":
  149. totals["R"][bucket] += 1
  150. elif party == "D" or lid in DEM_CAUCUSING_INDEPENDENTS:
  151. totals["D"][bucket] += 1
  152. else:
  153. totals["I"][bucket] += 1
  154. # Compose bill identifier
  155. dt = root.findtext("document/document_type","") or ""
  156. dn = root.findtext("document/document_number","") or ""
  157. bill = _safe_text((dt + " " + dn).strip(), "bill", source)
  158. record = {
  159. "chamber": "senate",
  160. "year": year_int, "session": session, "num": vnum,
  161. "date": iso, "date_raw": raw,
  162. "bill": bill,
  163. "question": _ts(root, "question", source) or _ts(root, "vote_question_text", source),
  164. "result": _ts(root, "vote_result", source),
  165. "desc": _ts(root, "vote_title", source) or _ts(root, "vote_document_text", source),
  166. "totals": totals,
  167. "votes": votes,
  168. }
  169. return record, roster_seen
# ---------- Orchestration ----------
  171. def parse_chamber(congress, chamber):
  172. cache = os.path.join(DATA_ROOT, str(congress), chamber, "cache")
  173. out_votes = os.path.join(DATA_ROOT, str(congress), chamber, "votes.jsonl")
  174. out_roster = os.path.join(DATA_ROOT, str(congress), chamber, "roster.json")
  175. if not os.path.isdir(cache):
  176. print(f" no cache at {cache}", file=sys.stderr); return 0
  177. roster = {}
  178. n = 0
  179. with open(out_votes, "w") as out:
  180. for path in sorted(glob.glob(os.path.join(cache, "*.xml"))):
  181. base = os.path.splitext(os.path.basename(path))[0]
  182. try:
  183. a, b = base.split("_")
  184. a, b = int(a), int(b)
  185. except Exception:
  186. continue
  187. if chamber == "house":
  188. rec, seen = parse_house_vote(path, a, b)
  189. else:
  190. rec, seen = parse_senate_vote(path, a, b)
  191. if not rec: continue
  192. out.write(json.dumps(rec, separators=(",", ":")) + "\n")
  193. n += 1
  194. for mid, info in seen.items():
  195. # Don't overwrite a roster entry with a less-complete one
  196. if mid not in roster or not roster[mid].get("name"):
  197. roster[mid] = info
  198. else:
  199. # Update party/state in case of a switch — keep latest
  200. if info.get("party"): roster[mid]["party"] = info["party"]
  201. if info.get("state"): roster[mid]["state"] = info["state"]
  202. # Merge with members_directory.json if available (Phase 0.5 enrichment).
  203. vote_count = len(roster)
  204. directory_only = 0
  205. directory_path = os.path.join(DATA_ROOT, str(congress), "members_directory.json")
  206. lis_xwalk_path = os.path.join(DATA_ROOT, str(congress), "lis_to_bioguide.json")
  207. directory_present = os.path.isfile(directory_path)
  208. if directory_present:
  209. try:
  210. with open(directory_path) as f:
  211. directory = json.load(f)
  212. except Exception as e:
  213. print(f" [{chamber}] WARNING: could not load {directory_path}: {e}",
  214. file=sys.stderr)
  215. directory = {}
  216. # For senate, map bioguide -> lis via reverse of lis_to_bioguide.json.
  217. bioguide_to_lis = {}
  218. if chamber == "senate" and os.path.isfile(lis_xwalk_path):
  219. try:
  220. with open(lis_xwalk_path) as f:
  221. lis_map = json.load(f)
  222. bioguide_to_lis = {b: l for l, b in lis_map.items()}
  223. except Exception as e:
  224. print(f" [{chamber}] WARNING: could not load {lis_xwalk_path}: {e}",
  225. file=sys.stderr)
  226. for bioguide, entry in directory.items():
  227. ec = (entry.get("chamber") or "").lower()
  228. entry_chamber = "senate" if "senate" in ec else ("house" if "house" in ec else ec)
  229. if entry_chamber != chamber:
  230. continue
  231. # Roster key: bioguide for house, LIS for senate.
  232. if chamber == "house":
  233. key = bioguide
  234. else:
  235. key = bioguide_to_lis.get(bioguide)
  236. if not key:
  237. # Without LIS we cannot map to the per-chamber roster key;
  238. # still add under bioguide so the directory entry isn't lost.
  239. key = bioguide
  240. enrichment = {
  241. "full_name": entry.get("full_name"),
  242. "district": entry.get("district"),
  243. "served_from": entry.get("served_from"),
  244. "served_to": entry.get("served_to"),
  245. "photo_url": entry.get("photo_url"),
  246. "bioguide": bioguide,
  247. "lis": entry.get("lis"),
  248. }
  249. # Overwrite vote-derived state with directory state — vote XMLs
  250. # report "XX" for territorial delegates (AS/DC/GU/MP/PR/VI).
  251. dir_state = entry.get("state")
  252. if key in roster:
  253. if dir_state and roster[key].get("state") in (None, "", "XX"):
  254. roster[key]["state"] = dir_state
  255. roster[key].update({k: v for k, v in enrichment.items() if v is not None})
  256. roster[key]["served_partial"] = False
  257. else:
  258. roster[key] = {
  259. "name": entry.get("full_name") or "",
  260. "party": entry.get("party") or "",
  261. "state": entry.get("state") or "",
  262. "chamber": chamber,
  263. "served_partial": True,
  264. **{k: v for k, v in enrichment.items() if v is not None},
  265. }
  266. directory_only += 1
  267. total = len(roster)
  268. print(f" [{chamber}] merged: {vote_count} vote-derived + "
  269. f"{directory_only} directory-only = {total} total roster entries",
  270. file=sys.stderr)
  271. else:
  272. print(f" [{chamber}] merged: {vote_count} vote-derived + 0 directory-only "
  273. f"= {vote_count} total roster entries (no members_directory.json)",
  274. file=sys.stderr)
  275. with open(out_roster, "w") as f:
  276. json.dump(roster, f, indent=2, sort_keys=True)
  277. print(f" [{chamber}] parsed {n} votes, {len(roster)} roster entries → {out_votes}", file=sys.stderr)
  278. return n
  279. def main():
  280. ap = argparse.ArgumentParser()
  281. ap.add_argument("--congress", type=int, required=True)
  282. ap.add_argument("--chamber", choices=["house","senate","both"], default="both")
  283. args = ap.parse_args()
  284. if args.chamber in ("house","both"): parse_chamber(args.congress, "house")
  285. if args.chamber in ("senate","both"): parse_chamber(args.congress, "senate")
  286. # Validation gate: total combined unique-member count across chambers.
  287. directory_path = os.path.join(DATA_ROOT, str(args.congress), "members_directory.json")
  288. if not os.path.isfile(directory_path):
  289. print(f"WARNING: {directory_path} missing — roster may be incomplete "
  290. f"(Phase 0.5 enrichment was skipped). Run enrich_roster.py.",
  291. file=sys.stderr)
  292. if args.chamber == "both":
  293. seen = set()
  294. for ch in ("house", "senate"):
  295. rp = os.path.join(DATA_ROOT, str(args.congress), ch, "roster.json")
  296. if not os.path.isfile(rp): continue
  297. try:
  298. with open(rp) as f:
  299. r = json.load(f)
  300. for k, v in r.items():
  301. bid = v.get("bioguide") or k
  302. seen.add(bid)
  303. except Exception:
  304. pass
  305. if len(seen) < 535:
  306. print(f"WARNING: combined roster has only {len(seen)} unique members "
  307. f"(<535 expected). Congress.gov API may be down or "
  308. f"enrich_roster.py was not run.", file=sys.stderr)
  309. if __name__ == "__main__":
  310. main()