fetch_senate.py 1.4 KB

1234567891011121314151617181920212223242526272829303132333435363738
  1. #!/usr/bin/env python3
  2. """Fetch all Senate roll-call votes from the 119th Congress (sessions 1 + 2)."""
  3. import urllib.request, os, time, sys, re
  4. SESSIONS = {1: 659, 2: 130} # session: max vote_number (snapshot 2026-05-24)
  5. CACHE = "/home/user/polisci/senate_vote_cache"
  6. os.makedirs(CACHE, exist_ok=True)
  7. UA = "Mozilla/5.0 (research; polisci-analysis)"
  8. def fetch(session, n):
  9. path = f"{CACHE}/{session}_{n:05d}.xml"
  10. if os.path.exists(path) and os.path.getsize(path) > 200:
  11. return True
  12. url = (f"https://www.senate.gov/legislative/LIS/roll_call_votes/"
  13. f"vote119{session}/vote_119_{session}_{n:05d}.xml")
  14. req = urllib.request.Request(url, headers={"User-Agent": UA})
  15. try:
  16. with urllib.request.urlopen(req, timeout=30) as r:
  17. data = r.read()
  18. with open(path, "wb") as f: f.write(data)
  19. time.sleep(0.35)
  20. return True
  21. except Exception as e:
  22. print(f"FAIL {session}/{n}: {e}", file=sys.stderr)
  23. return False
  24. def main():
  25. total = sum(SESSIONS.values()); done = 0
  26. for s, m in SESSIONS.items():
  27. for n in range(1, m+1):
  28. fetch(s, n)
  29. done += 1
  30. if done % 50 == 0:
  31. print(f" {done}/{total}", file=sys.stderr)
  32. print(f"Done: {sum(1 for f in os.listdir(CACHE) if f.endswith('.xml'))} files cached")
  33. if __name__ == "__main__":
  34. main()