#!/usr/bin/env python3
"""Fetch all Senate roll-call votes from the 119th Congress (sessions 1 + 2).

Each vote's XML document is downloaded from senate.gov into ``CACHE``. Files
that are already present and larger than ``MIN_VALID_BYTES`` are skipped, so
the script can simply be re-run to resume after an interruption or to retry
votes that failed.
"""
import http.client
import os
import re
import sys
import time
import urllib.request
from typing import Optional

SESSIONS = {1: 659, 2: 130}  # session: max vote_number (snapshot 2026-05-24)
CACHE = "/home/user/polisci/senate_vote_cache"
UA = "Mozilla/5.0 (research; polisci-analysis)"
MIN_VALID_BYTES = 200  # files/responses of this size or smaller are not accepted
REQUEST_DELAY = 0.35  # seconds to pause after every network request


def fetch(session: int, n: int, cache_dir: Optional[str] = None) -> bool:
    """Ensure the XML for one roll-call vote is present in the cache.

    Args:
        session: Session number of the 119th Congress (a key of ``SESSIONS``).
        n: Vote number within that session; zero-padded to five digits in
            both the URL and the cache file name.
        cache_dir: Directory holding the cached files. Defaults to ``CACHE``.

    Returns:
        True if a cached file larger than ``MIN_VALID_BYTES`` already exists
        or was downloaded successfully; False if the download or the write
        failed, or the response was too small to be accepted. Failures are
        reported on stderr and never raised.
    """
    if cache_dir is None:
        cache_dir = CACHE
    path = f"{cache_dir}/{session}_{n:05d}.xml"
    if os.path.exists(path) and os.path.getsize(path) > MIN_VALID_BYTES:
        return True

    url = (f"https://www.senate.gov/legislative/LIS/roll_call_votes/"
           f"vote119{session}/vote_119_{session}_{n:05d}.xml")
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    try:
        try:
            with urllib.request.urlopen(req, timeout=30) as r:
                data = r.read()
        finally:
            # Pause after every request, including failed ones, so a
            # struggling server is not hit back-to-back.
            time.sleep(REQUEST_DELAY)

        # Apply the same size rule the cache check uses; otherwise a tiny
        # body would be stored, reported as success, and refetched next run.
        if len(data) <= MIN_VALID_BYTES:
            print(f"FAIL {session}/{n}: response too small "
                  f"({len(data)} bytes)", file=sys.stderr)
            return False

        os.makedirs(cache_dir, exist_ok=True)
        # Write to a side file and rename it into place so an interrupted
        # run cannot leave a truncated .xml that later passes the size check.
        # A stale .part file is overwritten by the next attempt and is not
        # counted by main() because it does not end in ".xml".
        tmp_path = f"{path}.part"
        with open(tmp_path, "wb") as f:
            f.write(data)
        os.replace(tmp_path, path)
        return True
    except (OSError, http.client.HTTPException) as e:
        # OSError covers URLError/HTTPError, timeouts and filesystem errors;
        # HTTPException covers protocol problems such as IncompleteRead.
        print(f"FAIL {session}/{n}: {e}", file=sys.stderr)
        return False


def main() -> None:
    """Fetch every vote listed in ``SESSIONS`` and report progress.

    Progress is written to stderr every 50 votes, followed by a summary of
    failed votes (if any). The final count of ``.xml`` files in ``CACHE`` is
    printed to stdout.
    """
    os.makedirs(CACHE, exist_ok=True)
    total = sum(SESSIONS.values())
    done = 0
    failed = 0
    for session, max_vote in SESSIONS.items():
        for n in range(1, max_vote + 1):
            if not fetch(session, n):
                failed += 1
            done += 1
            if done % 50 == 0:
                print(f" {done}/{total}", file=sys.stderr)
    if failed:
        print(f"{failed} of {total} votes could not be fetched; "
              f"re-run to retry", file=sys.stderr)
    cached = sum(1 for f in os.listdir(CACHE) if f.endswith('.xml'))
    print(f"Done: {cached} files cached")


if __name__ == "__main__":
    main()