| 1234567891011121314151617181920212223242526272829303132333435363738 |
- #!/usr/bin/env python3
- """Fetch all Senate roll-call votes from the 119th Congress (sessions 1 + 2)."""
- import urllib.request, os, time, sys, re
- SESSIONS = {1: 659, 2: 130} # session: max vote_number (snapshot 2026-05-24)
- CACHE = "/home/user/polisci/senate_vote_cache"
- os.makedirs(CACHE, exist_ok=True)
- UA = "Mozilla/5.0 (research; polisci-analysis)"
def fetch(session, n):
    """Ensure the XML for one roll-call vote is present in the cache.

    A file already in the cache that is larger than 200 bytes is treated as
    valid and no request is made. Otherwise the vote is downloaded from
    senate.gov and stored under ``CACHE``.

    Args:
        session: Session number; used in both the URL and the cache filename.
        n: Vote number; zero-padded to five digits in the URL and filename.

    Returns:
        True if a usable file is in the cache after the call (already cached
        or freshly downloaded); False if the download or the write failed, or
        the response was too small to pass the cache-validity check.
    """
    # Anything at or below this size is not accepted as a cached vote, so a
    # download that small must not be reported as a success either.
    min_size = 200

    path = f"{CACHE}/{session}_{n:05d}.xml"
    if os.path.exists(path) and os.path.getsize(path) > min_size:
        return True

    url = (f"https://www.senate.gov/legislative/LIS/roll_call_votes/"
           f"vote119{session}/vote_119_{session}_{n:05d}.xml")
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    try:
        with urllib.request.urlopen(req, timeout=30) as r:
            data = r.read()
        if len(data) <= min_size:
            raise ValueError(f"response too small ({len(data)} bytes)")
        # Write to a sibling temp file and rename into place, so an
        # interrupted run never leaves a truncated file that a later run
        # would mistake for a valid cache entry.
        tmp_path = f"{path}.part"
        with open(tmp_path, "wb") as f:
            f.write(data)
        os.replace(tmp_path, path)
    except Exception as e:
        # Best-effort: report the failure and let the caller carry on with
        # the remaining votes.
        print(f"FAIL {session}/{n}: {e}", file=sys.stderr)
        return False
    finally:
        # Throttle after every network attempt, successful or not, so a run
        # of failures does not turn into back-to-back requests.
        time.sleep(0.35)
    return True
def main():
    """Fetch every vote listed in ``SESSIONS`` and report progress.

    Calls ``fetch`` for each vote number from 1 up to and including the
    session's maximum. Progress is written to stderr every 50 votes, followed
    by a failure summary when any fetch returned False. The final count of
    ``.xml`` files present in ``CACHE`` is printed to stdout.
    """
    total = sum(SESSIONS.values())
    done = 0
    failed = []
    for s, m in SESSIONS.items():
        for n in range(1, m + 1):
            # fetch() signals failure through its return value; keep a tally
            # instead of discarding it so the run's outcome is visible.
            if not fetch(s, n):
                failed.append((s, n))
            done += 1
            if done % 50 == 0:
                print(f" {done}/{total}", file=sys.stderr)
    if failed:
        print(f"{len(failed)} of {total} votes failed to download",
              file=sys.stderr)
    cached = sum(1 for f in os.listdir(CACHE) if f.endswith('.xml'))
    print(f"Done: {cached} files cached")
# Start the download only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
|