#!/usr/bin/env python3
"""Fetch all Senate roll-call votes from the 119th Congress (sessions 1 + 2)."""
import urllib.request, os, time, sys, re

# Highest roll-call vote number to request for each session of the 119th
# Congress, keyed by session number. Vote numbers 1..max are fetched, so votes
# added after the snapshot date are not downloaded until these values are raised.
SESSIONS = {1: 659, 2: 130}  # session: max vote_number (snapshot 2026-05-24)
# Directory holding one cached XML file per vote; it is created at import time.
CACHE = "/home/user/polisci/senate_vote_cache"
os.makedirs(CACHE, exist_ok=True)
# User-Agent header value sent with every download request.
UA = "Mozilla/5.0 (research; polisci-analysis)"

def fetch(session, n, *, cache_dir=None):
    """Download one roll-call vote XML file into the cache.

    A file that is already in the cache and larger than 200 bytes is treated
    as valid and is not requested again.

    Args:
        session: Session number of the 119th Congress; used in both the URL
            and the cache file name.
        n: Vote number within the session, zero-padded to five digits.
        cache_dir: Directory to read and write cache files in. Defaults to
            the module-level ``CACHE``.

    Returns:
        True if a usable file is in the cache when the call returns; False if
        the download or the write failed, or the response was too small to
        keep. Failures are reported on stderr and never raised.
    """
    # Local import: http.client.HTTPException is only needed by this function.
    import http.client

    # Files of this size or smaller are not trusted as real vote documents.
    min_bytes = 200

    if cache_dir is None:
        cache_dir = CACHE
    path = os.path.join(cache_dir, f"{session}_{n:05d}.xml")
    if os.path.exists(path) and os.path.getsize(path) > min_bytes:
        return True

    url = (f"https://www.senate.gov/legislative/LIS/roll_call_votes/"
           f"vote119{session}/vote_119_{session}_{n:05d}.xml")
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    try:
        with urllib.request.urlopen(req, timeout=30) as r:
            data = r.read()
    except (OSError, http.client.HTTPException) as e:
        # OSError covers URLError/HTTPError/timeouts; HTTPException covers
        # protocol-level failures such as a truncated response body.
        print(f"FAIL {session}/{n}: {e}", file=sys.stderr)
        return False
    finally:
        # Pause after every request, successful or not, so a run of failures
        # does not hit the server back-to-back.
        time.sleep(0.35)

    # Keep the download consistent with the cache check above: a payload the
    # cache would reject on the next run must not be reported as a success.
    if len(data) <= min_bytes:
        print(f"FAIL {session}/{n}: response too small ({len(data)} bytes)",
              file=sys.stderr)
        return False

    # Write to a temporary name and rename it into place, so an interrupted
    # run can never leave a truncated file that later looks like a cache hit.
    tmp_path = path + ".part"
    try:
        with open(tmp_path, "wb") as f:
            f.write(data)
        os.replace(tmp_path, path)
    except OSError as e:
        print(f"FAIL {session}/{n}: {e}", file=sys.stderr)
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # nothing to clean up (the temp file was never created)
        return False
    return True

def main():
    """Fetch every vote listed in SESSIONS and report progress.

    Calls fetch() once for each (session, vote number) pair, printing a
    running counter to stderr after every 50th attempt, then prints to stdout
    how many ``.xml`` files are present in the cache directory.
    """
    expected = sum(SESSIONS.values())
    # Flatten the per-session ranges into a single stream of jobs.
    jobs = (
        (session, number)
        for session, last in SESSIONS.items()
        for number in range(1, last + 1)
    )
    for attempted, (session, number) in enumerate(jobs, start=1):
        fetch(session, number)
        if not attempted % 50:
            print(f"  {attempted}/{expected}", file=sys.stderr)
    # Counts every .xml file in the directory, not only those from this run.
    cached = len([name for name in os.listdir(CACHE) if name.endswith('.xml')])
    print(f"Done: {cached} files cached")

# Start the full download only when run as a script, not when imported.
if __name__ == "__main__":
    main()