#!/usr/bin/env python3
"""Fetch raw roll-call XML for a given Congress + chamber.

Usage:
    python3 fetch.py --congress 119 --chamber house
    python3 fetch.py --congress 119 --chamber senate
    python3 fetch.py --congress 119 --chamber both    # default

Output: data/<congress>/<chamber>/cache/*.xml (idempotent — only fetches
missing).  Throttled at 350 ms between network requests.
"""
import argparse
import http.client
import os
import re
import sys
import time
import urllib.request

UA = "Mozilla/5.0 (research; polisci-analysis)"
THROTTLE_S = 0.35
DATA_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")

# A cached file must be larger than this many bytes to count as already
# downloaded; anything smaller is fetched again on the next run.
MIN_CACHED_BYTES = 200

# Highest House index block page probed (ROLL_1200.asp, ROLL_1100.asp, ...).
# NOTE(review): 1200 is chosen so that years with more than 799 roll calls
# are not silently truncated (2007 reportedly reached ~1,186) — confirm.
HOUSE_TOP_BLOCK = 1200

# Errors treated as "this request failed, carry on with the next one".
# URLError, HTTPError and socket timeouts are all OSError subclasses.
_NET_ERRORS = (OSError, http.client.HTTPException)

_HOUSE_ROLL_RE = re.compile(r"rollnumber=(\d+)")
# NOTE(review): assumes the Senate vote menu wraps each vote number in a
# <vote_number> element — verify against the live feed.
_SENATE_VOTE_RE = re.compile(r"<vote_number>\s*(\d+)\s*</vote_number>")


def _http(url):
    """Return the raw response body of a GET request to *url*.

    Raises whatever ``urllib.request.urlopen`` raises (30 s timeout).
    """
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    with urllib.request.urlopen(req, timeout=30) as r:
        return r.read()


def _save(path, data):
    """Write *data* (bytes) to *path*, creating parent directories.

    The bytes go to a sibling ``.part`` file first and are then moved into
    place, so an interrupted run never leaves a truncated file at *path*.
    """
    parent = os.path.dirname(path)
    if parent:  # os.makedirs("") raises for a bare filename
        os.makedirs(parent, exist_ok=True)
    tmp_path = path + ".part"
    with open(tmp_path, "wb") as f:
        f.write(data)
    os.replace(tmp_path, path)


def _is_cached(path):
    """Return True if *path* exists and is larger than MIN_CACHED_BYTES."""
    return os.path.exists(path) and os.path.getsize(path) > MIN_CACHED_BYTES


def _fetch_missing(url, path, label):
    """Download *url* to *path* unless it is already cached.

    Returns True only when a new file was written.  Failures are reported
    on stderr (tagged with *label*) and do not abort the run.  The throttle
    delay is applied after every network attempt, successful or not.
    """
    if _is_cached(path):
        return False
    try:
        _save(path, _http(url))
    except _NET_ERRORS as e:
        print(f" FAIL {label}: {e}", file=sys.stderr)
        return False
    finally:
        time.sleep(THROTTLE_S)
    return True


# ---------- House ----------

def house_year_for_congress(congress):
    """119 -> [2025, 2026]; 120 -> [2027, 2028]; etc."""
    first = 1789 + (congress - 1) * 2
    return [first, first + 1]


def parse_house_max_roll(html):
    """Return the largest ``rollnumber=N`` value found in *html*, or 0."""
    return max((int(n) for n in _HOUSE_ROLL_RE.findall(html)), default=0)


def house_max_roll(year):
    """Scrape the year index pages to discover the highest roll number.

    Block pages are tried in descending order; the first one that contains
    roll links determines the result.  Returns 0 when none does.
    """
    for block in range(HOUSE_TOP_BLOCK, -1, -100):
        url = f"https://clerk.house.gov/evs/{year}/ROLL_{block:03d}.asp"
        try:
            html = _http(url).decode("utf-8", "ignore")
        except _NET_ERRORS:
            continue  # this block page is unavailable; try the next lower one
        finally:
            time.sleep(THROTTLE_S)
        highest = parse_house_max_roll(html)
        if highest:
            return highest
    return 0


def fetch_house(congress):
    """Fetch every missing House roll-call XML for both years of *congress*.

    Files are stored as data/<congress>/house/cache/<year>_<roll>.xml.
    Returns the number of files newly downloaded.
    """
    out_dir = os.path.join(DATA_ROOT, str(congress), "house", "cache")
    os.makedirs(out_dir, exist_ok=True)
    total_fetched = 0
    for year in house_year_for_congress(congress):
        max_roll = house_max_roll(year)
        if not max_roll:
            print(f" [house {year}] no votes discovered", file=sys.stderr)
            continue
        print(f" [house {year}] up to roll {max_roll}", file=sys.stderr)
        for roll in range(1, max_roll + 1):
            path = os.path.join(out_dir, f"{year}_{roll:03d}.xml")
            url = f"https://clerk.house.gov/evs/{year}/roll{roll:03d}.xml"
            if _fetch_missing(url, path, f"{year}/{roll}"):
                total_fetched += 1
        print(f" [house {year}] done", file=sys.stderr)
    return total_fetched


# ---------- Senate ----------

def parse_senate_max_vote(xml_text):
    """Return the largest ``<vote_number>`` value in *xml_text*, or 0.

    Only digits inside <vote_number> elements are considered, so other
    numbers in the document (years, dates, tallies) cannot inflate the
    result.
    """
    return max((int(n) for n in _SENATE_VOTE_RE.findall(xml_text)), default=0)


def senate_max_vote(congress, session):
    """Read vote_menu to discover the highest vote_number.

    Returns 0 (after reporting on stderr) if the menu cannot be fetched,
    or if it lists no votes.
    """
    url = (f"https://www.senate.gov/legislative/LIS/roll_call_lists/"
           f"vote_menu_{congress}_{session}.xml")
    try:
        data = _http(url).decode("utf-8", "ignore")
    except _NET_ERRORS as e:
        print(f" [senate menu {session}] FAIL: {e}", file=sys.stderr)
        return 0
    finally:
        time.sleep(THROTTLE_S)
    return parse_senate_max_vote(data)


def fetch_senate(congress):
    """Fetch every missing Senate roll-call XML for sessions 1 and 2.

    Files are stored as data/<congress>/senate/cache/<session>_<vote>.xml.
    Returns the number of files newly downloaded.
    """
    out_dir = os.path.join(DATA_ROOT, str(congress), "senate", "cache")
    os.makedirs(out_dir, exist_ok=True)
    total_fetched = 0
    for session in (1, 2):
        max_v = senate_max_vote(congress, session)
        if not max_v:
            print(f" [senate s{session}] no votes discovered", file=sys.stderr)
            continue
        print(f" [senate s{session}] up to vote {max_v}", file=sys.stderr)
        for v in range(1, max_v + 1):
            path = os.path.join(out_dir, f"{session}_{v:05d}.xml")
            url = (f"https://www.senate.gov/legislative/LIS/roll_call_votes/"
                   f"vote{congress}{session}/vote_{congress}_{session}_{v:05d}.xml")
            if _fetch_missing(url, path, f"s{session}/{v}"):
                total_fetched += 1
        print(f" [senate s{session}] done", file=sys.stderr)
    return total_fetched


# ---------- CLI ----------

def main():
    """Parse the command line and fetch the requested chamber(s)."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--congress", type=int, required=True)
    ap.add_argument("--chamber", choices=["house", "senate", "both"], default="both")
    args = ap.parse_args()
    if args.congress < 1:
        # Congress numbering starts at 1; smaller values map to nonsense years.
        ap.error("--congress must be a positive integer")
    if args.chamber in ("house", "both"):
        n = fetch_house(args.congress)
        print(f"House: {n} new files", file=sys.stderr)
    if args.chamber in ("senate", "both"):
        n = fetch_senate(args.congress)
        print(f"Senate: {n} new files", file=sys.stderr)


if __name__ == "__main__":
    main()