| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131 |
- #!/usr/bin/env python3
- """Fetch raw roll-call XML for a given Congress + chamber.
- Usage:
- python3 fetch.py --congress 119 --chamber house
- python3 fetch.py --congress 119 --chamber senate
- python3 fetch.py --congress 119 --chamber both # default
- Output: data/<congress>/<chamber>/cache/*.xml (idempotent — only fetches missing).
- Throttled at 350 ms between network requests.
- """
- import argparse, os, sys, time, urllib.request, re
# User-Agent header value sent with every HTTP request made by this script.
UA = "Mozilla/5.0 (research; polisci-analysis)"

# Pause between network requests, in seconds (0.35 s == 350 ms).
THROTTLE_S = 0.35

# Root of the on-disk cache: a "data" directory next to this script.
DATA_ROOT = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "data",
)
def _http(url):
    """Fetch *url* and return the raw response body as bytes.

    The request carries the module's ``UA`` string as its User-Agent header
    and uses a 30-second timeout. Errors raised by ``urlopen`` propagate to
    the caller.
    """
    request = urllib.request.Request(url)
    request.add_header("User-Agent", UA)
    with urllib.request.urlopen(request, timeout=30) as response:
        body = response.read()
    return body
- def _save(path, data):
- os.makedirs(os.path.dirname(path), exist_ok=True)
- with open(path, "wb") as f: f.write(data)
- # ---------- House ----------
def house_year_for_congress(congress):
    """Return the two calendar years covered by *congress* as a list.

    Congress 1 maps to [1789, 1790] and every later Congress starts two
    years after the previous one, e.g. 119 -> [2025, 2026].
    """
    start_year = 1789 + 2 * (congress - 1)
    return list(range(start_year, start_year + 2))
def house_max_roll(year):
    """Return the highest House roll-call number listed for *year*, or 0.

    Index pages are probed from the highest block downward
    (``ROLL_700.asp`` ... ``ROLL_000.asp``); the first page that contains
    any ``rollnumber=<n>`` link decides the result.

    Args:
        year: Calendar year used in the clerk.house.gov URL.

    Returns:
        The largest roll number found on the first non-empty index page, or
        0 when no page could be fetched or none contained roll links.
    """
    # NOTE(review): probing starts at block 700, so a year whose index has
    # pages above ROLL_700 would presumably be undercounted -- confirm
    # whether such years are in scope.
    for block in range(700, -1, -100):
        url = f"https://clerk.house.gov/evs/{year}/ROLL_{block:03d}.asp"
        try:
            html = _http(url).decode("utf-8", "ignore")
        except Exception:
            # Best-effort probe: an unavailable block page just means
            # "try the next lower block".
            continue
        finally:
            # Throttle after every request -- success, failure, or a page
            # without links -- so consecutive probes are never back-to-back.
            time.sleep(THROTTLE_S)
        nums = [int(n) for n in re.findall(r"rollnumber=(\d+)", html)]
        if nums:
            return max(nums)
    return 0
def fetch_house(congress):
    """Download missing House roll-call XML files for *congress*.

    Files are stored as ``<DATA_ROOT>/<congress>/house/cache/<year>_<roll>.xml``.
    A roll is skipped when its file already exists and is larger than 200
    bytes, so re-running only fetches what is missing. Failures are reported
    on stderr and do not stop the run.

    Args:
        congress: Congress number (e.g. 119).

    Returns:
        The number of files downloaded and saved during this call.
    """
    out_dir = os.path.join(DATA_ROOT, str(congress), "house", "cache")
    os.makedirs(out_dir, exist_ok=True)
    total_fetched = 0
    for year in house_year_for_congress(congress):
        max_roll = house_max_roll(year)
        if not max_roll:
            print(f" [house {year}] no votes discovered", file=sys.stderr)
            continue
        print(f" [house {year}] up to roll {max_roll}", file=sys.stderr)
        for roll in range(1, max_roll + 1):
            path = os.path.join(out_dir, f"{year}_{roll:03d}.xml")
            # Files of 200 bytes or fewer are treated as missing and refetched.
            if os.path.exists(path) and os.path.getsize(path) > 200:
                continue
            url = f"https://clerk.house.gov/evs/{year}/roll{roll:03d}.xml"
            try:
                _save(path, _http(url))
            except Exception as e:
                print(f" FAIL {year}/{roll}: {e}", file=sys.stderr)
            else:
                total_fetched += 1
            finally:
                # Throttle after failures too; otherwise a run of errors
                # would hit the server with back-to-back requests.
                time.sleep(THROTTLE_S)
        print(f" [house {year}] done", file=sys.stderr)
    return total_fetched
- # ---------- Senate ----------
def senate_max_vote(congress, session):
    """Return the highest vote number in the Senate vote menu, or 0.

    Fetches ``vote_menu_<congress>_<session>.xml`` from senate.gov and takes
    the maximum of all ``<vote_number>`` values found in it.

    Args:
        congress: Congress number (e.g. 119).
        session: Session number within the Congress.

    Returns:
        The largest vote number listed, or 0 when the menu could not be
        fetched (the error is reported on stderr) or lists no votes.
    """
    url = (f"https://www.senate.gov/legislative/LIS/roll_call_lists/"
           f"vote_menu_{congress}_{session}.xml")
    try:
        data = _http(url).decode("utf-8", "ignore")
    except Exception as e:
        print(f" [senate menu {session}] FAIL: {e}", file=sys.stderr)
        return 0
    finally:
        # Throttle whether or not the request succeeded, so the caller's
        # next request is never issued back-to-back with this one.
        time.sleep(THROTTLE_S)
    nums = [int(n) for n in re.findall(r"<vote_number>(\d+)</vote_number>", data)]
    return max(nums, default=0)
def fetch_senate(congress):
    """Download missing Senate roll-call XML files for *congress*.

    Sessions 1 and 2 are processed in turn. Files are stored as
    ``<DATA_ROOT>/<congress>/senate/cache/<session>_<vote>.xml``. A vote is
    skipped when its file already exists and is larger than 200 bytes, so
    re-running only fetches what is missing. Failures are reported on stderr
    and do not stop the run.

    Args:
        congress: Congress number (e.g. 119).

    Returns:
        The number of files downloaded and saved during this call.
    """
    out_dir = os.path.join(DATA_ROOT, str(congress), "senate", "cache")
    os.makedirs(out_dir, exist_ok=True)
    total_fetched = 0
    for session in (1, 2):
        max_v = senate_max_vote(congress, session)
        if not max_v:
            print(f" [senate s{session}] no votes discovered", file=sys.stderr)
            continue
        print(f" [senate s{session}] up to vote {max_v}", file=sys.stderr)
        for v in range(1, max_v + 1):
            path = os.path.join(out_dir, f"{session}_{v:05d}.xml")
            # Files of 200 bytes or fewer are treated as missing and refetched.
            if os.path.exists(path) and os.path.getsize(path) > 200:
                continue
            url = (f"https://www.senate.gov/legislative/LIS/roll_call_votes/"
                   f"vote{congress}{session}/vote_{congress}_{session}_{v:05d}.xml")
            try:
                _save(path, _http(url))
            except Exception as e:
                print(f" FAIL s{session}/{v}: {e}", file=sys.stderr)
            else:
                total_fetched += 1
            finally:
                # Throttle after failures too; otherwise a run of errors
                # would hit the server with back-to-back requests.
                time.sleep(THROTTLE_S)
        print(f" [senate s{session}] done", file=sys.stderr)
    return total_fetched
- # ---------- CLI ----------
def main():
    """Parse the command line and fetch the requested chamber(s).

    ``--congress`` (int) is required; ``--chamber`` is one of ``house``,
    ``senate`` or ``both`` (the default). The House is processed before the
    Senate, and a per-chamber count of new files is printed to stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--congress", type=int, required=True)
    parser.add_argument("--chamber", choices=["house", "senate", "both"], default="both")
    opts = parser.parse_args()

    # (CLI value, label used in the summary line, fetch function)
    jobs = (
        ("house", "House", fetch_house),
        ("senate", "Senate", fetch_senate),
    )
    for chamber, label, fetch in jobs:
        if opts.chamber not in (chamber, "both"):
            continue
        count = fetch(opts.congress)
        print(f"{label}: {count} new files", file=sys.stderr)
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|