claude
/
congress-analyzer


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
							#!/usr/bin/env python3
"""Fetch raw roll-call XML for a given Congress + chamber.

Usage:
  python3 fetch.py --congress 119 --chamber house
  python3 fetch.py --congress 119 --chamber senate
  python3 fetch.py --congress 119 --chamber both     # default

Output: data/<congress>/<chamber>/cache/*.xml (idempotent — only fetches missing).
Throttled at 350 ms between network requests.
"""
import argparse, os, sys, time, urllib.request, re

# User-Agent header value sent with every request made through _http().
UA = "Mozilla/5.0 (research; polisci-analysis)"
# Pause, in seconds, handed to time.sleep() around network requests (350 ms).
THROTTLE_S = 0.35
# Absolute path of the "data" directory that sits next to this script; the
# per-congress / per-chamber cache directories are created beneath it.
DATA_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")

def _http(url):
    """Issue a GET for *url* and return the response body as bytes.

    The request carries the module-level ``UA`` string as its User-Agent
    header and uses a 30-second timeout. Any error raised by urllib is left
    to propagate to the caller.
    """
    request = urllib.request.Request(url)
    request.add_header("User-Agent", UA)
    response = urllib.request.urlopen(request, timeout=30)
    # The response object is a context manager; leaving the block closes it.
    with response:
        body = response.read()
    return body

def _save(path, data):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f: f.write(data)

# ---------- House ----------

def house_year_for_congress(congress):
    """Return the two calendar years of *congress* as ``[first, second]``.

    Congress 1 maps to 1789 and each later Congress starts two years after
    the previous one, e.g. 119 -> [2025, 2026] and 120 -> [2027, 2028].
    """
    years_since_first = (congress - 1) * 2
    return [1789 + years_since_first, 1789 + years_since_first + 1]

def house_max_roll(year, top_block=1200):
    """Scrape the year's index pages to discover the highest roll number.

    Index pages are probed in descending blocks of 100
    (``ROLL_<top_block>.asp`` down to ``ROLL_000.asp``). The first page that
    contains at least one ``rollnumber=<n>`` link decides the result.

    Args:
        year: Calendar year whose index pages are probed.
        top_block: Highest block to try first; rounded down to a multiple of
            100. NOTE(review): the ceiling used to be a hard-coded 700, which
            cannot see roll numbers above 799; some sessions reportedly
            exceed that (e.g. 2007) -- confirm against the Clerk's index.

    Returns:
        The largest roll number linked from the first page that has any
        links, or 0 when no probed page yields one.
    """
    first_block = max(int(top_block), 0) // 100 * 100
    for block in range(first_block, -1, -100):
        url = f"https://clerk.house.gov/evs/{year}/ROLL_{block:03d}.asp"
        try:
            html = _http(url).decode("utf-8", "ignore")
        except Exception:
            # Best effort: a missing or failing block page simply means
            # "try the next lower block".
            html = ""
        finally:
            # Throttle after every request, including pages that load but
            # contain no links, so consecutive probes are always spaced out.
            time.sleep(THROTTLE_S)
        nums = [int(n) for n in re.findall(r"rollnumber=(\d+)", html)]
        if nums:
            return max(nums)
    return 0

def fetch_house(congress):
    """Download the missing House roll-call XML files for *congress*.

    For each of the two calendar years of the Congress, the highest roll
    number is discovered via ``house_max_roll`` and every roll from 1 up to
    it is fetched into ``data/<congress>/house/cache/<year>_<roll>.xml``.
    Files already present and larger than 200 bytes are skipped; smaller
    ones are treated as incomplete and fetched again.

    Progress and per-file failures are reported on stderr; a failed download
    does not abort the run.

    Args:
        congress: Congress number (e.g. 119).

    Returns:
        The number of files downloaded and saved during this call.
    """
    out_dir = os.path.join(DATA_ROOT, str(congress), "house", "cache")
    os.makedirs(out_dir, exist_ok=True)
    total_fetched = 0
    for year in house_year_for_congress(congress):
        max_roll = house_max_roll(year)
        if not max_roll:
            print(f"  [house {year}] no votes discovered", file=sys.stderr)
            continue
        print(f"  [house {year}] up to roll {max_roll}", file=sys.stderr)
        for roll in range(1, max_roll + 1):
            path = os.path.join(out_dir, f"{year}_{roll:03d}.xml")
            if os.path.exists(path) and os.path.getsize(path) > 200:
                continue
            url = f"https://clerk.house.gov/evs/{year}/roll{roll:03d}.xml"
            try:
                data = _http(url)
                _save(path, data)
            except Exception as e:
                print(f"    FAIL {year}/{roll}: {e}", file=sys.stderr)
            else:
                total_fetched += 1
            finally:
                # Throttle after failures too; otherwise a run of errors
                # turns into back-to-back requests with no pause.
                time.sleep(THROTTLE_S)
        print(f"  [house {year}] done", file=sys.stderr)
    return total_fetched

# ---------- Senate ----------

def senate_max_vote(congress, session):
    """Read the session's vote_menu XML to discover the highest vote_number.

    Args:
        congress: Congress number (e.g. 119).
        session: Session number within the Congress (callers use 1 and 2).

    Returns:
        The largest ``<vote_number>`` value found in the menu, or 0 when the
        menu cannot be fetched (the failure is reported on stderr) or it
        lists no votes.
    """
    url = (f"https://www.senate.gov/legislative/LIS/roll_call_lists/"
           f"vote_menu_{congress}_{session}.xml")
    try:
        data = _http(url).decode("utf-8", "ignore")
    except Exception as e:
        print(f"  [senate menu {session}] FAIL: {e}", file=sys.stderr)
        return 0
    finally:
        # Throttle whether or not the request succeeded, so the caller's
        # next request is never issued back-to-back with this one.
        time.sleep(THROTTLE_S)
    nums = [int(n) for n in re.findall(r"<vote_number>(\d+)</vote_number>", data)]
    return max(nums) if nums else 0

def fetch_senate(congress):
    """Download the missing Senate roll-call XML files for *congress*.

    For sessions 1 and 2, the highest vote number is discovered via
    ``senate_max_vote`` and every vote from 1 up to it is fetched into
    ``data/<congress>/senate/cache/<session>_<vote>.xml``. Files already
    present and larger than 200 bytes are skipped; smaller ones are treated
    as incomplete and fetched again.

    Progress and per-file failures are reported on stderr; a failed download
    does not abort the run.

    Args:
        congress: Congress number (e.g. 119).

    Returns:
        The number of files downloaded and saved during this call.
    """
    out_dir = os.path.join(DATA_ROOT, str(congress), "senate", "cache")
    os.makedirs(out_dir, exist_ok=True)
    total_fetched = 0
    for session in (1, 2):
        max_v = senate_max_vote(congress, session)
        if not max_v:
            print(f"  [senate s{session}] no votes discovered", file=sys.stderr)
            continue
        print(f"  [senate s{session}] up to vote {max_v}", file=sys.stderr)
        for v in range(1, max_v + 1):
            path = os.path.join(out_dir, f"{session}_{v:05d}.xml")
            if os.path.exists(path) and os.path.getsize(path) > 200:
                continue
            url = (f"https://www.senate.gov/legislative/LIS/roll_call_votes/"
                   f"vote{congress}{session}/vote_{congress}_{session}_{v:05d}.xml")
            try:
                data = _http(url)
                _save(path, data)
            except Exception as e:
                print(f"    FAIL s{session}/{v}: {e}", file=sys.stderr)
            else:
                total_fetched += 1
            finally:
                # Throttle after failures too; otherwise a run of errors
                # turns into back-to-back requests with no pause.
                time.sleep(THROTTLE_S)
        print(f"  [senate s{session}] done", file=sys.stderr)
    return total_fetched

# ---------- CLI ----------

def main():
    """Parse the command line and fetch the requested chamber(s).

    ``--congress`` (int) is required; ``--chamber`` is one of ``house``,
    ``senate`` or ``both`` (the default). The count of newly fetched files
    for each processed chamber is reported on stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--congress", type=int, required=True)
    parser.add_argument("--chamber", choices=["house", "senate", "both"], default="both")
    opts = parser.parse_args()

    # argparse restricts --chamber to the three choices above, so "not the
    # other chamber" is the same as "this chamber or both".
    want_house = opts.chamber != "senate"
    want_senate = opts.chamber != "house"

    if want_house:
        count = fetch_house(opts.congress)
        print(f"House: {count} new files", file=sys.stderr)
    if want_senate:
        count = fetch_senate(opts.congress)
        print(f"Senate: {count} new files", file=sys.stderr)

# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()