fetch.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
#!/usr/bin/env python3
"""Fetch raw roll-call XML for a given Congress + chamber.

Usage:
    python3 fetch.py --congress 119 --chamber house
    python3 fetch.py --congress 119 --chamber senate
    python3 fetch.py --congress 119 --chamber both   # default

Output: data/<congress>/<chamber>/cache/*.xml (idempotent — only fetches missing).
Throttled at 350 ms between network requests.
"""
import argparse
import http.client
import os
import re
import sys
import time
import urllib.request
  11. UA = "Mozilla/5.0 (research; polisci-analysis)"
  12. THROTTLE_S = 0.35
  13. DATA_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
  14. def _http(url):
  15. req = urllib.request.Request(url, headers={"User-Agent": UA})
  16. with urllib.request.urlopen(req, timeout=30) as r:
  17. return r.read()
  18. def _save(path, data):
  19. os.makedirs(os.path.dirname(path), exist_ok=True)
  20. with open(path, "wb") as f: f.write(data)
# ---------- House ----------
  22. def house_year_for_congress(congress):
  23. """119 -> [2025, 2026]; 120 -> [2027, 2028]; etc."""
  24. first = 1789 + (congress - 1) * 2
  25. return [first, first + 1]
  26. def house_max_roll(year):
  27. """Scrape the year index page to discover the highest roll number."""
  28. # Try descending block pages until one returns links.
  29. for block in (700, 600, 500, 400, 300, 200, 100, 0):
  30. try:
  31. html = _http(f"https://clerk.house.gov/evs/{year}/ROLL_{block:03d}.asp").decode("utf-8", "ignore")
  32. nums = [int(n) for n in re.findall(r"rollnumber=(\d+)", html)]
  33. if nums:
  34. time.sleep(THROTTLE_S)
  35. return max(nums)
  36. except Exception:
  37. time.sleep(THROTTLE_S)
  38. continue
  39. return 0
  40. def fetch_house(congress):
  41. out_dir = os.path.join(DATA_ROOT, str(congress), "house", "cache")
  42. os.makedirs(out_dir, exist_ok=True)
  43. years = house_year_for_congress(congress)
  44. total_fetched = 0
  45. for year in years:
  46. max_roll = house_max_roll(year)
  47. if not max_roll:
  48. print(f" [house {year}] no votes discovered", file=sys.stderr)
  49. continue
  50. print(f" [house {year}] up to roll {max_roll}", file=sys.stderr)
  51. for roll in range(1, max_roll + 1):
  52. path = os.path.join(out_dir, f"{year}_{roll:03d}.xml")
  53. if os.path.exists(path) and os.path.getsize(path) > 200:
  54. continue
  55. url = f"https://clerk.house.gov/evs/{year}/roll{roll:03d}.xml"
  56. try:
  57. data = _http(url)
  58. _save(path, data)
  59. total_fetched += 1
  60. time.sleep(THROTTLE_S)
  61. except Exception as e:
  62. print(f" FAIL {year}/{roll}: {e}", file=sys.stderr)
  63. print(f" [house {year}] done", file=sys.stderr)
  64. return total_fetched
# ---------- Senate ----------
  66. def senate_max_vote(congress, session):
  67. """Read vote_menu to discover the highest vote_number."""
  68. url = (f"https://www.senate.gov/legislative/LIS/roll_call_lists/"
  69. f"vote_menu_{congress}_{session}.xml")
  70. try:
  71. data = _http(url).decode("utf-8", "ignore")
  72. time.sleep(THROTTLE_S)
  73. nums = [int(n) for n in re.findall(r"<vote_number>(\d+)</vote_number>", data)]
  74. return max(nums) if nums else 0
  75. except Exception as e:
  76. print(f" [senate menu {session}] FAIL: {e}", file=sys.stderr)
  77. return 0
  78. def fetch_senate(congress):
  79. out_dir = os.path.join(DATA_ROOT, str(congress), "senate", "cache")
  80. os.makedirs(out_dir, exist_ok=True)
  81. total_fetched = 0
  82. for session in (1, 2):
  83. max_v = senate_max_vote(congress, session)
  84. if not max_v:
  85. print(f" [senate s{session}] no votes discovered", file=sys.stderr)
  86. continue
  87. print(f" [senate s{session}] up to vote {max_v}", file=sys.stderr)
  88. for v in range(1, max_v + 1):
  89. path = os.path.join(out_dir, f"{session}_{v:05d}.xml")
  90. if os.path.exists(path) and os.path.getsize(path) > 200:
  91. continue
  92. url = (f"https://www.senate.gov/legislative/LIS/roll_call_votes/"
  93. f"vote{congress}{session}/vote_{congress}_{session}_{v:05d}.xml")
  94. try:
  95. data = _http(url)
  96. _save(path, data)
  97. total_fetched += 1
  98. time.sleep(THROTTLE_S)
  99. except Exception as e:
  100. print(f" FAIL s{session}/{v}: {e}", file=sys.stderr)
  101. print(f" [senate s{session}] done", file=sys.stderr)
  102. return total_fetched
# ---------- CLI ----------
  104. def main():
  105. ap = argparse.ArgumentParser()
  106. ap.add_argument("--congress", type=int, required=True)
  107. ap.add_argument("--chamber", choices=["house", "senate", "both"], default="both")
  108. args = ap.parse_args()
  109. if args.chamber in ("house", "both"):
  110. n = fetch_house(args.congress)
  111. print(f"House: {n} new files", file=sys.stderr)
  112. if args.chamber in ("senate", "both"):
  113. n = fetch_senate(args.congress)
  114. print(f"Senate: {n} new files", file=sys.stderr)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()